-
Notifications
You must be signed in to change notification settings - Fork 1
Lexer skeleton #5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
82dda1b
5e8b808
f71dcd9
3d5cc2a
febcfcf
22d78d2
f11703e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
// This file is part of rust-web/twig | ||
// | ||
// For the copyright and license information, please view the LICENSE | ||
// file that was distributed with this source code. | ||
|
||
mod syntax; | ||
|
||
pub use self::syntax::{ SyntaxError, SyntaxErrorCode, UnexpectedToken }; |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
// This file is part of rust-web/twig | ||
// | ||
// For the copyright and license information, please view the LICENSE | ||
// file that was distributed with this source code. | ||
|
||
use api::Position; | ||
use api::tokens::{ Token }; | ||
|
||
/// A syntax error together with the location it was found at. | ||
/// | ||
/// Values are built by calling `starts_at` on a `SyntaxErrorCode` and, | ||
/// optionally, `ends_at` on the result. | ||
#[derive(Debug)] | ||
pub struct SyntaxError { | ||
/// What kind of error this is. | ||
pub code: SyntaxErrorCode, | ||
/// Position where the error starts. | ||
pub starts_at: Position, | ||
/// Position where the error ends; `None` when no end was specified. | ||
pub ends_at: Option<Position>, | ||
} | ||
|
||
impl SyntaxError { | ||
/// Record the position where the error ends and hand the error back, | ||
/// so the call can be chained onto `starts_at`. | ||
pub fn ends_at(self, pos: Position) -> SyntaxError { | ||
SyntaxError { ends_at: Some(pos), ..self } | ||
} | ||
} | ||
|
||
/// The kind of a syntax error, without any location attached. | ||
#[derive(Debug)] | ||
pub enum SyntaxErrorCode { | ||
/// The token `expected` was required, but `received` was found instead. | ||
ExpectedTokenButReceived { expected: Token, received: UnexpectedToken }, | ||
} | ||
|
||
impl SyntaxErrorCode { | ||
/// Turn this code into a `SyntaxError` that begins at `pos`. | ||
/// | ||
/// The end position of the resulting error is left unset (`None`). | ||
pub fn starts_at(self, pos: Position) -> SyntaxError { | ||
SyntaxError { ends_at: None, starts_at: pos, code: self } | ||
} | ||
} | ||
|
||
/// What was received in place of an expected token. | ||
#[derive(Clone, Debug)] | ||
pub enum UnexpectedToken { | ||
/// A different token was received. | ||
Token(Token), | ||
/// The token stream ended before any token was received. | ||
EndOfStream, | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
// This file is part of rust-web/twig | ||
// | ||
// For the copyright and license information, please view the LICENSE | ||
// file that was distributed with this source code. | ||
|
||
use std::io; | ||
use api::error::SyntaxError; | ||
use std::result; | ||
|
||
/// Error type carried by `LexingResult`. | ||
#[derive(Debug)] | ||
pub enum LexingError { | ||
/// Wraps a `SyntaxError`. | ||
Syntax(SyntaxError), | ||
/// Wraps a standard I/O error. | ||
Io(io::Error) | ||
} | ||
|
||
impl From<io::Error> for LexingError { | ||
/// Wrap an I/O error in the `Io` variant. | ||
fn from(other: io::Error) -> Self { | ||
LexingError::Io(other) | ||
} | ||
} | ||
|
||
impl From<SyntaxError> for LexingError { | ||
/// Wrap a syntax error in the `Syntax` variant. | ||
fn from(other: SyntaxError) -> Self { | ||
LexingError::Syntax(other) | ||
} | ||
} | ||
|
||
/// Shorthand for a `Result` whose error type is `LexingError`. | ||
pub type LexingResult<T> = result::Result<T, LexingError>; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. with current error management this should become |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
// This file is part of rust-web/twig | ||
// | ||
// For the copyright and license information, please view the LICENSE | ||
// file that was distributed with this source code. | ||
|
||
use api::Position; | ||
use api::tokens::TokenRef; | ||
|
||
use api::lexer::{ Lexer, LexingResult }; | ||
use api::error::{ SyntaxErrorCode, UnexpectedToken }; | ||
|
||
/// `TokenRef` wrapper for `Lexer` that additionally has a position. | ||
#[derive(Debug)] | ||
pub struct ItemRef<'t> { | ||
/// The token, borrowing for the lifetime `'t`. | ||
pub token: TokenRef<'t>, | ||
/// Position reported for this token. | ||
pub position: Position, | ||
} | ||
|
||
/// Lexer token iterator. | ||
/// | ||
/// Yields `LexingResult<ItemRef>` values through its `Iterator` impl. | ||
/// | ||
/// `'i` is the iteration lifetime, or "one use of lexer". | ||
/// `'t` is the template lifetime. It will live longer than this iteration. | ||
pub struct Tokens<'i, 't> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
/// Position of the next token to get. | ||
next_pos: Position, // temporary field until I get cursor in. | ||
/// The lexer this iterator was created from. | ||
env: &'i Lexer, | ||
/// The template source passed to `new`. | ||
code: &'t str, | ||
} | ||
|
||
impl<'i, 't> Iterator for Tokens<'i, 't> { | ||
type Item = LexingResult<ItemRef<'t>>; | ||
|
||
fn next(&mut self) -> Option<LexingResult<ItemRef<'t>>> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. TL;DR at the end ^^ Too bad Rust does not have guaranteed tail-call elimination. I investigated what we could do instead. First, I analyzed the nesting of sub-calls in my code and in twigphp to look for any potentially infinite recursion.
These recursions are separately closed in a sense they can be rewritten individually like 'lex_data: loop {
// do something
if(/**/) { continue 'lex_data /* mimic tail-call */ }
// ..
if(/**/) { break 'lex_data /* or return */ }
// ..
} However there might be another recursion related to interpolation in double-quoted strings. This boils down to "can an expression/variable contain a string that contains a variable that contains another string that .." Here are some details. I denote subcalls by "->", where tail-/non-tail-positions are marked with "*" and "_". No sub-calls: "()".
The good news is: sub-calls in non-tail-positions "_" don't create recursions (they only occur in Data, so this is easy to analyse).
We can't break a loop of the caller from inside a function - even a generic function is not flexible enough - but macros can. For readability I put the macros to the end - but for compilation they must be defined first. 'lex_block: loop {
// ..
lex_expression!("block successfully parsed", break 'lex_block, continue 'lex_block); // not in tail position - but no guarantee needed, because not recursive
continue 'lex_data // equivalent to a tail-call guarantee
// ..
}
// shared code via macro - a bit nasty but it works(tm)
macro_rules! lex_expression {
( $message:expr, $break_parent:stmt, $continue_parent:stmt ) => {
'lex_expression: loop {
// do something
println!($message);
$break_parent;
unreachable!(); // just to check the break_parent stmt really jumps somewhere else
// ...
// we could embed the lex_string logic completely in this macro, or delegate to a sub-macro
lex_string!("some argument", break 'lex_expression, continue 'lex_expression)
//
}
};
}
// shared code via macro - a bit nasty but it works(tm)
macro_rules! lex_string {
( $message:expr, $break_parent:stmt, $continue_parent:stmt ) => {
'lex_string: loop {
// do something
println!($message);
$continue_parent;
unreachable!(); // just to check the break_statement really jumps somewhere else
// ...
}
};
} I know this macro thing probably feels a bit weird. But if we want to factor away most function calls in the lexer, it seems possible. The only thing that needs a closer look is the possible expression-interpolation recursion (this is where the stack could still blow up if we can't reduce it to a flat loop). But at this point even I begin to question whether it is really worth it... TL;DR: So finally I arrive at the point of admitting "let's just proceed with some enum-based match-branching in a loop - which would include the iterator pattern you are suggesting". ;-) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, but we can definitely do it in a way that does not block future improvements. |
||
// Hello, my name is Lexer. Twig Lexer. | ||
// I am not very complicated. | ||
match self.next_pos { | ||
Position { line: 1, .. } => { | ||
self.next_pos.line = 2; | ||
Some(Ok(ItemRef { token: TokenRef::BlockStart, position: self.next_pos })) | ||
}, | ||
Position { line: 2, .. } => { | ||
self.next_pos.line = 3; | ||
Some(Ok(ItemRef { token: TokenRef::Name("§"), position: self.next_pos })) | ||
}, | ||
_ => None | ||
} | ||
} | ||
} | ||
|
||
impl<'i, 't> Tokens<'i, 't> { | ||
pub fn new<'ii, 'tt>(lexer: &'ii Lexer, code: &'tt str) -> Tokens<'ii, 'tt> { | ||
Tokens { | ||
next_pos: Position { line: 1, column: 1 }, | ||
env: lexer, | ||
code: code, | ||
} | ||
} | ||
|
||
/// Pull the next token and require it to equal `expected`. | ||
/// | ||
/// Returns the token on a match. On a mismatch, returns an | ||
/// `ExpectedTokenButReceived` error starting at the received item's | ||
/// position; when the stream has ended, the error starts at `next_pos`. | ||
/// An error produced by the iterator itself is passed through unchanged. | ||
pub fn expect(&mut self, expected: TokenRef<'t>) -> LexingResult<TokenRef<'t>> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. in my code I tried to be a bit more abstract ~ like |
||
let item = match self.next() { | ||
Some(Ok(item)) => item, | ||
Some(Err(error)) => return Err(error), | ||
None => return Err( | ||
SyntaxErrorCode::ExpectedTokenButReceived { | ||
expected: expected.into(), | ||
received: UnexpectedToken::EndOfStream | ||
}.starts_at(self.next_pos).into() | ||
), | ||
}; | ||
if item.token == expected { | ||
return Ok(item.token); | ||
} | ||
Err( | ||
SyntaxErrorCode::ExpectedTokenButReceived { | ||
expected: expected.into(), | ||
received: UnexpectedToken::Token(item.token.into()) | ||
}.starts_at(item.position).into() | ||
) | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
// This file is part of rust-web/twig | ||
// | ||
// For the copyright and license information, please view the LICENSE | ||
// file that was distributed with this source code. | ||
|
||
/*! | ||
Twig lexer. | ||
|
||
Produces a token stream from source template. | ||
|
||
# Summary | ||
|
||
This module is capable of taking a Twig input template, for example, this one: | ||
|
||
```twig | ||
Hello | ||
{% if world %} | ||
world | ||
{% else %} | ||
{{ other }} | ||
{% endif %} | ||
``` | ||
|
||
And chopping it into tokens like these: | ||
|
||
TODO: Example code for this. | ||
*/ | ||
|
||
mod lexing; | ||
mod error; | ||
|
||
pub use self::lexing::{ Tokens, ItemRef }; | ||
pub use self::error::{ LexingError, LexingResult }; | ||
|
||
/// Lexer options.
///
/// Currently a unit struct with no fields. `Options::default()` is provided
/// through the standard `Default` trait rather than an inherent method, so the
/// type also works with `Default::default()` and `..Default::default()`.
#[derive(Debug, Default, Copy, Clone)]
pub struct Options;
|
||
/// Twig lexer. Currently a unit struct that holds no state. | ||
pub struct Lexer; | ||
|
||
impl Lexer { | ||
/// Create a new lexer from options and operator list. | ||
/// | ||
/// Both arguments are currently ignored. | ||
pub fn new(options: Options, operators: Vec<&'static str>) -> Lexer { | ||
Lexer | ||
} | ||
|
||
/// Get a lexed stream of tokens. | ||
/// | ||
/// The returned iterator borrows this lexer for `'i` and `code` for `'t`. | ||
pub fn tokens<'i, 't>(&'i self, code: &'t str) -> Tokens<'i, 't> { | ||
Tokens::new(self, code) | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
// This file is part of rust-web/twig | ||
// | ||
// For the copyright and license information, please view the LICENSE | ||
// file that was distributed with this source code. | ||
|
||
/*! | ||
Twig extension writer's API. | ||
*/ | ||
|
||
pub mod tokens; | ||
pub mod lexer; | ||
pub mod error; | ||
|
||
/// Line-column position in a file.
///
/// A small `Copy` value that compares field by field, so positions can be
/// checked with `==` and `assert_eq!`. Note that `Position::default()` is
/// line 0, column 0.
#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)]
pub struct Position {
    /// Line number.
    pub line: usize,
    /// Column number.
    pub column: usize,
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
// This file is part of rust-web/twig | ||
// | ||
// For the copyright and license information, please view the LICENSE | ||
// file that was distributed with this source code. | ||
|
||
/*! | ||
Tokens, received from lexer output. | ||
*/ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I read that multi-line comments are not rust-standard - but I can see the motivation to make life easier here. I plan to have a look at rustfmt. Maybe this can help with rust-like formatting and convert things like this? If not now - possibly in the future. :-) Of course this is just reasoning about aesthetics. ^^ There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. They tried to remove them without success several times - I think at this point it is unlikely to happen :D |
||
|
||
/// Token value.
///
/// The lifetime of this token refers to the original source string, which
/// should be kept alive as long as this token.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum TokenRef<'a> {
    /// Start of a block.
    BlockStart,
    /// A name, borrowed from the source.
    Name(&'a str),
    /// A piece of text, borrowed from the source.
    Text(&'a str),
}

impl<'a> From<TokenRef<'a>> for Token {
    /// Get owned value for this token.
    ///
    /// Borrowed string payloads are copied into freshly allocated `String`s.
    fn from(other: TokenRef<'a>) -> Self {
        match other {
            TokenRef::BlockStart => Token::BlockStart,
            TokenRef::Name(v) => Token::Name(v.into()),
            TokenRef::Text(v) => Token::Text(v.into()),
        }
    }
}

/// Owned token value.
///
/// Mirrors `TokenRef` variant for variant, but owns its strings, so it can
/// outlive the source it was lexed from.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// Start of a block.
    BlockStart,
    /// A name.
    Name(String),
    /// A piece of text.
    Text(String),
}
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -61,3 +61,4 @@ | |
|
||
#[macro_use] | ||
pub mod error; | ||
pub mod api; |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
// This file is part of rust-web/twig | ||
// | ||
// For the copyright and license information, please view the LICENSE | ||
// file that was distributed with this source code. | ||
|
||
use twig::api::lexer::{ Lexer, Tokens, Options }; | ||
use twig::api::tokens::TokenRef; | ||
|
||
/// The lexer yields a block start followed by the name `§` for `{% § %}`. | ||
#[test] | ||
fn name_label_for_tag() { | ||
let lexer = Lexer::new(Options::default(), Vec::new()); | ||
let source = "{% § %}"; | ||
let mut tokens = lexer.tokens(source); | ||
|
||
expect(&mut tokens, TokenRef::BlockStart); | ||
expect(&mut tokens, TokenRef::Name("§")); | ||
} | ||
|
||
fn expect<'i, 'c>(stream: &mut Tokens<'i, 'c>, token_value: TokenRef<'c>) { | ||
if let Err(e) = stream.expect(token_value) { | ||
panic!("Received error {:?}", e); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
// This file is part of rust-web/twig | ||
// | ||
// For the copyright and license information, please view the LICENSE | ||
// file that was distributed with this source code. | ||
|
||
extern crate twig; | ||
|
||
mod lexer; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok :-)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I decided to raise the bar up to Rust level when it comes to error location display :)