Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Model lexer: Fix remaining issues #24620

Merged
merged 3 commits into from
Apr 21, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/grammar/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ javac *.java
rustc -O verify.rs
for file in ../*/**.rs; do
echo $file;
grun RustLexer tokens -tokens < $file | ./verify $file RustLexer.tokens || break
grun RustLexer tokens -tokens < "$file" | ./verify "$file" RustLexer.tokens || break
done
```

Expand Down
120 changes: 49 additions & 71 deletions src/grammar/RustLexer.g4
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
lexer grammar RustLexer;

@lexer::members {
public boolean is_at(int pos) {
return _input.index() == pos;
}
}


tokens {
EQ, LT, LE, EQEQ, NE, GE, GT, ANDAND, OROR, NOT, TILDE, PLUT,
MINUS, STAR, SLASH, PERCENT, CARET, AND, OR, SHL, SHR, BINOP,
Expand All @@ -8,14 +15,10 @@ tokens {
LBRACE, RBRACE, POUND, DOLLAR, UNDERSCORE, LIT_CHAR,
LIT_INTEGER, LIT_FLOAT, LIT_STR, LIT_STR_RAW, LIT_BINARY,
LIT_BINARY_RAW, IDENT, LIFETIME, WHITESPACE, DOC_COMMENT,
COMMENT
COMMENT, SHEBANG
}

/* Note: due to antlr limitations, we can't represent XID_start and
* XID_continue properly. ASCII-only substitute. */

fragment XID_start : [_a-zA-Z] ;
fragment XID_continue : [_a-zA-Z0-9] ;
import xidstart , xidcontinue;


/* Expression-operator symbols */
Expand Down Expand Up @@ -90,94 +93,63 @@ fragment CHAR_ESCAPE
| [xX] HEXIT HEXIT
| 'u' HEXIT HEXIT HEXIT HEXIT
| 'U' HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT
| 'u{' HEXIT '}'
| 'u{' HEXIT HEXIT '}'
| 'u{' HEXIT HEXIT HEXIT '}'
| 'u{' HEXIT HEXIT HEXIT HEXIT '}'
| 'u{' HEXIT HEXIT HEXIT HEXIT HEXIT '}'
| 'u{' HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT '}'
;

fragment SUFFIX
: IDENT
;

fragment INTEGER_SUFFIX
: { _input.LA(1) != 'e' && _input.LA(1) != 'E' }? SUFFIX
;

LIT_CHAR
: '\'' ( '\\' CHAR_ESCAPE | ~[\\'\n\t\r] ) '\'' SUFFIX?
: '\'' ( '\\' CHAR_ESCAPE
| ~[\\'\n\t\r]
| '\ud800' .. '\udbff' '\udc00' .. '\udfff'
)
'\'' SUFFIX?
;

LIT_BYTE
: 'b\'' ( '\\' ( [xX] HEXIT HEXIT | [nrt\\'"0] ) | ~[\\'\n\t\r] ) '\'' SUFFIX?
: 'b\'' ( '\\' ( [xX] HEXIT HEXIT
| [nrt\\'"0] )
| ~[\\'\n\t\r] '\udc00'..'\udfff'?
)
'\'' SUFFIX?
;

LIT_INTEGER
: [0-9][0-9_]* SUFFIX?
| '0b' [01][01_]* SUFFIX?
| '0o' [0-7][0-7_]* SUFFIX?
| '0x' [0-9a-fA-F][0-9a-fA-F_]* SUFFIX?

: [0-9][0-9_]* INTEGER_SUFFIX?
| '0b' [01_]+ INTEGER_SUFFIX?
| '0o' [0-7_]+ INTEGER_SUFFIX?
| '0x' [0-9a-fA-F_]+ INTEGER_SUFFIX?
;

LIT_FLOAT
: [0-9][0-9_]* ('.' {
/* dot followed by another dot is a range, no float */
/* dot followed by another dot is a range, not a float */
_input.LA(1) != '.' &&
/* dot followed by an identifier is an integer with a function call, no float */
/* dot followed by an identifier is an integer with a function call, not a float */
_input.LA(1) != '_' &&
_input.LA(1) != 'a' &&
_input.LA(1) != 'b' &&
_input.LA(1) != 'c' &&
_input.LA(1) != 'd' &&
_input.LA(1) != 'e' &&
_input.LA(1) != 'f' &&
_input.LA(1) != 'g' &&
_input.LA(1) != 'h' &&
_input.LA(1) != 'i' &&
_input.LA(1) != 'j' &&
_input.LA(1) != 'k' &&
_input.LA(1) != 'l' &&
_input.LA(1) != 'm' &&
_input.LA(1) != 'n' &&
_input.LA(1) != 'o' &&
_input.LA(1) != 'p' &&
_input.LA(1) != 'q' &&
_input.LA(1) != 'r' &&
_input.LA(1) != 's' &&
_input.LA(1) != 't' &&
_input.LA(1) != 'u' &&
_input.LA(1) != 'v' &&
_input.LA(1) != 'w' &&
_input.LA(1) != 'x' &&
_input.LA(1) != 'y' &&
_input.LA(1) != 'z' &&
_input.LA(1) != 'A' &&
_input.LA(1) != 'B' &&
_input.LA(1) != 'C' &&
_input.LA(1) != 'D' &&
_input.LA(1) != 'E' &&
_input.LA(1) != 'F' &&
_input.LA(1) != 'G' &&
_input.LA(1) != 'H' &&
_input.LA(1) != 'I' &&
_input.LA(1) != 'J' &&
_input.LA(1) != 'K' &&
_input.LA(1) != 'L' &&
_input.LA(1) != 'M' &&
_input.LA(1) != 'N' &&
_input.LA(1) != 'O' &&
_input.LA(1) != 'P' &&
_input.LA(1) != 'Q' &&
_input.LA(1) != 'R' &&
_input.LA(1) != 'S' &&
_input.LA(1) != 'T' &&
_input.LA(1) != 'U' &&
_input.LA(1) != 'V' &&
_input.LA(1) != 'W' &&
_input.LA(1) != 'X' &&
_input.LA(1) != 'Y' &&
_input.LA(1) != 'Z'
!(_input.LA(1) >= 'a' && _input.LA(1) <= 'z') &&
!(_input.LA(1) >= 'A' && _input.LA(1) <= 'Z')
}? | ('.' [0-9][0-9_]*)? ([eE] [-+]? [0-9][0-9_]*)? SUFFIX?)
;

LIT_STR
: '"' ('\\\n' | '\\\r\n' | '\\' CHAR_ESCAPE | .)*? '"' SUFFIX?
;

LIT_BINARY : 'b' LIT_STR SUFFIX?;
LIT_BINARY_RAW : 'rb' LIT_STR_RAW SUFFIX?;
LIT_BINARY : 'b' LIT_STR ;
LIT_BINARY_RAW : 'b' LIT_STR_RAW ;

/* this is a bit messy */

Expand All @@ -197,21 +169,27 @@ LIT_STR_RAW

QUESTION : '?';

IDENT : XID_start XID_continue* ;
IDENT : XID_Start XID_Continue* ;

fragment QUESTION_IDENTIFIER : QUESTION? IDENT;

LIFETIME : '\'' IDENT ;

WHITESPACE : [ \r\n\t]+ ;

UNDOC_COMMENT : '////' ~[\r\n]* -> type(COMMENT) ;
UNDOC_COMMENT : '////' ~[\n]* -> type(COMMENT) ;
YESDOC_COMMENT : '///' ~[\r\n]* -> type(DOC_COMMENT) ;
OUTER_DOC_COMMENT : '//!' ~[\r\n]* -> type(DOC_COMMENT) ;
LINE_COMMENT : '//' ~[\r\n]* -> type(COMMENT) ;
LINE_COMMENT : '//' ( ~[/\n] ~[\n]* )? -> type(COMMENT) ;

DOC_BLOCK_COMMENT
: ('/**' ~[*] | '/*!') (DOC_BLOCK_COMMENT | .)*? '*/' -> type(DOC_COMMENT)
;

BLOCK_COMMENT : '/*' (BLOCK_COMMENT | .)*? '*/' -> type(COMMENT) ;

/* these appear at the beginning of a file */

SHEBANG : '#!' { is_at(2) && _input.LA(1) != '[' }? ~[\r\n]* -> type(SHEBANG) ;

UTF8_BOM : '\ufeff' { is_at(1) }? -> skip ;
8 changes: 4 additions & 4 deletions src/grammar/check.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@ failed=0
skipped=0

check() {
grep --silent "// ignore-lexer-test" $1;
grep --silent "// ignore-lexer-test" "$1";

# if it's *not* found...
if [ $? -eq 1 ]; then
cd $2 # This `cd` is so java will pick up RustLexer.class. I couldn't
# figure out how to wrangle the CLASSPATH, just adding build/grammr didn't
# seem to have anny effect.
# figure out how to wrangle the CLASSPATH, just adding build/grammar
# didn't seem to have any effect.
if $3 RustLexer tokens -tokens < $1 | $4 $1 $5; then
echo "pass: $1"
passed=`expr $passed + 1`
Expand All @@ -39,7 +39,7 @@ check() {
}

for file in $(find $1 -iname '*.rs' ! -path '*/test/compile-fail*'); do
check $file $2 $3 $4 $5
check "$file" $2 $3 $4 $5
done

printf "\ntest result: "
Expand Down
Loading