11lexer grammar RustLexer;
22
3+ @lexer::members {
4+ public boolean is_at(int pos) {
5+ return _input.index() == pos;
6+ }
7+ }
8+
9+
310tokens {
411 EQ, LT, LE, EQEQ, NE, GE, GT, ANDAND, OROR, NOT, TILDE, PLUT,
512 MINUS, STAR, SLASH, PERCENT, CARET, AND, OR, SHL, SHR, BINOP,
@@ -8,14 +15,10 @@ tokens {
815 LBRACE, RBRACE, POUND, DOLLAR, UNDERSCORE, LIT_CHAR,
916 LIT_INTEGER, LIT_FLOAT, LIT_STR, LIT_STR_RAW, LIT_BINARY,
1017 LIT_BINARY_RAW, IDENT, LIFETIME, WHITESPACE, DOC_COMMENT,
11- COMMENT
18+ COMMENT, SHEBANG
1219}
1320
14- /* Note: due to antlr limitations, we can't represent XID_start and
15- * XID_continue properly. ASCII-only substitute. */
16-
17- fragment XID_start : [_a-zA-Z ] ;
18- fragment XID_continue : [_a-zA-Z0 -9] ;
21+ import xidstart , xidcontinue;
1922
2023
2124/* Expression-operator symbols */
@@ -90,94 +93,63 @@ fragment CHAR_ESCAPE
9093 | [xX] HEXIT HEXIT
9194 | ' u' HEXIT HEXIT HEXIT HEXIT
9295 | ' U ' HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT
96+ | ' u{' HEXIT ' } '
97+ | ' u{' HEXIT HEXIT ' } '
98+ | ' u{' HEXIT HEXIT HEXIT ' } '
99+ | ' u{' HEXIT HEXIT HEXIT HEXIT ' } '
100+ | ' u{' HEXIT HEXIT HEXIT HEXIT HEXIT ' } '
101+ | ' u{' HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT ' } '
93102 ;
94103
95104fragment SUFFIX
96105 : IDENT
97106 ;
98107
108+ fragment INTEGER_SUFFIX
109+ : { _input.LA(1) != ' e' && _input.LA(1) != ' E ' }? SUFFIX
110+ ;
111+
99112LIT_CHAR
100- : ' \' ' ( ' \\ ' CHAR_ESCAPE | ~[\\' \n\t\r ] ) ' \' ' SUFFIX ?
113+ : ' \' ' ( ' \\ ' CHAR_ESCAPE
114+ | ~[\\' \n\t\r ]
115+ | ' \ud800' .. ' \udbff' ' \udc00' .. ' \udfff'
116+ )
117+ ' \' ' SUFFIX ?
101118 ;
102119
103120LIT_BYTE
104- : ' b\' ' ( ' \\ ' ( [xX] HEXIT HEXIT | [nrt\\' "0] ) | ~[\\ ' \n\t\r] ) ' \' ' SUFFIX ?
121+ : ' b\' ' ( ' \\ ' ( [xX] HEXIT HEXIT
122+ | [nrt\\' "0] )
123+ | ~[\\ ' \n\t\r] ' \udc00 ' ..' \udfff ' ?
124+ )
125+ ' \' ' SUFFIX ?
105126 ;
106127
107128LIT_INTEGER
108- : [0-9][0-9_]* SUFFIX ?
109- | ' 0b' [01][01_]* SUFFIX ?
110- | ' 0o' [0-7][0-7_]* SUFFIX ?
111- | ' 0x' [0-9a-fA-F ][0-9a-fA-F_ ]* SUFFIX ?
129+
130+ : [0-9][0-9_]* INTEGER_SUFFIX ?
131+ | ' 0b' [01_]+ INTEGER_SUFFIX ?
132+ | ' 0o' [0-7_]+ INTEGER_SUFFIX ?
133+ | ' 0x' [0-9a-fA-F_ ]+ INTEGER_SUFFIX ?
112134 ;
113135
114136LIT_FLOAT
115137 : [0-9][0-9_]* (' .' {
116- /* dot followed by another dot is a range, no float */
138+ /* dot followed by another dot is a range, not a float */
117139 _input.LA(1 ) != ' .' &&
118- /* dot followed by an identifier is an integer with a function call, no float */
140+ /* dot followed by an identifier is an integer with a function call, not a float */
119141 _input.LA(1 ) != ' _' &&
120- _input.LA(1 ) != ' a' &&
121- _input.LA(1 ) != ' b' &&
122- _input.LA(1 ) != ' c' &&
123- _input.LA(1 ) != ' d' &&
124- _input.LA(1 ) != ' e' &&
125- _input.LA(1 ) != ' f' &&
126- _input.LA(1 ) != ' g' &&
127- _input.LA(1 ) != ' h' &&
128- _input.LA(1 ) != ' i' &&
129- _input.LA(1 ) != ' j' &&
130- _input.LA(1 ) != ' k' &&
131- _input.LA(1 ) != ' l' &&
132- _input.LA(1 ) != ' m' &&
133- _input.LA(1 ) != ' n' &&
134- _input.LA(1 ) != ' o' &&
135- _input.LA(1 ) != ' p' &&
136- _input.LA(1 ) != ' q' &&
137- _input.LA(1 ) != ' r' &&
138- _input.LA(1 ) != ' s' &&
139- _input.LA(1 ) != ' t' &&
140- _input.LA(1 ) != ' u' &&
141- _input.LA(1 ) != ' v' &&
142- _input.LA(1 ) != ' w' &&
143- _input.LA(1 ) != ' x' &&
144- _input.LA(1 ) != ' y' &&
145- _input.LA(1 ) != ' z' &&
146- _input.LA(1 ) != ' A' &&
147- _input.LA(1 ) != ' B' &&
148- _input.LA(1 ) != ' C' &&
149- _input.LA(1 ) != ' D' &&
150- _input.LA(1 ) != ' E' &&
151- _input.LA(1 ) != ' F' &&
152- _input.LA(1 ) != ' G' &&
153- _input.LA(1 ) != ' H' &&
154- _input.LA(1 ) != ' I' &&
155- _input.LA(1 ) != ' J' &&
156- _input.LA(1 ) != ' K' &&
157- _input.LA(1 ) != ' L' &&
158- _input.LA(1 ) != ' M' &&
159- _input.LA(1 ) != ' N' &&
160- _input.LA(1 ) != ' O' &&
161- _input.LA(1 ) != ' P' &&
162- _input.LA(1 ) != ' Q' &&
163- _input.LA(1 ) != ' R' &&
164- _input.LA(1 ) != ' S' &&
165- _input.LA(1 ) != ' T' &&
166- _input.LA(1 ) != ' U' &&
167- _input.LA(1 ) != ' V' &&
168- _input.LA(1 ) != ' W' &&
169- _input.LA(1 ) != ' X' &&
170- _input.LA(1 ) != ' Y' &&
171- _input.LA(1 ) != ' Z'
142+ !(_input.LA(1 ) >= ' a' && _input.LA(1 ) <= ' z' ) &&
143+ !(_input.LA(1 ) >= ' A' && _input.LA(1 ) <= ' Z' )
172144 } ? | (' .' [0-9][0-9_]*)? ([eE] [-+]? [0-9][0-9_]*)? SUFFIX ?)
173145 ;
174146
175147LIT_STR
176148 : ' "' (' \\\n ' | ' \\\r\n ' | ' \\ ' CHAR_ESCAPE | .)*? ' "' SUFFIX ?
177149 ;
178150
179- LIT_BINARY : ' b' LIT_STR SUFFIX ? ;
180- LIT_BINARY_RAW : ' rb ' LIT_STR_RAW SUFFIX ? ;
151+ LIT_BINARY : ' b' LIT_STR ;
152+ LIT_BINARY_RAW : ' b ' LIT_STR_RAW ;
181153
182154/* this is a bit messy */
183155
@@ -197,21 +169,27 @@ LIT_STR_RAW
197169
198170QUESTION : ' ?' ;
199171
200- IDENT : XID_start XID_continue * ;
172+ IDENT : XID_Start XID_Continue * ;
201173
202174fragment QUESTION_IDENTIFIER : QUESTION ? IDENT ;
203175
204176LIFETIME : ' \' ' IDENT ;
205177
206178WHITESPACE : [ \r\n\t]+ ;
207179
208- UNDOC_COMMENT : ' ////' ~[\r\ n]* -> type(COMMENT ) ;
180+ UNDOC_COMMENT : ' ////' ~[\n]* -> type(COMMENT ) ;
209181YESDOC_COMMENT : ' ///' ~[\r\n]* -> type(DOC_COMMENT ) ;
210182OUTER_DOC_COMMENT : ' //!' ~[\r\n]* -> type(DOC_COMMENT ) ;
211- LINE_COMMENT : ' //' ~[\r \n]* -> type(COMMENT ) ;
183+ LINE_COMMENT : ' //' ( ~[/\n] ~[ \n]* )? -> type(COMMENT ) ;
212184
213185DOC_BLOCK_COMMENT
214186 : (' /**' ~[*] | ' /*!' ) (DOC_BLOCK_COMMENT | .)*? ' */' -> type(DOC_COMMENT )
215187 ;
216188
217189BLOCK_COMMENT : ' /*' (BLOCK_COMMENT | .)*? ' */' -> type(COMMENT ) ;
190+
191+ /* these appear at the beginning of a file */
192+
193+ SHEBANG : ' #!' { is_at(2 ) && _input.LA(1 ) != ' [' } ? ~[\r\n]* -> type(SHEBANG ) ;
194+
195+ UTF8_BOM : ' \ufeff ' { is_at(1 ) } ? -> skip ;
0 commit comments