88// option. This file may not be copied, modified, or distributed
99// except according to those terms.
1010
11- #![ feature( plugin) ]
12-
13- #![ allow( unstable) ]
11+ #![ feature( plugin, rustc_private, str_char, collections) ]
1412
1513extern crate syntax;
1614extern crate rustc;
@@ -19,14 +17,18 @@ extern crate rustc;
1917extern crate log;
2018
2119use std:: collections:: HashMap ;
22- use std:: io:: File ;
20+ use std:: env;
21+ use std:: fs:: File ;
22+ use std:: io:: { BufRead , Read } ;
23+ use std:: path:: Path ;
2324
2425use syntax:: parse;
2526use syntax:: parse:: lexer;
2627use rustc:: session:: { self , config} ;
2728
2829use syntax:: ast;
2930use syntax:: ast:: Name ;
31+ use syntax:: codemap;
3032use syntax:: codemap:: Pos ;
3133use syntax:: parse:: token;
3234use syntax:: parse:: lexer:: TokenAndSpan ;
@@ -108,6 +110,7 @@ fn parse_token_list(file: &str) -> HashMap<String, token::Token> {
108110 "LIT_BINARY" => token:: Literal ( token:: Binary ( Name ( 0 ) ) , None ) ,
109111 "LIT_BINARY_RAW" => token:: Literal ( token:: BinaryRaw ( Name ( 0 ) , 0 ) , None ) ,
110112 "QUESTION" => token:: Question ,
113+ "SHEBANG" => token:: Shebang ( Name ( 0 ) ) ,
111114 _ => continue ,
112115 } ;
113116
@@ -166,24 +169,26 @@ fn count(lit: &str) -> usize {
166169 lit. chars ( ) . take_while ( |c| * c == '#' ) . count ( )
167170}
168171
169- fn parse_antlr_token ( s : & str , tokens : & HashMap < String , token:: Token > , surrogate_pairs_pos : & [ usize ] )
172+ fn parse_antlr_token ( s : & str , tokens : & HashMap < String , token:: Token > , surrogate_pairs_pos : & [ usize ] ,
173+ has_bom : bool )
170174 -> TokenAndSpan {
171175 // old regex:
172176 // \[@(?P<seq>\d+),(?P<start>\d+):(?P<end>\d+)='(?P<content>.+?)',<(?P<toknum>-?\d+)>,\d+:\d+]
173- let start = s. find_str ( "[@" ) . unwrap ( ) ;
174- let comma = start + s[ start..] . find_str ( "," ) . unwrap ( ) ;
175- let colon = comma + s[ comma..] . find_str ( ":" ) . unwrap ( ) ;
176- let content_start = colon + s[ colon..] . find_str ( "='" ) . unwrap ( ) ;
177- let content_end = content_start + s[ content_start..] . find_str ( "',<" ) . unwrap ( ) ;
178- let toknum_end = content_end + s[ content_end..] . find_str ( ">," ) . unwrap ( ) ;
177+ let start = s. find ( "[@" ) . unwrap ( ) ;
178+ let comma = start + s[ start..] . find ( "," ) . unwrap ( ) ;
179+ let colon = comma + s[ comma..] . find ( ":" ) . unwrap ( ) ;
180+ let content_start = colon + s[ colon..] . find ( "='" ) . unwrap ( ) ;
181+ // Use rfind instead of find: the token content may itself contain "',<", and we must not stop at that earlier occurrence
182+ let content_end = content_start + s[ content_start..] . rfind ( "',<" ) . unwrap ( ) ;
183+ let toknum_end = content_end + s[ content_end..] . find ( ">," ) . unwrap ( ) ;
179184
180185 let start = & s[ comma + 1 .. colon] ;
181186 let end = & s[ colon + 1 .. content_start] ;
182187 let content = & s[ content_start + 2 .. content_end] ;
183188 let toknum = & s[ content_end + 3 .. toknum_end] ;
184189
185- let proto_tok = tokens . get ( toknum ) . expect ( format ! ( "didn't find token {:?} in the map" ,
186- toknum) ) ;
190+ let not_found = format ! ( "didn't find token {:?} in the map" , toknum ) ;
191+ let proto_tok = tokens . get ( toknum) . expect ( & not_found [ .. ] ) ;
187192
188193 let nm = parse:: token:: intern ( content) ;
189194
@@ -209,24 +214,25 @@ fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>, surrogate_
209214 ref t => t. clone ( )
210215 } ;
211216
212- let offset = if real_tok == token:: Eof
213- {
217+ let start_offset = if real_tok == token:: Eof {
214218 1
215219 } else {
216220 0
217221 } ;
218222
219- let mut lo = start. parse :: < u32 > ( ) . unwrap ( ) - offset;
220- let mut hi = end. parse :: < u32 > ( ) . unwrap ( ) + 1 ;
223+ let offset = if has_bom { 1 } else { 0 } ;
224+
225+ let mut lo = start. parse :: < u32 > ( ) . unwrap ( ) - start_offset - offset;
226+ let mut hi = end. parse :: < u32 > ( ) . unwrap ( ) + 1 - offset;
221227
222228 // Adjust the span: For each surrogate pair already encountered, subtract one position.
223229 lo -= surrogate_pairs_pos. binary_search ( & ( lo as usize ) ) . unwrap_or_else ( |x| x) as u32 ;
224230 hi -= surrogate_pairs_pos. binary_search ( & ( hi as usize ) ) . unwrap_or_else ( |x| x) as u32 ;
225231
226- let sp = syntax :: codemap:: Span {
227- lo : syntax :: codemap:: BytePos ( lo) ,
228- hi : syntax :: codemap:: BytePos ( hi) ,
229- expn_id : syntax :: codemap:: NO_EXPANSION
232+ let sp = codemap:: Span {
233+ lo : codemap:: BytePos ( lo) ,
234+ hi : codemap:: BytePos ( hi) ,
235+ expn_id : codemap:: NO_EXPANSION
230236 } ;
231237
232238 TokenAndSpan {
@@ -245,10 +251,10 @@ fn tok_cmp(a: &token::Token, b: &token::Token) -> bool {
245251 }
246252}
247253
248- fn span_cmp ( antlr_sp : syntax :: codemap:: Span , rust_sp : syntax :: codemap:: Span , cm : & syntax :: codemap:: CodeMap ) -> bool {
254+ fn span_cmp ( antlr_sp : codemap:: Span , rust_sp : codemap:: Span , cm : & codemap:: CodeMap ) -> bool {
249255 antlr_sp. expn_id == rust_sp. expn_id &&
250- antlr_sp. lo . to_uint ( ) == cm. bytepos_to_file_charpos ( rust_sp. lo ) . to_uint ( ) &&
251- antlr_sp. hi . to_uint ( ) == cm. bytepos_to_file_charpos ( rust_sp. hi ) . to_uint ( )
256+ antlr_sp. lo . to_usize ( ) == cm. bytepos_to_file_charpos ( rust_sp. lo ) . to_usize ( ) &&
257+ antlr_sp. hi . to_usize ( ) == cm. bytepos_to_file_charpos ( rust_sp. hi ) . to_usize ( )
252258}
253259
254260fn main ( ) {
@@ -257,10 +263,15 @@ fn main() {
257263 r. next_token ( )
258264 }
259265
260- let args = std:: os:: args ( ) ;
266+ let mut args = env:: args ( ) . skip ( 1 ) ;
267+ let filename = args. next ( ) . unwrap ( ) ;
268+ if filename. find ( "parse-fail" ) . is_some ( ) {
269+ return ;
270+ }
261271
262272 // Rust's lexer
263- let code = File :: open ( & Path :: new ( args[ 1 ] ) ) . unwrap ( ) . read_to_string ( ) . unwrap ( ) ;
273+ let mut code = String :: new ( ) ;
274+ File :: open ( & Path :: new ( & filename) ) . unwrap ( ) . read_to_string ( & mut code) . unwrap ( ) ;
264275
265276 let surrogate_pairs_pos: Vec < usize > = code. chars ( ) . enumerate ( )
266277 . filter ( |& ( _, c) | c as usize > 0xFFFF )
@@ -269,6 +280,8 @@ fn main() {
269280 . map ( |( x, n) | x + n)
270281 . collect ( ) ;
271282
283+ let has_bom = code. starts_with ( "\u{feff} " ) ;
284+
272285 debug ! ( "Pairs: {:?}" , surrogate_pairs_pos) ;
273286
274287 let options = config:: basic_options ( ) ;
@@ -281,15 +294,18 @@ fn main() {
281294 let ref cm = lexer. span_diagnostic . cm ;
282295
283296 // ANTLR
284- let mut token_file = File :: open ( & Path :: new ( args[ 2 ] ) ) ;
285- let token_map = parse_token_list ( token_file. read_to_string ( ) . unwrap ( ) ) ;
297+ let mut token_file = File :: open ( & Path :: new ( & args. next ( ) . unwrap ( ) ) ) . unwrap ( ) ;
298+ let mut token_list = String :: new ( ) ;
299+ token_file. read_to_string ( & mut token_list) . unwrap ( ) ;
300+ let token_map = parse_token_list ( & token_list[ ..] ) ;
286301
287- let mut stdin = std:: io:: stdin ( ) ;
288- let mut lock = stdin. lock ( ) ;
302+ let stdin = std:: io:: stdin ( ) ;
303+ let lock = stdin. lock ( ) ;
289304 let lines = lock. lines ( ) ;
290- let mut antlr_tokens = lines. map ( |l| parse_antlr_token ( l. unwrap ( ) . trim ( ) ,
291- & token_map,
292- & surrogate_pairs_pos[ ] ) ) ;
305+ let antlr_tokens = lines. map ( |l| parse_antlr_token ( l. unwrap ( ) . trim ( ) ,
306+ & token_map,
307+ & surrogate_pairs_pos[ ..] ,
308+ has_bom) ) ;
293309
294310 for antlr_tok in antlr_tokens {
295311 let rustc_tok = next ( & mut lexer) ;
@@ -314,7 +330,7 @@ fn main() {
314330 }
315331 _ => panic!( "{:?} is not {:?}" , antlr_tok, rustc_tok)
316332 } , ) *
317- ref c => assert!( c == & antlr_tok. tok, "{:?} is not {:?}" , rustc_tok , antlr_tok )
333+ ref c => assert!( c == & antlr_tok. tok, "{:?} is not {:?}" , antlr_tok , rustc_tok )
318334 }
319335 )
320336 }
0 commit comments