@@ -35,8 +35,8 @@ pub use unicode_xid::UNICODE_VERSION as UNICODE_XID_VERSION;
3535
3636use self :: LiteralKind :: * ;
3737use self :: TokenKind :: * ;
38- pub use crate :: cursor:: Cursor ;
3938use crate :: cursor:: EOF_CHAR ;
39+ pub use crate :: cursor:: { Cursor , FrontmatterAllowed } ;
4040
4141/// Parsed token.
4242/// It doesn't contain information about data that has been parsed,
@@ -57,17 +57,27 @@ impl Token {
5757#[ derive( Clone , Copy , Debug , PartialEq , Eq ) ]
5858pub enum TokenKind {
5959 /// A line comment, e.g. `// comment`.
60- LineComment { doc_style : Option < DocStyle > } ,
60+ LineComment {
61+ doc_style : Option < DocStyle > ,
62+ } ,
6163
6264 /// A block comment, e.g. `/* block comment */`.
6365 ///
6466 /// Block comments can be recursive, so a sequence like `/* /* */`
6567 /// will not be considered terminated and will result in a parsing error.
66- BlockComment { doc_style : Option < DocStyle > , terminated : bool } ,
68+ BlockComment {
69+ doc_style : Option < DocStyle > ,
70+ terminated : bool ,
71+ } ,
6772
6873 /// Any whitespace character sequence.
6974 Whitespace ,
7075
76+ Frontmatter {
77+ has_invalid_preceding_whitespace : bool ,
78+ invalid_infostring : bool ,
79+ } ,
80+
7181 /// An identifier or keyword, e.g. `ident` or `continue`.
7282 Ident ,
7383
@@ -109,10 +119,15 @@ pub enum TokenKind {
109119 /// this type will need to check for and reject that case.
110120 ///
111121 /// See [LiteralKind] for more details.
112- Literal { kind : LiteralKind , suffix_start : u32 } ,
122+ Literal {
123+ kind : LiteralKind ,
124+ suffix_start : u32 ,
125+ } ,
113126
114127 /// A lifetime, e.g. `'a`.
115- Lifetime { starts_with_number : bool } ,
128+ Lifetime {
129+ starts_with_number : bool ,
130+ } ,
116131
117132 /// `;`
118133 Semi ,
@@ -280,7 +295,7 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
280295#[ inline]
281296pub fn validate_raw_str ( input : & str , prefix_len : u32 ) -> Result < ( ) , RawStrError > {
282297 debug_assert ! ( !input. is_empty( ) ) ;
283- let mut cursor = Cursor :: new ( input) ;
298+ let mut cursor = Cursor :: new ( input, FrontmatterAllowed :: No ) ;
284299 // Move past the leading `r` or `br`.
285300 for _ in 0 ..prefix_len {
286301 cursor. bump ( ) . unwrap ( ) ;
@@ -290,7 +305,7 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError>
290305
291306/// Creates an iterator that produces tokens from the input string.
292307pub fn tokenize ( input : & str ) -> impl Iterator < Item = Token > {
293- let mut cursor = Cursor :: new ( input) ;
308+ let mut cursor = Cursor :: new ( input, FrontmatterAllowed :: No ) ;
294309 std:: iter:: from_fn ( move || {
295310 let token = cursor. advance_token ( ) ;
296311 if token. kind != TokenKind :: Eof { Some ( token) } else { None }
@@ -361,7 +376,34 @@ impl Cursor<'_> {
361376 Some ( c) => c,
362377 None => return Token :: new ( TokenKind :: Eof , 0 ) ,
363378 } ;
379+
364380 let token_kind = match first_char {
381+ c if matches ! ( self . frontmatter_allowed, FrontmatterAllowed :: Yes )
382+ && is_whitespace ( c) =>
383+ {
384+ let mut last = first_char;
385+ while is_whitespace ( self . first ( ) ) {
386+ let Some ( c) = self . bump ( ) else {
387+ break ;
388+ } ;
389+ last = c;
390+ }
391+ // invalid frontmatter opening as whitespace preceding it isn't newline.
392+ // combine the whitespace and the frontmatter to a single token as we shall
393+ // error later.
394+ if last != '\n' && self . as_str ( ) . starts_with ( "---" ) {
395+ self . bump ( ) ;
396+ self . frontmatter ( true )
397+ } else {
398+ Whitespace
399+ }
400+ }
401+ '-' if matches ! ( self . frontmatter_allowed, FrontmatterAllowed :: Yes )
402+ && self . as_str ( ) . starts_with ( "--" ) =>
403+ {
404+ // happy path
405+ self . frontmatter ( false )
406+ }
365407 // Slash, comment or block comment.
366408 '/' => match self . first ( ) {
367409 '/' => self . line_comment ( ) ,
@@ -464,11 +506,110 @@ impl Cursor<'_> {
464506 c if !c. is_ascii ( ) && c. is_emoji_char ( ) => self . invalid_ident ( ) ,
465507 _ => Unknown ,
466508 } ;
509+ if matches ! ( self . frontmatter_allowed, FrontmatterAllowed :: Yes )
510+ && !matches ! ( token_kind, Whitespace )
511+ {
512+ // stop allowing frontmatters after first non-whitespace token
513+ self . frontmatter_allowed = FrontmatterAllowed :: No ;
514+ }
467515 let res = Token :: new ( token_kind, self . pos_within_token ( ) ) ;
468516 self . reset_pos_within_token ( ) ;
469517 res
470518 }
471519
520+ /// Given that one `-` was eaten, eat the rest of the frontmatter.
521+ fn frontmatter ( & mut self , has_invalid_preceding_whitespace : bool ) -> TokenKind {
522+ debug_assert_eq ! ( '-' , self . prev( ) ) ;
523+
524+ let pos = self . pos_within_token ( ) ;
525+ self . eat_while ( |c| c == '-' ) ;
526+
527+ // one `-` is eaten by the caller.
528+ let length_opening = self . pos_within_token ( ) - pos + 1 ;
529+
530+ // must be ensured by the caller
531+ debug_assert ! ( length_opening >= 3 ) ;
532+
533+ // whitespace between the opening and the infostring.
534+ self . eat_while ( |ch| ch != '\n' && is_whitespace ( ch) ) ;
535+
536+ // copied from `eat_identifier`, but allows `.` in infostring to allow something like
537+ // `---Cargo.toml` as a valid opener
538+ if is_id_start ( self . first ( ) ) {
539+ self . bump ( ) ;
540+ self . eat_while ( |c| is_id_continue ( c) || c == '.' ) ;
541+ }
542+
543+ self . eat_while ( |ch| ch != '\n' && is_whitespace ( ch) ) ;
544+ let invalid_infostring = self . first ( ) != '\n' ;
545+
546+ let mut s = self . as_str ( ) ;
547+ let mut found = false ;
548+ while let Some ( closing) = s. find ( & "-" . repeat ( length_opening as usize ) ) {
549+ let preceding_chars_start = s[ ..closing] . rfind ( "\n " ) . map_or ( 0 , |i| i + 1 ) ;
550+ if s[ preceding_chars_start..closing] . chars ( ) . all ( is_whitespace) {
551+ // candidate found
552+ self . bump_bytes ( closing) ;
553+ // in case like
554+ // ---cargo
555+ // --- blahblah
556+ // or
557+ // ---cargo
558+ // ----
559+ // combine those stuff into this frontmatter token such that it gets detected later.
560+ self . eat_until ( b'\n' ) ;
561+ found = true ;
562+ break ;
563+ } else {
564+ s = & s[ closing + length_opening as usize ..] ;
565+ }
566+ }
567+
568+ if !found {
569+ // recovery strategy: a closing statement might have precending whitespace/newline
570+ // but not have enough dashes to properly close. In this case, we eat until there,
571+ // and report a mismatch in the parser.
572+ let mut rest = self . as_str ( ) ;
573+ // We can look for a shorter closing (starting with four dashes but closing with three)
574+ // and other indications that Rust has started and the infostring has ended.
575+ let mut potential_closing = rest
576+ . find ( "\n ---" )
577+ // n.b. only in the case where there are dashes, we move the index to the line where
578+ // the dashes start as we eat to include that line. For other cases those are Rust code
579+ // and not included in the frontmatter.
580+ . map ( |x| x + 1 )
581+ . or_else ( || rest. find ( "\n use " ) )
582+ . or_else ( || rest. find ( "\n //!" ) )
583+ . or_else ( || rest. find ( "\n #![" ) ) ;
584+
585+ if potential_closing. is_none ( ) {
586+ // a less fortunate recovery if all else fails which finds any dashes preceded by whitespace
587+ // on a standalone line. Might be wrong.
588+ while let Some ( closing) = rest. find ( "---" ) {
589+ let preceding_chars_start = rest[ ..closing] . rfind ( "\n " ) . map_or ( 0 , |i| i + 1 ) ;
590+ if rest[ preceding_chars_start..closing] . chars ( ) . all ( is_whitespace) {
591+ // candidate found
592+ potential_closing = Some ( closing) ;
593+ break ;
594+ } else {
595+ rest = & rest[ closing + 3 ..] ;
596+ }
597+ }
598+ }
599+
600+ if let Some ( potential_closing) = potential_closing {
601+ // bump to the potential closing, and eat everything on that line.
602+ self . bump_bytes ( potential_closing) ;
603+ self . eat_until ( b'\n' ) ;
604+ } else {
605+ // eat everything. this will get reported as an unclosed frontmatter.
606+ self . eat_while ( |_| true ) ;
607+ }
608+ }
609+
610+ Frontmatter { has_invalid_preceding_whitespace, invalid_infostring }
611+ }
612+
472613 fn line_comment ( & mut self ) -> TokenKind {
473614 debug_assert ! ( self . prev( ) == '/' && self . first( ) == '/' ) ;
474615 self . bump ( ) ;
0 commit comments