@@ -544,11 +544,7 @@ pub struct StrSearcher<'a, 'b> {
// Internal search state held by `StrSearcher`: either the empty-needle state
// (`Empty`) or a Two-Way substring searcher (`TwoWay`).
//
// Reading this diff: the `-` lines are the previous struct-like `TwoWay`
// variant, which additionally cached one pending forward match and one pending
// backward match next to the searcher. The `+` line replaces it with a tuple
// variant that carries only the `TwoWaySearcher`.
544544#[ derive( Clone , Debug ) ]
545545enum StrSearcherImpl {
// State used for an empty needle (presumably; `EmptyNeedle` is defined outside this view).
546546 Empty ( EmptyNeedle ) ,
547- TwoWay {
548- last_match_fw : Option < ( usize , usize ) > ,
549- last_match_bw : Option < ( usize , usize ) > ,
550- searcher : TwoWaySearcher ,
551- }
// State of the Two-Way algorithm, built from the needle bytes and the haystack length.
547+ TwoWay ( TwoWaySearcher ) ,
552548}
553549
554550#[ derive( Clone , Debug ) ]
@@ -576,11 +572,9 @@ impl<'a, 'b> StrSearcher<'a, 'b> {
576572 StrSearcher {
577573 haystack : haystack,
578574 needle : needle,
579- searcher : StrSearcherImpl :: TwoWay {
580- last_match_fw : None ,
581- last_match_bw : None ,
582- searcher : TwoWaySearcher :: new ( needle. as_bytes ( ) , haystack. len ( ) )
583- } ,
575+ searcher : StrSearcherImpl :: TwoWay (
576+ TwoWaySearcher :: new ( needle. as_bytes ( ) , haystack. len ( ) )
577+ ) ,
584578 }
585579 }
586580 }
@@ -606,39 +600,55 @@ unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> {
606600 }
607601 }
608602 }
609- StrSearcherImpl :: TwoWay { ref mut last_match_fw , ref mut searcher, .. } => {
603+ StrSearcherImpl :: TwoWay ( ref mut searcher) => {
610604 // TwoWaySearcher produces valid *Match* indices that split at char boundaries
611605 // as long as it does correct matching and that haystack and needle are
612606 // valid UTF-8
613- // *Rejects* fall on the same indices (the intervals between matches)
614- // so they are always on character boundaries .
615- if let Some ( ( a , b ) ) = last_match_fw . take ( ) {
616- return SearchStep :: Match ( a , b ) ;
607+ // *Rejects* from the algorithm can fall on any indices, but we will walk them
608+ // manually to the next character boundary, so that they are utf-8 safe .
609+ if searcher . position == self . haystack . len ( ) {
610+ return SearchStep :: Done ;
617611 }
618- let last_pos = searcher. position ;
619612 let is_long = searcher. memory == usize:: MAX ;
620- let next_match = searcher. next ( self . haystack . as_bytes ( ) ,
621- self . needle . as_bytes ( ) ,
622- is_long) ;
623- match next_match {
624- None => if last_pos != self . haystack . len ( ) {
625- SearchStep :: Reject ( last_pos, self . haystack . len ( ) )
626- } else {
627- SearchStep :: Done
628- } ,
629- Some ( ( a, b) ) => {
630- if a == last_pos {
631- SearchStep :: Match ( a, b)
632- } else {
633- * last_match_fw = Some ( ( a, b) ) ;
634- SearchStep :: Reject ( last_pos, a)
613+ match searcher. next :: < RejectAndMatch > ( self . haystack . as_bytes ( ) ,
614+ self . needle . as_bytes ( ) ,
615+ is_long)
616+ {
617+ SearchStep :: Reject ( a, mut b) => {
618+ // skip to next char boundary
619+ while !self . haystack . is_char_boundary ( b) {
620+ b += 1 ;
635621 }
622+ searcher. position = cmp:: max ( b, searcher. position ) ;
623+ SearchStep :: Reject ( a, b)
636624 }
625+ otherwise => otherwise,
637626 }
638627 }
639628 }
640629 }
641630
631+ #[ inline]
632+ fn next_match ( & mut self ) -> Option < ( usize , usize ) > {
633+ match self . searcher {
634+ StrSearcherImpl :: Empty ( ..) => {
635+ loop {
636+ match self . next ( ) {
637+ SearchStep :: Match ( a, b) => return Some ( ( a, b) ) ,
638+ SearchStep :: Done => return None ,
639+ SearchStep :: Reject ( ..) => { }
640+ }
641+ }
642+ }
643+ StrSearcherImpl :: TwoWay ( ref mut searcher) => {
644+ let is_long = searcher. memory == usize:: MAX ;
645+ searcher. next :: < MatchOnly > ( self . haystack . as_bytes ( ) ,
646+ self . needle . as_bytes ( ) ,
647+ is_long)
648+ }
649+ }
650+ }
651+
642652}
643653unsafe impl < ' a , ' b > ReverseSearcher < ' a > for StrSearcher < ' a , ' b > {
644654 #[ inline]
@@ -657,31 +667,45 @@ unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
657667 }
658668 }
659669 }
660- StrSearcherImpl :: TwoWay { ref mut last_match_bw , ref mut searcher, .. } => {
661- if let Some ( ( a , b ) ) = last_match_bw . take ( ) {
662- return SearchStep :: Match ( a , b ) ;
670+ StrSearcherImpl :: TwoWay ( ref mut searcher) => {
671+ if searcher . end == 0 {
672+ return SearchStep :: Done ;
663673 }
664- let last_end = searcher. end ;
665- let next_match = searcher. next_back ( self . haystack . as_bytes ( ) ,
666- self . needle . as_bytes ( ) ) ;
667- match next_match {
668- None => if last_end != 0 {
669- SearchStep :: Reject ( 0 , last_end)
670- } else {
671- SearchStep :: Done
672- } ,
673- Some ( ( a, b) ) => {
674- if b == last_end {
675- SearchStep :: Match ( a, b)
676- } else {
677- * last_match_bw = Some ( ( a, b) ) ;
678- SearchStep :: Reject ( b, last_end)
674+ match searcher. next_back :: < RejectAndMatch > ( self . haystack . as_bytes ( ) ,
675+ self . needle . as_bytes ( ) )
676+ {
677+ SearchStep :: Reject ( mut a, b) => {
678+ // skip back to the previous char boundary
679+ while !self . haystack . is_char_boundary ( a) {
680+ a -= 1 ;
679681 }
682+ searcher. end = cmp:: min ( a, searcher. end ) ;
683+ SearchStep :: Reject ( a, b)
680684 }
685+ otherwise => otherwise,
681686 }
682687 }
683688 }
684689 }
690+
691+ #[ inline]
692+ fn next_match_back ( & mut self ) -> Option < ( usize , usize ) > {
693+ match self . searcher {
694+ StrSearcherImpl :: Empty ( ..) => {
695+ loop {
696+ match self . next_back ( ) {
697+ SearchStep :: Match ( a, b) => return Some ( ( a, b) ) ,
698+ SearchStep :: Done => return None ,
699+ SearchStep :: Reject ( ..) => { }
700+ }
701+ }
702+ }
703+ StrSearcherImpl :: TwoWay ( ref mut searcher) => {
704+ searcher. next_back :: < MatchOnly > ( self . haystack . as_bytes ( ) ,
705+ self . needle . as_bytes ( ) )
706+ }
707+ }
708+ }
685709}
686710
687711/// The internal state of an iterator that searches for matches of a substring
@@ -831,14 +855,21 @@ impl TwoWaySearcher {
831855 // How far we can jump when we encounter a mismatch is all based on the fact
832856 // that (u, v) is a critical factorization for the needle.
833857 #[ inline]
834- fn next ( & mut self , haystack : & [ u8 ] , needle : & [ u8 ] , long_period : bool )
835- -> Option < ( usize , usize ) > {
858+ fn next < S > ( & mut self , haystack : & [ u8 ] , needle : & [ u8 ] , long_period : bool )
859+ -> S :: Output
860+ where S : TwoWayStrategy
861+ {
836862 // `next()` uses `self.position` as its cursor
863+ let old_pos = self . position ;
837864 ' search: loop {
838865 // Check that we have room to search in
839- if self . position + needle. len ( ) > haystack. len ( ) {
866+ if needle. len ( ) > haystack. len ( ) - self . position {
840867 self . position = haystack. len ( ) ;
841- return None ;
868+ return S :: rejecting ( old_pos, self . position ) ;
869+ }
870+
871+ if S :: use_early_reject ( ) && old_pos != self . position {
872+ return S :: rejecting ( old_pos, self . position ) ;
842873 }
843874
844875 // Quickly skip by large portions unrelated to our substring
@@ -884,7 +915,7 @@ impl TwoWaySearcher {
884915 self . memory = 0 ; // set to needle.len() - self.period for overlapping matches
885916 }
886917
887- return Some ( ( match_pos, match_pos + needle. len ( ) ) ) ;
918+ return S :: matching ( match_pos, match_pos + needle. len ( ) ) ;
888919 }
889920 }
890921
@@ -902,14 +933,22 @@ impl TwoWaySearcher {
902933 // a reversed haystack with a reversed needle, and the above paragraph shows
903934 // that the precomputed parameters can be left alone.
904935 #[ inline]
905- fn next_back ( & mut self , haystack : & [ u8 ] , needle : & [ u8 ] ) -> Option < ( usize , usize ) > {
936+ fn next_back < S > ( & mut self , haystack : & [ u8 ] , needle : & [ u8 ] )
937+ -> S :: Output
938+ where S : TwoWayStrategy
939+ {
906940 // `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()`
907941 // are independent.
942+ let old_end = self . end ;
908943 ' search: loop {
909944 // Check that we have room to search in
910945 if needle. len ( ) > self . end {
911946 self . end = 0 ;
912- return None ;
947+ return S :: rejecting ( 0 , old_end) ;
948+ }
949+
950+ if S :: use_early_reject ( ) && old_end != self . end {
951+ return S :: rejecting ( self . end , old_end) ;
913952 }
914953
915954 // Quickly skip by large portions unrelated to our substring
@@ -939,7 +978,7 @@ impl TwoWaySearcher {
939978 // Note: sub self.period instead of needle.len() to have overlapping matches
940979 self . end -= needle. len ( ) ;
941980
942- return Some ( ( match_pos, match_pos + needle. len ( ) ) ) ;
981+ return S :: matching ( match_pos, match_pos + needle. len ( ) ) ;
943982 }
944983 }
945984
@@ -987,3 +1026,40 @@ impl TwoWaySearcher {
9871026 ( left. wrapping_add ( 1 ) , period)
9881027 }
9891028}
1029+
/// `TwoWayStrategy` allows the Two-Way algorithm to either skip non-matches as
/// quickly as possible, or to work in a mode where it emits Rejects relatively
/// quickly.
///
/// `TwoWaySearcher::next` and `TwoWaySearcher::next_back` are generic over an
/// implementor of this trait and build their return value only through
/// `rejecting` and `matching`.
trait TwoWayStrategy {
    /// The value the search functions return for this strategy.
    type Output;

    /// Whether the search should return a reject as soon as its cursor has
    /// moved, instead of continuing until the next match or the end of input.
    fn use_early_reject() -> bool;

    /// Builds the output for a rejected interval given as the byte offsets
    /// `(a, b)`.
    // Parameters are named: anonymous trait-method parameters are deprecated
    // and rejected outright from the 2018 edition onwards.
    fn rejecting(a: usize, b: usize) -> Self::Output;

    /// Builds the output for a match spanning the byte offsets `(a, b)`.
    fn matching(a: usize, b: usize) -> Self::Output;
}
1038+
1039+ /// Skip to match intervals as quickly as possible
1040+ enum MatchOnly { }
1041+
1042+ impl TwoWayStrategy for MatchOnly {
1043+ type Output = Option < ( usize , usize ) > ;
1044+
1045+ #[ inline]
1046+ fn use_early_reject ( ) -> bool { false }
1047+ #[ inline]
1048+ fn rejecting ( _a : usize , _b : usize ) -> Self :: Output { None }
1049+ #[ inline]
1050+ fn matching ( a : usize , b : usize ) -> Self :: Output { Some ( ( a, b) ) }
1051+ }
1052+
1053+ /// Emit Rejects regularly
1054+ enum RejectAndMatch { }
1055+
1056+ impl TwoWayStrategy for RejectAndMatch {
1057+ type Output = SearchStep ;
1058+
1059+ #[ inline]
1060+ fn use_early_reject ( ) -> bool { true }
1061+ #[ inline]
1062+ fn rejecting ( a : usize , b : usize ) -> Self :: Output { SearchStep :: Reject ( a, b) }
1063+ #[ inline]
1064+ fn matching ( a : usize , b : usize ) -> Self :: Output { SearchStep :: Match ( a, b) }
1065+ }
0 commit comments