1515#![ doc( primitive = "str" ) ]
1616#![ stable( feature = "rust1" , since = "1.0.0" ) ]
1717
18- use self :: OldSearcher :: { TwoWay , TwoWayLong } ;
1918use self :: pattern:: Pattern ;
2019use self :: pattern:: { Searcher , ReverseSearcher , DoubleEndedSearcher } ;
2120
2221use char:: CharExt ;
2322use clone:: Clone ;
24- use cmp:: { self , Eq } ;
23+ use cmp:: Eq ;
2524use convert:: AsRef ;
2625use default:: Default ;
2726use fmt;
@@ -33,7 +32,6 @@ use option::Option::{self, None, Some};
3332use raw:: { Repr , Slice } ;
3433use result:: Result :: { self , Ok , Err } ;
3534use slice:: { self , SliceExt } ;
36- use usize;
3735
3836pub mod pattern;
3937
@@ -870,301 +868,6 @@ impl<'a> DoubleEndedIterator for LinesAny<'a> {
870868 }
871869}
872870
873- /// The internal state of an iterator that searches for matches of a substring
874- /// within a larger string using two-way search
875- #[ derive( Clone ) ]
876- struct TwoWaySearcher {
877- // constants
878- crit_pos : usize ,
879- period : usize ,
880- byteset : u64 ,
881-
882- // variables
883- position : usize ,
884- memory : usize
885- }
886-
887- /*
888- This is the Two-Way search algorithm, which was introduced in the paper:
889- Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675.
890-
891- Here's some background information.
892-
893- A *word* is a string of symbols. The *length* of a word should be a familiar
894- notion, and here we denote it for any word x by |x|.
895- (We also allow for the possibility of the *empty word*, a word of length zero).
896-
897- If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a
898- *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p].
899- For example, both 1 and 2 are periods for the string "aa". As another example,
900- the only period of the string "abcd" is 4.
901-
902- We denote by period(x) the *smallest* period of x (provided that x is non-empty).
903- This is always well-defined since every non-empty word x has at least one period,
904- |x|. We sometimes call this *the period* of x.
905-
906- If u, v and x are words such that x = uv, where uv is the concatenation of u and
907- v, then we say that (u, v) is a *factorization* of x.
908-
909- Let (u, v) be a factorization for a word x. Then if w is a non-empty word such
910- that both of the following hold
911-
912- - either w is a suffix of u or u is a suffix of w
913- - either w is a prefix of v or v is a prefix of w
914-
915- then w is said to be a *repetition* for the factorization (u, v).
916-
917- Just to unpack this, there are four possibilities here. Let w = "abc". Then we
918- might have:
919-
920- - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde")
921- - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab")
922- - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi")
923- - u is a suffix of w and v is a prefix of w. ex: ("bc", "a")
924-
925- Note that the word vu is a repetition for any factorization (u,v) of x = uv,
926- so every factorization has at least one repetition.
927-
928- If x is a string and (u, v) is a factorization for x, then a *local period* for
929- (u, v) is an integer r such that there is some word w such that |w| = r and w is
930- a repetition for (u, v).
931-
932- We denote by local_period(u, v) the smallest local period of (u, v). We sometimes
933- call this *the local period* of (u, v). Provided that x = uv is non-empty, this
934- is well-defined (because every factorization has at least one repetition, as
935- noted above).
936-
937- It can be proven that the following is an equivalent definition of a local period
938- for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for
939- all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are
940- defined. (i.e. i >= 0 and i + r < |x|).
941-
942- Using the above reformulation, it is easy to prove that
943-
944- 1 <= local_period(u, v) <= period(uv)
945-
946- A factorization (u, v) of x such that local_period(u,v) = period(x) is called a
947- *critical factorization*.
948-
949- The algorithm hinges on the following theorem, which is stated without proof:
950-
951- **Critical Factorization Theorem** Any word x has at least one critical
952- factorization (u, v) such that |u| < period(x).
953-
954- The purpose of maximal_suffix is to find such a critical factorization.
955-
956- */
957- impl TwoWaySearcher {
958- #[ allow( dead_code) ]
959- fn new ( needle : & [ u8 ] ) -> TwoWaySearcher {
960- let ( crit_pos_false, period_false) = TwoWaySearcher :: maximal_suffix ( needle, false ) ;
961- let ( crit_pos_true, period_true) = TwoWaySearcher :: maximal_suffix ( needle, true ) ;
962-
963- let ( crit_pos, period) =
964- if crit_pos_false > crit_pos_true {
965- ( crit_pos_false, period_false)
966- } else {
967- ( crit_pos_true, period_true)
968- } ;
969-
970- // This isn't in the original algorithm, as far as I'm aware.
971- let byteset = needle. iter ( )
972- . fold ( 0 , |a, & b| ( 1 << ( ( b & 0x3f ) as usize ) ) | a) ;
973-
974- // A particularly readable explanation of what's going on here can be found
975- // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically
976- // see the code for "Algorithm CP" on p. 323.
977- //
978- // What's going on is we have some critical factorization (u, v) of the
979- // needle, and we want to determine whether u is a suffix of
980- // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use
981- // "Algorithm CP2", which is optimized for when the period of the needle
982- // is large.
983- if & needle[ ..crit_pos] == & needle[ period.. period + crit_pos] {
984- TwoWaySearcher {
985- crit_pos : crit_pos,
986- period : period,
987- byteset : byteset,
988-
989- position : 0 ,
990- memory : 0
991- }
992- } else {
993- TwoWaySearcher {
994- crit_pos : crit_pos,
995- period : cmp:: max ( crit_pos, needle. len ( ) - crit_pos) + 1 ,
996- byteset : byteset,
997-
998- position : 0 ,
999- memory : usize:: MAX // Dummy value to signify that the period is long
1000- }
1001- }
1002- }
1003-
1004- // One of the main ideas of Two-Way is that we factorize the needle into
1005- // two halves, (u, v), and begin trying to find v in the haystack by scanning
1006- // left to right. If v matches, we try to match u by scanning right to left.
1007- // How far we can jump when we encounter a mismatch is all based on the fact
1008- // that (u, v) is a critical factorization for the needle.
1009- #[ inline]
1010- fn next ( & mut self , haystack : & [ u8 ] , needle : & [ u8 ] , long_period : bool )
1011- -> Option < ( usize , usize ) > {
1012- ' search: loop {
1013- // Check that we have room to search in
1014- if self . position + needle. len ( ) > haystack. len ( ) {
1015- return None ;
1016- }
1017-
1018- // Quickly skip by large portions unrelated to our substring
1019- if ( self . byteset >>
1020- ( ( haystack[ self . position + needle. len ( ) - 1 ] & 0x3f )
1021- as usize ) ) & 1 == 0 {
1022- self . position += needle. len ( ) ;
1023- if !long_period {
1024- self . memory = 0 ;
1025- }
1026- continue ' search;
1027- }
1028-
1029- // See if the right part of the needle matches
1030- let start = if long_period { self . crit_pos }
1031- else { cmp:: max ( self . crit_pos , self . memory ) } ;
1032- for i in start..needle. len ( ) {
1033- if needle[ i] != haystack[ self . position + i] {
1034- self . position += i - self . crit_pos + 1 ;
1035- if !long_period {
1036- self . memory = 0 ;
1037- }
1038- continue ' search;
1039- }
1040- }
1041-
1042- // See if the left part of the needle matches
1043- let start = if long_period { 0 } else { self . memory } ;
1044- for i in ( start..self . crit_pos ) . rev ( ) {
1045- if needle[ i] != haystack[ self . position + i] {
1046- self . position += self . period ;
1047- if !long_period {
1048- self . memory = needle. len ( ) - self . period ;
1049- }
1050- continue ' search;
1051- }
1052- }
1053-
1054- // We have found a match!
1055- let match_pos = self . position ;
1056- self . position += needle. len ( ) ; // add self.period for all matches
1057- if !long_period {
1058- self . memory = 0 ; // set to needle.len() - self.period for all matches
1059- }
1060- return Some ( ( match_pos, match_pos + needle. len ( ) ) ) ;
1061- }
1062- }
1063-
1064- // Computes a critical factorization (u, v) of `arr`.
1065- // Specifically, returns (i, p), where i is the starting index of v in some
1066- // critical factorization (u, v) and p = period(v)
1067- #[ inline]
1068- #[ allow( dead_code) ]
1069- #[ allow( deprecated) ]
1070- fn maximal_suffix ( arr : & [ u8 ] , reversed : bool ) -> ( usize , usize ) {
1071- let mut left: usize = !0 ; // Corresponds to i in the paper
1072- let mut right = 0 ; // Corresponds to j in the paper
1073- let mut offset = 1 ; // Corresponds to k in the paper
1074- let mut period = 1 ; // Corresponds to p in the paper
1075-
1076- while right + offset < arr. len ( ) {
1077- let a;
1078- let b;
1079- if reversed {
1080- a = arr[ left. wrapping_add ( offset) ] ;
1081- b = arr[ right + offset] ;
1082- } else {
1083- a = arr[ right + offset] ;
1084- b = arr[ left. wrapping_add ( offset) ] ;
1085- }
1086- if a < b {
1087- // Suffix is smaller, period is entire prefix so far.
1088- right += offset;
1089- offset = 1 ;
1090- period = right. wrapping_sub ( left) ;
1091- } else if a == b {
1092- // Advance through repetition of the current period.
1093- if offset == period {
1094- right += offset;
1095- offset = 1 ;
1096- } else {
1097- offset += 1 ;
1098- }
1099- } else {
1100- // Suffix is larger, start over from current location.
1101- left = right;
1102- right += 1 ;
1103- offset = 1 ;
1104- period = 1 ;
1105- }
1106- }
1107- ( left. wrapping_add ( 1 ) , period)
1108- }
1109- }
1110-
1111- /// The internal state of an iterator that searches for matches of a substring
1112- /// within a larger string using a dynamically chosen search algorithm
1113- #[ derive( Clone ) ]
1114- // NB: This is kept around for convenience because
1115- // it is planned to be used again in the future
1116- enum OldSearcher {
1117- TwoWay ( TwoWaySearcher ) ,
1118- TwoWayLong ( TwoWaySearcher ) ,
1119- }
1120-
1121- impl OldSearcher {
1122- #[ allow( dead_code) ]
1123- fn new ( haystack : & [ u8 ] , needle : & [ u8 ] ) -> OldSearcher {
1124- if needle. is_empty ( ) {
1125- // Handle specially
1126- unimplemented ! ( )
1127- // FIXME: Tune this.
1128- // FIXME(#16715): This unsigned integer addition will probably not
1129- // overflow because that would mean that the memory almost solely
1130- // consists of the needle. Needs #16715 to be formally fixed.
1131- } else if needle. len ( ) + 20 > haystack. len ( ) {
1132- // Use naive searcher
1133- unimplemented ! ( )
1134- } else {
1135- let searcher = TwoWaySearcher :: new ( needle) ;
1136- if searcher. memory == usize:: MAX { // If the period is long
1137- TwoWayLong ( searcher)
1138- } else {
1139- TwoWay ( searcher)
1140- }
1141- }
1142- }
1143- }
1144-
1145- #[ derive( Clone ) ]
1146- // NB: This is kept around for convenience because
1147- // it is planned to be used again in the future
1148- struct OldMatchIndices < ' a , ' b > {
1149- // constants
1150- haystack : & ' a str ,
1151- needle : & ' b str ,
1152- searcher : OldSearcher
1153- }
1154-
1155- impl < ' a , ' b > OldMatchIndices < ' a , ' b > {
1156- #[ inline]
1157- #[ allow( dead_code) ]
1158- fn next ( & mut self ) -> Option < ( usize , usize ) > {
1159- match self . searcher {
1160- TwoWay ( ref mut searcher)
1161- => searcher. next ( self . haystack . as_bytes ( ) , self . needle . as_bytes ( ) , false ) ,
1162- TwoWayLong ( ref mut searcher)
1163- => searcher. next ( self . haystack . as_bytes ( ) , self . needle . as_bytes ( ) , true ) ,
1164- }
1165- }
1166- }
1167-
1168871/*
1169872Section: Comparing strings
1170873*/
0 commit comments