@@ -828,19 +828,194 @@ fileprivate extension Compiler.ByteCodeGen {
828828 }
829829 }
830830
/// Flattens the character-level members of a custom character class.
///
/// Every `.char` atom, every `.scalar` atom, and each character of a
/// top-level `.quotedLiteral` is pulled out of `members`, de-duplicated, and
/// re-appended as an individual atom after all of the remaining members. The
/// returned array therefore contains no top-level `.quotedLiteral`.
///
/// Characters and scalars are appended in first-seen order. A `Set` is used
/// only for de-duplication: iterating a `Set` directly yields an unspecified
/// order that can differ between processes, which would make the flattened
/// members (and anything generated from them) non-reproducible.
///
/// - Parameter members: The members of a custom character class.
/// - Returns: The non-character members in their original relative order,
///   followed by the unique characters, followed by the unique scalars.
func flatteningCustomCharacterClassMembers(
  _ members: [DSLTree.CustomCharacterClass.Member]
) -> [DSLTree.CustomCharacterClass.Member] {
  // The sets answer "have we seen this already?"; the arrays carry the order.
  var seenCharacters: Set<Character> = []
  var seenScalars: Set<UnicodeScalar> = []
  var orderedCharacters: [Character] = []
  var orderedScalars: [UnicodeScalar] = []
  var result: [DSLTree.CustomCharacterClass.Member] = []

  for member in members {
    switch member {
    case .atom(.char(let char)):
      if seenCharacters.insert(char).inserted {
        orderedCharacters.append(char)
      }
    case .atom(.scalar(let scalar)):
      if seenScalars.insert(scalar).inserted {
        orderedScalars.append(scalar)
      }
    case .quotedLiteral(let str):
      // A quoted literal inside a character class contributes each of its
      // characters as a separate member.
      for char in str where seenCharacters.insert(char).inserted {
        orderedCharacters.append(char)
      }
    default:
      // Any other atom, and every non-atom member, passes through unchanged.
      result.append(member)
    }
  }

  result.append(contentsOf: orderedCharacters.map { .atom(.char($0)) })
  result.append(contentsOf: orderedScalars.map { .atom(.scalar($0)) })
  return result
}
859+
/// Returns a copy of `ccc` whose members have been prepared for code
/// generation.
///
/// Coalescing is applied only under grapheme-cluster semantics. Under scalar
/// semantics no scalars should be coalesced into a grapheme, so that e.g.
/// `[e\u{301}-\u{302}]` remains a range between U+301 and U+302. The members
/// are flattened in both modes, and `isInverted` is carried over unchanged.
func coalescingCustomCharacterClass(
  _ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
  var preparedMembers = ccc.members
  if options.semanticLevel == .graphemeCluster {
    preparedMembers = coalescingCustomCharacterClassMembers(preparedMembers)
  }
  let flattened = flatteningCustomCharacterClassMembers(preparedMembers)
  return DSLTree.CustomCharacterClass(
    members: flattened,
    isInverted: ccc.isInverted)
}
843874
/// Emits matching code for a single character that is a member of a custom
/// character class.
///
/// Under grapheme-cluster semantics the character is emitted as a whole.
/// Under scalar semantics the emitted code acts like an alternation of the
/// individual scalars that make up the character.
mutating func emitCharacterInCCC(_ c: Character) {
  switch options.semanticLevel {
  case .unicodeScalar:
    let scalars = c.unicodeScalars
    let matched = builder.makeAddress()

    // Every scalar but the last gets its own save point, so that a failed
    // match resumes at the next alternative.
    for candidate in scalars.dropLast() {
      let nextAlternative = builder.makeAddress()
      builder.buildSave(nextAlternative)
      emitMatchScalar(candidate)
      builder.buildClear()
      builder.buildBranch(to: matched)
      builder.label(nextAlternative)
    }

    // The last scalar has no save point of its own, so its failure is not
    // caught here. A `Character` holds at least one scalar, hence the
    // force-unwrap.
    emitMatchScalar(scalars.last!)
    builder.label(matched)

  case .graphemeCluster:
    emitCharacter(c)
  }
}
895+
/// Emits matching code for one member of a custom character class.
///
/// - `.char` and `.scalar` atoms go through `emitCharacterInCCC`; any other
///   atom goes through `emitAtom`.
/// - Nested classes are emitted with `emitCustomCharacterClass`.
/// - Ranges are matched by a generated consumer.
/// - Trivia emits nothing.
/// - Set operations are emitted by the dedicated helpers below.
///
/// `.quotedLiteral` members are expected to have been removed by
/// `flatteningCustomCharacterClassMembers`; reaching one here traps.
mutating func emitCCCMember(
  _ member: DSLTree.CustomCharacterClass.Member
) throws {
  switch member {
  case .atom(.char(let character)):
    emitCharacterInCCC(character)

  case .atom(.scalar(let scalar)):
    emitCharacterInCCC(Character(scalar))

  case .atom(let otherAtom):
    try emitAtom(otherAtom)

  case .custom(let nested):
    try emitCustomCharacterClass(nested)

  case .quotedLiteral:
    fatalError(" Removed in 'flatteningCustomCharacterClassMembers' ")

  case .range:
    let rangeConsumer = try member.generateConsumer(options)
    builder.buildConsume(by: rangeConsumer)

  case .trivia:
    return

  case .intersection(let lhs, let rhs):
    try emitCCCIntersection(lhs, rhs)

  case .subtraction(let lhs, let rhs):
    try emitCCCSubtraction(lhs, rhs)

  case .symmetricDifference(let lhs, let rhs):
    try emitCCCSymmetricDifference(lhs, rhs)
  }
}

/// Emits code that matches only where both `lhs` and `rhs` match.
///
/// Intersection is trivial, since failure on either side propagates:
///   - store current position
///   - lhs
///   - restore current position
///   - rhs
mutating func emitCCCIntersection(
  _ lhs: DSLTree.CustomCharacterClass,
  _ rhs: DSLTree.CustomCharacterClass
) throws {
  // TODO: Can we decide when it's better to try `rhs` first?
  let start = builder.makePositionRegister()
  builder.buildMoveCurrentPosition(into: start)
  try emitCustomCharacterClass(lhs)
  builder.buildRestorePosition(from: start)
  try emitCustomCharacterClass(rhs)
}

/// Emits code that matches where `lhs` matches and `rhs` does not.
///
/// Failure in `lhs` propagates, while failure in `rhs` is swallowed and
/// reversed:
///   - store current position
///   - lhs
///   - save to end
///   - restore current position
///   - rhs
///   - clear, fail (since both succeeded)
///   - end: ...
mutating func emitCCCSubtraction(
  _ lhs: DSLTree.CustomCharacterClass,
  _ rhs: DSLTree.CustomCharacterClass
) throws {
  // TODO: Can we decide when it's better to try `rhs` first?
  let start = builder.makePositionRegister()
  let resume = builder.makeAddress()

  builder.buildMoveCurrentPosition(into: start)
  // No match here is a failure, and it propagates.
  try emitCustomCharacterClass(lhs)
  builder.buildSave(resume)
  builder.buildRestorePosition(from: start)
  // No match here is a success: execution resumes at `resume`.
  try emitCustomCharacterClass(rhs)
  // Both sides matched: drop the `resume` save point and fail outward.
  builder.buildClear()
  builder.buildFail()
  builder.label(resume)
}

/// Emits code that matches where exactly one of `lhs` and `rhs` matches.
///
/// Symmetric difference always requires executing both sides. Each side is
/// run with its failure ignored and its resulting position stored in a
/// register. If the two results are equal, the match fails; otherwise the
/// position that differs from the starting position is used:
///   - store current position as `start`
///   - save to lhsDone
///   - lhs
///   - clear lhsDone (and continue)
///   - lhsDone: save position as `afterLHS`
///
///   - restore current position
///   - save to rhsDone
///   - rhs
///   - clear rhsDone (and continue)
///   - rhsDone: save position as `afterRHS`
///
///   - restore to resulting position from lhs (`afterLHS`)
///   - if equal to `afterRHS`, goto fail (both sides had same result)
///   - if equal to `start`, goto advance (lhs failed)
///   - goto end
///   - advance: restore to resulting position from rhs (`afterRHS`)
///   - goto end
///   - fail: fail
///   - end: ...
mutating func emitCCCSymmetricDifference(
  _ lhs: DSLTree.CustomCharacterClass,
  _ rhs: DSLTree.CustomCharacterClass
) throws {
  let start = builder.makePositionRegister()
  let afterLHS = builder.makePositionRegister()
  let afterRHS = builder.makePositionRegister()
  let lhsDone = builder.makeAddress()
  let rhsDone = builder.makeAddress()
  let useRHS = builder.makeAddress()
  let bothSame = builder.makeAddress()
  let finished = builder.makeAddress()

  // Run `lhs`, recording where it ended up whether or not it matched.
  builder.buildMoveCurrentPosition(into: start)
  builder.buildSave(lhsDone)
  try emitCustomCharacterClass(lhs)
  builder.buildClear()
  builder.label(lhsDone)
  builder.buildMoveCurrentPosition(into: afterLHS)

  // Run `rhs` from the same starting position, likewise.
  builder.buildRestorePosition(from: start)
  builder.buildSave(rhsDone)
  try emitCustomCharacterClass(rhs)
  builder.buildClear()
  builder.label(rhsDone)
  builder.buildMoveCurrentPosition(into: afterRHS)

  // Equal results from both sides mean failure.
  builder.buildRestorePosition(from: afterLHS)
  builder.buildCondBranch(to: bothSame, ifSamePositionAs: afterRHS)

  // If `lhs` left the position unchanged, take the position from `rhs`.
  builder.buildCondBranch(to: useRHS, ifSamePositionAs: start)
  builder.buildBranch(to: finished)
  builder.label(useRHS)
  builder.buildRestorePosition(from: afterRHS)
  builder.buildBranch(to: finished)

  builder.label(bothSame)
  builder.buildFail()
  builder.label(finished)
}
1018+
8441019 mutating func emitCustomCharacterClass(
8451020 _ ccc: DSLTree . CustomCharacterClass
8461021 ) throws {
@@ -858,8 +1033,93 @@ fileprivate extension Compiler.ByteCodeGen {
8581033 }
8591034 return
8601035 }
861- let consumer = try ccc. generateConsumer ( options)
862- builder. buildConsume ( by: consumer)
1036+
1037+ let updatedCCC : DSLTree . CustomCharacterClass
1038+ if optimizationsEnabled {
1039+ updatedCCC = ccc. coalescingASCIIMembers ( options)
1040+ } else {
1041+ updatedCCC = ccc
1042+ }
1043+ let filteredMembers = updatedCCC. members. filter ( { !$0. isOnlyTrivia} )
1044+
1045+ if updatedCCC. isInverted {
1046+ // inverted
1047+ // custom character class: p0 | p1 | ... | pn
1048+ // Try each member to make sure they all fail
1049+ // save next_p1
1050+ // <code for p0>
1051+ // clear, fail
1052+ // next_p1:
1053+ // save next_p2
1054+ // <code for p1>
1055+ // clear fail
1056+ // next_p2:
1057+ // save next_p...
1058+ // <code for p2>
1059+ // clear fail
1060+ // ...
1061+ // next_pn:
1062+ // save done
1063+ // <code for pn>
1064+ // clear fail
1065+ // done:
1066+ // step forward by 1
1067+ let done = builder. makeAddress ( )
1068+ for member in filteredMembers. dropLast ( ) {
1069+ let next = builder. makeAddress ( )
1070+ builder. buildSave ( next)
1071+ try emitCCCMember ( member)
1072+ builder. buildClear ( )
1073+ builder. buildFail ( )
1074+ builder. label ( next)
1075+ }
1076+ builder. buildSave ( done)
1077+ try emitCCCMember ( filteredMembers. last!)
1078+ builder. buildClear ( )
1079+ builder. buildFail ( )
1080+ builder. label ( done)
1081+
1082+ // Consume a single unit for the inverted ccc
1083+ switch options. semanticLevel {
1084+ case . graphemeCluster:
1085+ builder. buildAdvance ( 1 )
1086+ case . unicodeScalar:
1087+ builder. buildAdvanceUnicodeScalar ( 1 )
1088+ }
1089+ return
1090+ }
1091+ // non inverted CCC
1092+ // Custom character class: p0 | p1 | ... | pn
1093+ // Very similar to alternation, but we don't keep backtracking save points
1094+ // save next_p1
1095+ // <code for p0>
1096+ // clear
1097+ // branch done
1098+ // next_p1:
1099+ // save next_p2
1100+ // <code for p1>
1101+ // clear
1102+ // branch done
1103+ // next_p2:
1104+ // save next_p...
1105+ // <code for p2>
1106+ // clear
1107+ // branch done
1108+ // ...
1109+ // next_pn:
1110+ // <code for pn>
1111+ // done:
1112+ let done = builder. makeAddress ( )
1113+ for member in filteredMembers. dropLast ( ) {
1114+ let next = builder. makeAddress ( )
1115+ builder. buildSave ( next)
1116+ try emitCCCMember ( member)
1117+ builder. buildClear ( )
1118+ builder. buildBranch ( to: done)
1119+ builder. label ( next)
1120+ }
1121+ try emitCCCMember ( filteredMembers. last!)
1122+ builder. label ( done)
8631123 }
8641124
8651125 mutating func emitConcatenation( _ children: [ DSLTree . Node ] ) throws {
@@ -996,6 +1256,12 @@ fileprivate extension Compiler.ByteCodeGen {
9961256}
9971257
9981258extension DSLTree . Node {
1259+ /// A Boolean value indicating whether this node advances the match position
1260+ /// on a successful match.
1261+ ///
1262+ /// For example, an alternation like `(a|b|c)` always advances the position
1263+ /// by a character, but `(a|b|)` has an empty branch, which matches without
1264+ /// advancing.
9991265 var guaranteesForwardProgress : Bool {
10001266 switch self {
10011267 case . orderedChoice( let children) :
@@ -1026,12 +1292,34 @@ extension DSLTree.Node {
10261292 case . consumer, . matcher:
10271293 // Allow zero width consumers and matchers
10281294 return false
1029- case . customCharacterClass:
1030- return true
1295+ case . customCharacterClass( let ccc ) :
1296+ return ccc . guaranteesForwardProgress
10311297 case . quantification( let amount, _, let child) :
10321298 let ( atLeast, _) = amount. ast. bounds
10331299 return atLeast ?? 0 > 0 && child. guaranteesForwardProgress
10341300 default : return false
10351301 }
10361302 }
10371303}
1304+
extension DSLTree.CustomCharacterClass {
  /// A Boolean value indicating whether this character class is known to
  /// advance the match position on a successful match.
  ///
  /// We allow trivia into CustomCharacterClass, which could result in a CCC
  /// that matches nothing, ie `(?x)[ ]`. Such a class reports `false`.
  ///
  /// The members of a class are alternatives, so the guarantee holds only if
  /// there is at least one non-trivia member and *every* non-trivia member
  /// upholds it:
  /// - a nested class must itself guarantee forward progress (so `[[ ]]`
  ///   agrees with `[ ]`);
  /// - an intersection or symmetric difference requires both operands to
  ///   guarantee it;
  /// - a subtraction requires its left-hand operand to guarantee it;
  /// - every other non-trivia member is treated as guaranteeing it.
  var guaranteesForwardProgress: Bool {
    var sawNonTriviaMember = false
    for member in members {
      switch member {
      case .trivia:
        continue
      case let .custom(nested):
        guard nested.guaranteesForwardProgress else { return false }
      case let .intersection(lhs, rhs),
           let .symmetricDifference(lhs, rhs):
        guard lhs.guaranteesForwardProgress,
              rhs.guaranteesForwardProgress
        else { return false }
      case let .subtraction(lhs, _):
        guard lhs.guaranteesForwardProgress else { return false }
      default:
        break
      }
      sawNonTriviaMember = true
    }
    // A class with nothing but trivia gives no guarantee.
    return sawNonTriviaMember
  }
}
0 commit comments