@@ -243,9 +243,11 @@ fileprivate extension Compiler.ByteCodeGen {
243243 }
244244 }
245245
/// Emits a generic alternation over `elements`, using `body` to generate the
/// code for each individual alternative.
///
/// - Parameters:
///   - elements: The alternatives, emitted in order. If empty, a single
///     `fail` is emitted, since an alternation with no alternatives can
///     never match.
///   - withBacktracking: When `false`, a `clear` is emitted after the code
///     of every non-final alternative, before the branch to `done`.
///   - body: Emits the code for one alternative into the given generator.
/// - Throws: Rethrows any error thrown by `body`.
mutating func emitAlternationGen<C: BidirectionalCollection>(
  _ elements: C,
  withBacktracking: Bool,
  _ body: (inout Compiler.ByteCodeGen, C.Element) throws -> Void
) rethrows {
  // Alternation: p0 | p1 | ... | pn
  //     save next_p1
  //     <code for p0>
  //     clear              (only when !withBacktracking)
  //     branch done
  //   next_p1:
  //     save next_p2
  //     <code for p1>
  //     clear              (only when !withBacktracking)
  //     branch done
  //   ...
  //   next_pn:
  //     <code for pn>
  //   done:
  guard let finalElement = elements.last else {
    // No alternatives at all: nothing can match. Emitting a failure avoids
    // trapping on a force-unwrap of `elements.last`.
    builder.buildFail()
    return
  }

  let done = builder.makeAddress()
  for element in elements.dropLast() {
    let next = builder.makeAddress()
    builder.buildSave(next)
    try body(&self, element)
    if !withBacktracking {
      builder.buildClear()
    }
    builder.buildBranch(to: done)
    builder.label(next)
  }
  // The final alternative needs no save point; its failure simply propagates.
  try body(&self, finalElement)
  builder.label(done)
}
281+
/// Emits an ordered alternation over `children`, generating each child with
/// `emitNode` and requesting backtracking from `emitAlternationGen`.
mutating func emitAlternation(
  _ children: [DSLTree.Node]
) throws {
  try emitAlternationGen(
    children,
    withBacktracking: true
  ) { generator, child in
    try generator.emitNode(child)
  }
}
276289
277290 mutating func emitConcatenationComponent(
278291 _ node: DSLTree . Node
@@ -828,19 +841,187 @@ fileprivate extension Compiler.ByteCodeGen {
828841 }
829842 }
830843
/// Flatten quoted strings into sequences of atoms, so that the standard
/// CCC codegen will handle them.
///
/// Character atoms (including the characters of quoted literals) and scalar
/// atoms are de-duplicated and moved to the end of the result: first every
/// other member in its original order, then the characters, then the scalars.
/// Characters and scalars keep the order in which they were first seen, so
/// the result does not depend on `Set` iteration order and is the same on
/// every run.
///
/// - Parameter members: The members of a custom character class.
/// - Returns: The flattened members, containing no `.quotedLiteral`.
func flatteningCustomCharacterClassMembers(
  _ members: [DSLTree.CustomCharacterClass.Member]
) -> [DSLTree.CustomCharacterClass.Member] {
  // The sets answer "already seen?"; the arrays record first-seen order.
  var seenCharacters: Set<Character> = []
  var seenScalars: Set<UnicodeScalar> = []
  var characters: [Character] = []
  var scalars: [UnicodeScalar] = []
  var result: [DSLTree.CustomCharacterClass.Member] = []

  for member in members {
    switch member {
    case .atom(.char(let char)):
      if seenCharacters.insert(char).inserted {
        characters.append(char)
      }
    case .atom(.scalar(let scalar)):
      if seenScalars.insert(scalar).inserted {
        scalars.append(scalar)
      }
    case .quotedLiteral(let str):
      for char in str {
        if seenCharacters.insert(char).inserted {
          characters.append(char)
        }
      }
    default:
      // Any other atom or member kind is passed through untouched.
      result.append(member)
    }
  }

  result.append(contentsOf: characters.map { .atom(.char($0)) })
  result.append(contentsOf: scalars.map { .atom(.scalar($0)) })
  return result
}
873+
/// Returns a copy of `ccc` whose members have been coalesced (in grapheme
/// cluster mode only) and then flattened, keeping the inversion flag.
func coalescingCustomCharacterClass(
  _ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
  // Coalescing only needs to be done in grapheme semantic mode. In scalar
  // semantic mode, we don't want to coalesce any scalars into a grapheme.
  // This means that e.g `[e\u{301}-\u{302}]` remains a range between U+301
  // and U+302.
  let coalesced: [DSLTree.CustomCharacterClass.Member]
  if options.semanticLevel == .graphemeCluster {
    coalesced = coalescingCustomCharacterClassMembers(ccc.members)
  } else {
    coalesced = ccc.members
  }

  // Flattening is applied regardless of the semantic level.
  let flattened = flatteningCustomCharacterClassMembers(coalesced)
  return .init(members: flattened, isInverted: ccc.isInverted)
}
843888
/// Emits the code that matches the character `c` as a member of a custom
/// character class, according to the current semantic level.
mutating func emitCharacterInCCC(_ c: Character) {
  switch options.semanticLevel {
  case .unicodeScalar:
    // When in scalar mode, act like an alternation of the individual scalars
    // that comprise a character.
    let scalars = c.unicodeScalars
    emitAlternationGen(scalars, withBacktracking: false) { generator, scalar in
      generator.emitMatchScalar(scalar)
    }
  case .graphemeCluster:
    emitCharacter(c)
  }
}
901+
/// Emits the matching code for a single member of a custom character class.
///
/// Characters and scalars go through `emitCharacterInCCC`, other atoms
/// through `emitAtom`, nested classes through `emitCustomCharacterClass`,
/// and ranges through a generated consumer. Trivia emits nothing. The set
/// operations are delegated to dedicated helpers below.
mutating func emitCCCMember(
  _ member: DSLTree.CustomCharacterClass.Member
) throws {
  switch member {
  case .atom(let atom):
    switch atom {
    case .char(let character):
      emitCharacterInCCC(character)
    case .scalar(let scalar):
      emitCharacterInCCC(Character(scalar))
    default:
      try emitAtom(atom)
    }
  case .custom(let nested):
    try emitCustomCharacterClass(nested)
  case .quotedLiteral:
    fatalError("Removed in 'flatteningCustomCharacterClassMembers'")
  case .range:
    try builder.buildConsume(by: member.generateConsumer(options))
  case .trivia:
    return
  case .intersection(let lhs, let rhs):
    try emitCCCIntersection(lhs, rhs)
  case .subtraction(let lhs, let rhs):
    try emitCCCSubtraction(lhs, rhs)
  case .symmetricDifference(let lhs, let rhs):
    try emitCCCSymmetricDifference(lhs, rhs)
  }
}

/// Emits `lhs && rhs`: both sides are matched from the same start position.
mutating func emitCCCIntersection(
  _ lhs: DSLTree.CustomCharacterClass,
  _ rhs: DSLTree.CustomCharacterClass
) throws {
  // TODO: Can we decide when it's better to try `rhs` first?
  // Intersection is trivial, since failure on either side propagates:
  //   - store current position
  //   - lhs
  //   - restore current position
  //   - rhs
  let start = builder.makePositionRegister()
  builder.buildMoveCurrentPosition(into: start)
  try emitCustomCharacterClass(lhs)
  builder.buildRestorePosition(from: start)
  try emitCustomCharacterClass(rhs)
}

/// Emits `lhs -- rhs`: `lhs` must match and `rhs` must not.
mutating func emitCCCSubtraction(
  _ lhs: DSLTree.CustomCharacterClass,
  _ rhs: DSLTree.CustomCharacterClass
) throws {
  // TODO: Can we decide when it's better to try `rhs` first?
  // For subtraction, failure in `lhs` propagates, while failure in `rhs` is
  // swallowed/reversed:
  //   - store current position
  //   - lhs
  //   - save to end
  //   - restore current position
  //   - rhs
  //   - clear, fail (since both succeeded)
  //   - end: ...
  let start = builder.makePositionRegister()
  let end = builder.makeAddress()
  builder.buildMoveCurrentPosition(into: start)
  // No match in `lhs` is a failure, and it propagates.
  try emitCustomCharacterClass(lhs)
  builder.buildSave(end)
  builder.buildRestorePosition(from: start)
  // No match in `rhs` is a success, which resumes at `end`.
  try emitCustomCharacterClass(rhs)
  // Both sides matched: clear the `end` save point and fail outward.
  builder.buildClear()
  builder.buildFail()
  builder.label(end)
}

/// Emits `lhs ~~ rhs`: succeeds when the two sides end at different
/// positions, i.e. their results differ.
mutating func emitCCCSymmetricDifference(
  _ lhs: DSLTree.CustomCharacterClass,
  _ rhs: DSLTree.CustomCharacterClass
) throws {
  // Symmetric difference always requires executing both `rhs` and `lhs`.
  // Execute each, ignoring failure and storing the resulting position in a
  // register. If those results are equal, fail. If they're different, use
  // the position that is different from the starting position:
  //   - store current position as startPos
  //   - save to lhsFail
  //   - lhs
  //   - clear lhsFail (and continue)
  //   - lhsFail: save position as lhsPos
  //
  //   - restore current position
  //   - save to rhsFail
  //   - rhs
  //   - clear rhsFail (and continue)
  //   - rhsFail: save position as rhsPos
  //
  //   - restore to resulting position from lhs (lhsPos)
  //   - if equal to rhsPos, goto fail (both sides had same result)
  //   - if equal to startPos, goto advance (lhs failed)
  //   - goto end
  //   - advance: restore to resulting position from rhs (rhsPos)
  //   - goto end
  //   - fail: fail
  //   - end: ...

  // Registers and addresses are requested in a fixed order so that the
  // numbering handed out by the builder is unchanged.
  let startPos = builder.makePositionRegister()
  let lhsPos = builder.makePositionRegister()
  let rhsPos = builder.makePositionRegister()
  let lhsFail = builder.makeAddress()
  let rhsFail = builder.makeAddress()
  let advance = builder.makeAddress()
  let fail = builder.makeAddress()
  let end = builder.makeAddress()

  // Run `lhs`, recording where it ends up.
  builder.buildMoveCurrentPosition(into: startPos)
  builder.buildSave(lhsFail)
  try emitCustomCharacterClass(lhs)
  builder.buildClear()
  builder.label(lhsFail)
  builder.buildMoveCurrentPosition(into: lhsPos)

  // Run `rhs` from the original position, recording where it ends up.
  builder.buildRestorePosition(from: startPos)
  builder.buildSave(rhsFail)
  try emitCustomCharacterClass(rhs)
  builder.buildClear()
  builder.label(rhsFail)
  builder.buildMoveCurrentPosition(into: rhsPos)

  // If lhsPos == rhsPos, then fail.
  builder.buildRestorePosition(from: lhsPos)
  builder.buildCondBranch(to: fail, ifSamePositionAs: rhsPos)

  // If lhsPos == startPos, then move to rhsPos before ending.
  builder.buildCondBranch(to: advance, ifSamePositionAs: startPos)
  builder.buildBranch(to: end)
  builder.label(advance)
  builder.buildRestorePosition(from: rhsPos)
  builder.buildBranch(to: end)

  builder.label(fail)
  builder.buildFail()
  builder.label(end)
}
1024+
8441025 mutating func emitCustomCharacterClass(
8451026 _ ccc: DSLTree . CustomCharacterClass
8461027 ) throws {
@@ -858,8 +1039,67 @@ fileprivate extension Compiler.ByteCodeGen {
8581039 }
8591040 return
8601041 }
861- let consumer = try ccc. generateConsumer ( options)
862- builder. buildConsume ( by: consumer)
1042+
1043+ let updatedCCC : DSLTree . CustomCharacterClass
1044+ if optimizationsEnabled {
1045+ updatedCCC = ccc. coalescingASCIIMembers ( options)
1046+ } else {
1047+ updatedCCC = ccc
1048+ }
1049+ let filteredMembers = updatedCCC. members. filter ( { !$0. isOnlyTrivia} )
1050+
1051+ if updatedCCC. isInverted {
1052+ // inverted
1053+ // custom character class: p0 | p1 | ... | pn
1054+ // Try each member to make sure they all fail
1055+ // save next_p1
1056+ // <code for p0>
1057+ // clear, fail
1058+ // next_p1:
1059+ // save next_p2
1060+ // <code for p1>
1061+ // clear fail
1062+ // next_p2:
1063+ // save next_p...
1064+ // <code for p2>
1065+ // clear fail
1066+ // ...
1067+ // next_pn:
1068+ // save done
1069+ // <code for pn>
1070+ // clear fail
1071+ // done:
1072+ // step forward by 1
1073+ let done = builder. makeAddress ( )
1074+ for member in filteredMembers. dropLast ( ) {
1075+ let next = builder. makeAddress ( )
1076+ builder. buildSave ( next)
1077+ try emitCCCMember ( member)
1078+ builder. buildClear ( )
1079+ builder. buildFail ( )
1080+ builder. label ( next)
1081+ }
1082+ builder. buildSave ( done)
1083+ try emitCCCMember ( filteredMembers. last!)
1084+ builder. buildClear ( )
1085+ builder. buildFail ( )
1086+ builder. label ( done)
1087+
1088+ // Consume a single unit for the inverted ccc
1089+ switch options. semanticLevel {
1090+ case . graphemeCluster:
1091+ builder. buildAdvance ( 1 )
1092+ case . unicodeScalar:
1093+ builder. buildAdvanceUnicodeScalar ( 1 )
1094+ }
1095+ return
1096+ }
1097+ // non inverted CCC
1098+ // Custom character class: p0 | p1 | ... | pn
1099+ // Very similar to alternation, but we don't keep backtracking save points
1100+ try emitAlternationGen ( filteredMembers, withBacktracking: false ) {
1101+ try $0. emitCCCMember ( $1)
1102+ }
8631103 }
8641104
8651105 mutating func emitConcatenation( _ children: [ DSLTree . Node ] ) throws {
@@ -996,6 +1236,12 @@ fileprivate extension Compiler.ByteCodeGen {
9961236}
9971237
9981238extension DSLTree . Node {
1239+ /// A Boolean value indicating whether this node advances the match position
1240+ /// on a successful match.
1241+ ///
1242+ /// For example, an alternation like `(a|b|c)` always advances the position
1243+ /// by a character, but `(a|b|)` has an empty branch, which matches without
1244+ /// advancing.
9991245 var guaranteesForwardProgress : Bool {
10001246 switch self {
10011247 case . orderedChoice( let children) :
@@ -1026,12 +1272,34 @@ extension DSLTree.Node {
10261272 case . consumer, . matcher:
10271273 // Allow zero width consumers and matchers
10281274 return false
1029- case . customCharacterClass:
1030- return true
1275+ case . customCharacterClass( let ccc ) :
1276+ return ccc . guaranteesForwardProgress
10311277 case . quantification( let amount, _, let child) :
10321278 let ( atLeast, _) = amount. ast. bounds
10331279 return atLeast ?? 0 > 0 && child. guaranteesForwardProgress
10341280 default : return false
10351281 }
10361282 }
10371283}
1284+
extension DSLTree.CustomCharacterClass {
  /// A Boolean value indicating whether this character class is treated as
  /// advancing the match position on a successful match.
  ///
  /// We allow trivia into CustomCharacterClass, which could result in a CCC
  /// that matches nothing, ie `(?x)[ ]`; such a class reports `false`.
  /// Otherwise the answer is decided by the first non-trivia member alone.
  var guaranteesForwardProgress: Bool {
    let firstNonTrivia = members.first(where: { member in
      if case .trivia = member { return false }
      return true
    })
    guard let deciding = firstNonTrivia else {
      // Empty, or nothing but trivia.
      return false
    }

    switch deciding {
    case .intersection(let lhs, let rhs),
         .symmetricDifference(let lhs, let rhs):
      return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress
    case .subtraction(let lhs, _):
      return lhs.guaranteesForwardProgress
    default:
      return true
    }
  }
}
0 commit comments