@@ -256,9 +256,11 @@ fileprivate extension Compiler.ByteCodeGen {
256256 }
257257 }
258258
259- mutating func emitAlternation(
260- _ children: [ DSLTree . Node ]
261- ) throws {
259+ mutating func emitAlternationGen< C: BidirectionalCollection > (
260+ _ elements: C ,
261+ withBacktracking: Bool ,
262+ _ body: ( inout Compiler . ByteCodeGen , C . Element ) throws -> Void
263+ ) rethrows {
262264 // Alternation: p0 | p1 | ... | pn
263265 // save next_p1
264266 // <code for p0>
@@ -276,16 +278,27 @@ fileprivate extension Compiler.ByteCodeGen {
276278 // <code for pn>
277279 // done:
278280 let done = builder. makeAddress ( )
279- for component in children . dropLast ( ) {
281+ for element in elements . dropLast ( ) {
280282 let next = builder. makeAddress ( )
281283 builder. buildSave ( next)
282- try emitNode ( component)
284+ try body ( & self , element)
285+ if !withBacktracking {
286+ builder. buildClear ( )
287+ }
283288 builder. buildBranch ( to: done)
284289 builder. label ( next)
285290 }
286- try emitNode ( children . last!)
291+ try body ( & self , elements . last!)
287292 builder. label ( done)
288293 }
294+
/// Emits code for an alternation of DSL nodes.
///
/// Delegates to `emitAlternationGen`, generating every alternative with
/// `emitNode` and requesting that backtracking be kept
/// (`withBacktracking: true`).
///
/// - Parameter children: The alternatives, in the order they are tried.
/// - Throws: Any error thrown while emitting one of the alternatives.
mutating func emitAlternation(
  _ children: [DSLTree.Node]
) throws {
  try emitAlternationGen(
    children,
    withBacktracking: true
  ) { generator, alternative in
    try generator.emitNode(alternative)
  }
}
289302
290303 mutating func emitConcatenationComponent(
291304 _ node: DSLTree . Node
@@ -846,19 +859,187 @@ fileprivate extension Compiler.ByteCodeGen {
846859 }
847860 }
848861
/// Flatten quoted strings into sequences of atoms, so that the standard
/// CCC codegen will handle them.
///
/// Character atoms, scalar atoms, and the characters of quoted literals are
/// de-duplicated and moved after all other members. When a value occurs more
/// than once, the first occurrence is kept, and the surviving values are
/// emitted in the order in which they were first seen. This makes the
/// resulting member list identical on every run.
///
/// - Parameter members: The members of a custom character class.
/// - Returns: All members that are neither character atoms, scalar atoms, nor
///   quoted literals (in their original relative order), followed by the
///   unique characters, followed by the unique scalars.
func flatteningCustomCharacterClassMembers(
  _ members: [DSLTree.CustomCharacterClass.Member]
) -> [DSLTree.CustomCharacterClass.Member] {
  // The sets are used purely for membership tests. The arrays record
  // first-seen order, because `Set` iteration order is unspecified and can
  // differ from one process launch to the next.
  var seenCharacters: Set<Character> = []
  var seenScalars: Set<UnicodeScalar> = []
  var characters: [Character] = []
  var scalars: [UnicodeScalar] = []
  var result: [DSLTree.CustomCharacterClass.Member] = []

  for member in members {
    switch member {
    case .atom(.char(let char)):
      if seenCharacters.insert(char).inserted {
        characters.append(char)
      }
    case .atom(.scalar(let scalar)):
      if seenScalars.insert(scalar).inserted {
        scalars.append(scalar)
      }
    case .quotedLiteral(let str):
      // A quoted literal contributes each of its characters individually.
      for char in str where seenCharacters.insert(char).inserted {
        characters.append(char)
      }
    default:
      // Every other kind of member (including non-character atoms) is passed
      // through untouched.
      result.append(member)
    }
  }

  result.append(contentsOf: characters.map { .atom(.char($0)) })
  result.append(contentsOf: scalars.map { .atom(.scalar($0)) })
  return result
}
891+
/// Returns a copy of `ccc` whose members have been prepared for codegen.
///
/// Coalescing is applied only in grapheme semantic mode. In scalar semantic
/// mode we don't want to coalesce any scalars into a grapheme, which means
/// that e.g. `[e\u{301}-\u{302}]` remains a range between U+301 and U+302.
/// In either mode the members are then passed through
/// `flatteningCustomCharacterClassMembers`.
///
/// - Parameter ccc: The custom character class to process.
/// - Returns: A character class with the processed members and the same
///   `isInverted` value as `ccc`.
func coalescingCustomCharacterClass(
  _ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
  var members = ccc.members
  if options.semanticLevel == .graphemeCluster {
    members = coalescingCustomCharacterClassMembers(members)
  }
  let flattened = flatteningCustomCharacterClassMembers(members)
  return DSLTree.CustomCharacterClass(
    members: flattened,
    isInverted: ccc.isInverted)
}
861906
/// Emits matching code for a character that appears as a member of a custom
/// character class.
///
/// In grapheme-cluster mode the character is emitted with `emitCharacter`.
/// In scalar mode the character is treated as an alternation of the
/// individual scalars that comprise it, emitted via `emitAlternationGen`
/// with `withBacktracking: false`.
///
/// - Parameter c: The character to match.
mutating func emitCharacterInCCC(_ c: Character) {
  switch options.semanticLevel {
  case .unicodeScalar:
    let scalars = c.unicodeScalars
    emitAlternationGen(
      scalars,
      withBacktracking: false
    ) { generator, scalar in
      generator.emitMatchScalar(scalar)
    }
  case .graphemeCluster:
    emitCharacter(c)
  }
}
919+
/// Emits matching code for a single member of a custom character class.
///
/// - Parameter member: The member to generate code for. Quoted literals are
///   expected to have been removed beforehand; encountering one traps.
/// - Throws: Any error thrown while emitting a nested atom or character
///   class, or while generating the consumer for a range.
mutating func emitCCCMember(
  _ member: DSLTree.CustomCharacterClass.Member
) throws {
  switch member {
  case .trivia:
    // Trivia generates no code.
    return

  case .quotedLiteral:
    fatalError(" Removed in 'flatteningCustomCharacterClassMembers' ")

  case .atom(.char(let character)):
    emitCharacterInCCC(character)

  case .atom(.scalar(let scalar)):
    emitCharacterInCCC(Character(scalar))

  case .atom(let otherAtom):
    try emitAtom(otherAtom)

  case .custom(let nested):
    try emitCustomCharacterClass(nested)

  case .range:
    let rangeConsumer = try member.generateConsumer(options)
    builder.buildConsume(by: rangeConsumer)

  // TODO: Can we decide when it's better to try `rhs` first?
  // Intersection: a failure on either side simply propagates, so both
  // operands are run from the same starting position:
  //   - remember the current position
  //   - lhs
  //   - go back to the remembered position
  //   - rhs
  case let .intersection(lhs, rhs):
    let start = builder.makePositionRegister()
    builder.buildMoveCurrentPosition(into: start)
    try emitCustomCharacterClass(lhs)
    builder.buildRestorePosition(from: start)
    try emitCustomCharacterClass(rhs)

  // TODO: Can we decide when it's better to try `rhs` first?
  // Subtraction: a failure in `lhs` propagates, whereas a failure in `rhs`
  // is swallowed and turned into success:
  //   - remember the current position
  //   - lhs
  //   - save to `resume`
  //   - go back to the remembered position
  //   - rhs
  //   - clear, fail (both sides matched)
  //   - resume: ...
  case let .subtraction(lhs, rhs):
    let start = builder.makePositionRegister()
    let resume = builder.makeAddress()
    builder.buildMoveCurrentPosition(into: start)
    // No match in `lhs` is a failure that propagates.
    try emitCustomCharacterClass(lhs)
    builder.buildSave(resume)
    builder.buildRestorePosition(from: start)
    // No match in `rhs` is a success: execution resumes at `resume`.
    try emitCustomCharacterClass(rhs)
    // `rhs` matched as well: drop the `resume` save point and fail outward.
    builder.buildClear()
    builder.buildFail()
    builder.label(resume)

  // Symmetric difference always has to execute both `lhs` and `rhs`. Each
  // side is run with its failure ignored, and the position it ends at is
  // stored in a register. If the two resulting positions are equal, fail;
  // otherwise continue from whichever one differs from the start:
  //   - store the current position as `start`
  //   - save to `afterLHS`
  //   - lhs
  //   - clear the `afterLHS` save point (and fall through)
  //   - afterLHS: store the position as `lhsResult`
  //
  //   - restore `start`
  //   - save to `afterRHS`
  //   - rhs
  //   - clear the `afterRHS` save point (and fall through)
  //   - afterRHS: store the position as `rhsResult`
  //
  //   - restore `lhsResult`
  //   - if equal to `rhsResult`, goto bothSame (both sides had same result)
  //   - if equal to `start`, goto takeRHS (lhs failed)
  //   - goto finished
  //   - takeRHS: restore `rhsResult`
  //   - goto finished
  //   - bothSame: fail
  //   - finished: ...
  case let .symmetricDifference(lhs, rhs):
    let start = builder.makePositionRegister()
    let lhsResult = builder.makePositionRegister()
    let rhsResult = builder.makePositionRegister()
    let afterLHS = builder.makeAddress()
    let afterRHS = builder.makeAddress()
    let takeRHS = builder.makeAddress()
    let bothSame = builder.makeAddress()
    let finished = builder.makeAddress()

    // Run `lhs`, recording where it ends up.
    builder.buildMoveCurrentPosition(into: start)
    builder.buildSave(afterLHS)
    try emitCustomCharacterClass(lhs)
    builder.buildClear()
    builder.label(afterLHS)
    builder.buildMoveCurrentPosition(into: lhsResult)

    // Run `rhs` from the same starting position, recording where it ends up.
    builder.buildRestorePosition(from: start)
    builder.buildSave(afterRHS)
    try emitCustomCharacterClass(rhs)
    builder.buildClear()
    builder.label(afterRHS)
    builder.buildMoveCurrentPosition(into: rhsResult)

    // Identical results on both sides mean failure.
    builder.buildRestorePosition(from: lhsResult)
    builder.buildCondBranch(to: bothSame, ifSamePositionAs: rhsResult)

    // If `lhs` left the position at `start`, use the result of `rhs`.
    builder.buildCondBranch(to: takeRHS, ifSamePositionAs: start)
    builder.buildBranch(to: finished)
    builder.label(takeRHS)
    builder.buildRestorePosition(from: rhsResult)
    builder.buildBranch(to: finished)

    builder.label(bothSame)
    builder.buildFail()
    builder.label(finished)
  }
}
1042+
8621043 mutating func emitCustomCharacterClass(
8631044 _ ccc: DSLTree . CustomCharacterClass
8641045 ) throws {
@@ -876,8 +1057,67 @@ fileprivate extension Compiler.ByteCodeGen {
8761057 }
8771058 return
8781059 }
879- let consumer = try ccc. generateConsumer ( options)
880- builder. buildConsume ( by: consumer)
1060+
1061+ let updatedCCC : DSLTree . CustomCharacterClass
1062+ if optimizationsEnabled {
1063+ updatedCCC = ccc. coalescingASCIIMembers ( options)
1064+ } else {
1065+ updatedCCC = ccc
1066+ }
1067+ let filteredMembers = updatedCCC. members. filter ( { !$0. isOnlyTrivia} )
1068+
1069+ if updatedCCC. isInverted {
1070+ // inverted
1071+ // custom character class: p0 | p1 | ... | pn
1072+ // Try each member to make sure they all fail
1073+ // save next_p1
1074+ // <code for p0>
1075+ // clear, fail
1076+ // next_p1:
1077+ // save next_p2
1078+ // <code for p1>
1079+ // clear fail
1080+ // next_p2:
1081+ // save next_p...
1082+ // <code for p2>
1083+ // clear fail
1084+ // ...
1085+ // next_pn:
1086+ // save done
1087+ // <code for pn>
1088+ // clear fail
1089+ // done:
1090+ // step forward by 1
1091+ let done = builder. makeAddress ( )
1092+ for member in filteredMembers. dropLast ( ) {
1093+ let next = builder. makeAddress ( )
1094+ builder. buildSave ( next)
1095+ try emitCCCMember ( member)
1096+ builder. buildClear ( )
1097+ builder. buildFail ( )
1098+ builder. label ( next)
1099+ }
1100+ builder. buildSave ( done)
1101+ try emitCCCMember ( filteredMembers. last!)
1102+ builder. buildClear ( )
1103+ builder. buildFail ( )
1104+ builder. label ( done)
1105+
1106+ // Consume a single unit for the inverted ccc
1107+ switch options. semanticLevel {
1108+ case . graphemeCluster:
1109+ builder. buildAdvance ( 1 )
1110+ case . unicodeScalar:
1111+ builder. buildAdvanceUnicodeScalar ( 1 )
1112+ }
1113+ return
1114+ }
1115+ // non inverted CCC
1116+ // Custom character class: p0 | p1 | ... | pn
1117+ // Very similar to alternation, but we don't keep backtracking save points
1118+ try emitAlternationGen ( filteredMembers, withBacktracking: false ) {
1119+ try $0. emitCCCMember ( $1)
1120+ }
8811121 }
8821122
8831123 mutating func emitConcatenation( _ children: [ DSLTree . Node ] ) throws {
@@ -1014,6 +1254,12 @@ fileprivate extension Compiler.ByteCodeGen {
10141254}
10151255
10161256extension DSLTree . Node {
1257+ /// A Boolean value indicating whether this node advances the match position
1258+ /// on a successful match.
1259+ ///
1260+ /// For example, an alternation like `(a|b|c)` always advances the position
1261+ /// by a character, but `(a|b|)` has an empty branch, which matches without
1262+ /// advancing.
10171263 var guaranteesForwardProgress : Bool {
10181264 switch self {
10191265 case . orderedChoice( let children) :
@@ -1044,12 +1290,34 @@ extension DSLTree.Node {
10441290 case . consumer, . matcher:
10451291 // Allow zero width consumers and matchers
10461292 return false
1047- case . customCharacterClass:
1048- return true
1293+ case . customCharacterClass( let ccc ) :
1294+ return ccc . guaranteesForwardProgress
10491295 case . quantification( let amount, _, let child) :
10501296 let ( atLeast, _) = amount. ast. bounds
10511297 return atLeast ?? 0 > 0 && child. guaranteesForwardProgress
10521298 default : return false
10531299 }
10541300 }
10551301}
1302+
extension DSLTree.CustomCharacterClass {
  /// Whether this custom character class is considered to guarantee forward
  /// progress.
  ///
  /// We allow trivia into CustomCharacterClass, which could result in a CCC
  /// that matches nothing, i.e. `(?x)[ ]`. The answer is therefore decided by
  /// the first member that is not trivia; a class containing only trivia (or
  /// no members at all) yields `false`.
  var guaranteesForwardProgress: Bool {
    let firstNonTrivia = members.first { candidate in
      if case .trivia = candidate {
        return false
      }
      return true
    }
    guard let decisive = firstNonTrivia else {
      return false
    }

    switch decisive {
    case let .subtraction(lhs, _):
      // Only the left-hand operand is consulted for a subtraction.
      return lhs.guaranteesForwardProgress
    case let .intersection(lhs, rhs):
      return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress
    case let .symmetricDifference(lhs, rhs):
      return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress
    default:
      return true
    }
  }
}
0 commit comments