@@ -828,19 +828,195 @@ fileprivate extension Compiler.ByteCodeGen {
828828 }
829829 }
830830
/// Flattens quoted strings into sequences of atoms, so that the standard
/// custom-character-class codegen will handle them.
///
/// Every `.quotedLiteral` member is replaced by one `.char` atom per
/// character, and duplicate `.char` / `.scalar` atoms are dropped. All other
/// members are passed through unchanged and keep their relative order; the
/// de-duplicated character atoms follow them, and the de-duplicated scalar
/// atoms come last.
///
/// - Parameter members: The members of a custom character class.
/// - Returns: The flattened member list, containing no `.quotedLiteral`
///   members at the top level.
func flatteningCustomCharacterClassMembers(
  _ members: [DSLTree.CustomCharacterClass.Member]
) -> [DSLTree.CustomCharacterClass.Member] {
  // The sets are used only for de-duplication. The arrays record first-seen
  // order so that the emitted member order (and therefore the generated
  // program) is deterministic; `Set` iteration order is unspecified and can
  // differ from one process launch to the next.
  var seenCharacters: Set<Character> = []
  var seenScalars: Set<UnicodeScalar> = []
  var characters: [Character] = []
  var scalars: [UnicodeScalar] = []
  var result: [DSLTree.CustomCharacterClass.Member] = []

  for member in members {
    switch member {
    case .atom(.char(let char)):
      if seenCharacters.insert(char).inserted {
        characters.append(char)
      }
    case .atom(.scalar(let scalar)):
      if seenScalars.insert(scalar).inserted {
        scalars.append(scalar)
      }
    case .quotedLiteral(let str):
      for char in str where seenCharacters.insert(char).inserted {
        characters.append(char)
      }
    default:
      // Any other atom, and every non-atom member, is kept as-is.
      result.append(member)
    }
  }

  result.append(contentsOf: characters.map { .atom(.char($0)) })
  result.append(contentsOf: scalars.map { .atom(.scalar($0)) })
  return result
}
860+
/// Returns a copy of `ccc` whose members are ready for code generation.
///
/// Coalescing is applied only in grapheme semantic mode. In scalar semantic
/// mode we don't want to coalesce any scalars into a grapheme, which means
/// that e.g. `[e\u{301}-\u{302}]` remains a range between U+301 and U+302.
/// In both modes the resulting members are then passed through
/// `flatteningCustomCharacterClassMembers`.
///
/// - Parameter ccc: The custom character class to prepare.
/// - Returns: A class with the processed members and the same inversion
///   flag as `ccc`.
func coalescingCustomCharacterClass(
  _ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
  let coalesced: [DSLTree.CustomCharacterClass.Member]
  if options.semanticLevel == .graphemeCluster {
    coalesced = coalescingCustomCharacterClassMembers(ccc.members)
  } else {
    coalesced = ccc.members
  }

  let flattened = flatteningCustomCharacterClassMembers(coalesced)
  return DSLTree.CustomCharacterClass(
    members: flattened,
    isInverted: ccc.isInverted)
}
843875
/// Emits the instructions that match `c` as a member of a custom character
/// class.
///
/// In grapheme-cluster mode this is a plain character match. In
/// Unicode-scalar mode it acts like an alternation of the individual scalars
/// that comprise the character.
///
/// - Parameter c: The character to match.
mutating func emitCharacterInCCC(_ c: Character) {
  guard options.semanticLevel == .unicodeScalar else {
    // Grapheme-cluster mode: match the character as a whole.
    emitCharacter(c)
    return
  }

  // Scalar mode layout, for scalars s0 ... sn:
  //     save next_1
  //     <match s0>
  //     clear
  //     branch finished
  //   next_1:
  //     ...
  //   next_n:
  //     <match sn>
  //   finished:
  let scalars = c.unicodeScalars
  let finished = builder.makeAddress()
  for alternative in scalars.dropLast() {
    let fallthroughTarget = builder.makeAddress()
    builder.buildSave(fallthroughTarget)
    emitMatchScalar(alternative)
    builder.buildClear()
    builder.buildBranch(to: finished)
    builder.label(fallthroughTarget)
  }
  // The final alternative needs no save point.
  emitMatchScalar(scalars.last!)
  builder.label(finished)
}
896+
/// Emits the instructions that match a single member of a custom character
/// class.
///
/// - Parameter member: The member to generate code for. `.quotedLiteral`
///   members must already have been removed by
///   `flatteningCustomCharacterClassMembers`; encountering one traps.
/// - Throws: Any error thrown while emitting an atom, emitting a nested
///   custom character class, or generating a consumer for a range.
mutating func emitCCCMember(
  _ member: DSLTree.CustomCharacterClass.Member
) throws {
  switch member {
  case .atom(.char(let character)):
    emitCharacterInCCC(character)

  case .atom(.scalar(let scalar)):
    emitCharacterInCCC(Character(scalar))

  case .atom(let otherAtom):
    try emitAtom(otherAtom)

  case .custom(let nested):
    try emitCustomCharacterClass(nested)

  case .quotedLiteral:
    fatalError(" Removed in 'flatteningCustomCharacterClassMembers' ")

  case .range:
    // Ranges are matched through a generated consumer.
    try builder.buildConsume(by: member.generateConsumer(options))

  case .trivia:
    // Trivia emits nothing.
    break

  // Intersection: a failure on either side propagates, so both operands are
  // simply run from the same starting position.
  //   - remember the current position
  //   - lhs
  //   - rewind to the remembered position
  //   - rhs
  // TODO: Can we decide when it's better to try `rhs` first?
  case .intersection(let lhs, let rhs):
    let start = builder.makePositionRegister()
    builder.buildMoveCurrentPosition(into: start)
    try emitCustomCharacterClass(lhs)
    builder.buildRestorePosition(from: start)
    try emitCustomCharacterClass(rhs)

  // Subtraction: a failure in `lhs` propagates, while a failure in `rhs` is
  // swallowed and turned into success.
  //   - remember the current position
  //   - lhs
  //   - save to `resume`
  //   - rewind to the remembered position
  //   - rhs
  //   - clear, fail (both sides matched)
  //   - resume: ...
  // TODO: Can we decide when it's better to try `rhs` first?
  case .subtraction(let lhs, let rhs):
    let start = builder.makePositionRegister()
    let resume = builder.makeAddress()
    builder.buildMoveCurrentPosition(into: start)
    // No match in `lhs` is a failure that propagates outward.
    try emitCustomCharacterClass(lhs)
    builder.buildSave(resume)
    builder.buildRestorePosition(from: start)
    // No match in `rhs` is a success: execution resumes at `resume`.
    try emitCustomCharacterClass(rhs)
    // Both sides matched: drop the `resume` save point and fail outward.
    builder.buildClear()
    builder.buildFail()
    builder.label(resume)

  // Symmetric difference: both operands always have to run. Each is executed
  // with its failure swallowed, and the position it ends at is recorded. If
  // the two end positions are equal the member fails; otherwise the position
  // that differs from the start is used.
  //   - record current position as `origin`
  //   - save to `afterLHS`
  //   - lhs
  //   - clear the save point (and continue)
  //   - afterLHS: record position as `lhsEnd`
  //
  //   - rewind to `origin`
  //   - save to `afterRHS`
  //   - rhs
  //   - clear the save point (and continue)
  //   - afterRHS: record position as `rhsEnd`
  //
  //   - move to `lhsEnd`
  //   - if equal to `rhsEnd`, goto reject (both sides had the same result)
  //   - if equal to `origin`, goto useRHS (lhs failed)
  //   - goto finish
  //   - useRHS: move to `rhsEnd`
  //   - goto finish
  //   - reject: fail
  //   - finish: ...
  case .symmetricDifference(let lhs, let rhs):
    let origin = builder.makePositionRegister()
    let lhsEnd = builder.makePositionRegister()
    let rhsEnd = builder.makePositionRegister()
    let afterLHS = builder.makeAddress()
    let afterRHS = builder.makeAddress()
    let useRHS = builder.makeAddress()
    let reject = builder.makeAddress()
    let finish = builder.makeAddress()

    // Run `lhs`, swallowing failure, and record where it ended.
    builder.buildMoveCurrentPosition(into: origin)
    builder.buildSave(afterLHS)
    try emitCustomCharacterClass(lhs)
    builder.buildClear()
    builder.label(afterLHS)
    builder.buildMoveCurrentPosition(into: lhsEnd)

    // Run `rhs` from the same starting point, likewise.
    builder.buildRestorePosition(from: origin)
    builder.buildSave(afterRHS)
    try emitCustomCharacterClass(rhs)
    builder.buildClear()
    builder.label(afterRHS)
    builder.buildMoveCurrentPosition(into: rhsEnd)

    // Identical end positions mean the member fails.
    builder.buildRestorePosition(from: lhsEnd)
    builder.buildCondBranch(to: reject, ifSamePositionAs: rhsEnd)

    // If `lhs` did not move from the origin, take the `rhs` position instead.
    builder.buildCondBranch(to: useRHS, ifSamePositionAs: origin)
    builder.buildBranch(to: finish)
    builder.label(useRHS)
    builder.buildRestorePosition(from: rhsEnd)
    builder.buildBranch(to: finish)

    builder.label(reject)
    builder.buildFail()
    builder.label(finish)
  }
}
1019+
8441020 mutating func emitCustomCharacterClass(
8451021 _ ccc: DSLTree . CustomCharacterClass
8461022 ) throws {
@@ -858,8 +1034,93 @@ fileprivate extension Compiler.ByteCodeGen {
8581034 }
8591035 return
8601036 }
861- let consumer = try ccc. generateConsumer ( options)
862- builder. buildConsume ( by: consumer)
1037+
1038+ let updatedCCC : DSLTree . CustomCharacterClass
1039+ if optimizationsEnabled {
1040+ updatedCCC = ccc. coalescingASCIIMembers ( options)
1041+ } else {
1042+ updatedCCC = ccc
1043+ }
1044+ let filteredMembers = updatedCCC. members. filter ( { !$0. isOnlyTrivia} )
1045+
1046+ if updatedCCC. isInverted {
1047+ // inverted
1048+ // custom character class: p0 | p1 | ... | pn
1049+ // Try each member to make sure they all fail
1050+ // save next_p1
1051+ // <code for p0>
1052+ // clear, fail
1053+ // next_p1:
1054+ // save next_p2
1055+ // <code for p1>
1056+ // clear fail
1057+ // next_p2:
1058+ // save next_p...
1059+ // <code for p2>
1060+ // clear fail
1061+ // ...
1062+ // next_pn:
1063+ // save done
1064+ // <code for pn>
1065+ // clear fail
1066+ // done:
1067+ // step forward by 1
1068+ let done = builder. makeAddress ( )
1069+ for member in filteredMembers. dropLast ( ) {
1070+ let next = builder. makeAddress ( )
1071+ builder. buildSave ( next)
1072+ try emitCCCMember ( member)
1073+ builder. buildClear ( )
1074+ builder. buildFail ( )
1075+ builder. label ( next)
1076+ }
1077+ builder. buildSave ( done)
1078+ try emitCCCMember ( filteredMembers. last!)
1079+ builder. buildClear ( )
1080+ builder. buildFail ( )
1081+ builder. label ( done)
1082+
1083+ // Consume a single unit for the inverted ccc
1084+ switch options. semanticLevel {
1085+ case . graphemeCluster:
1086+ builder. buildAdvance ( 1 )
1087+ case . unicodeScalar:
1088+ builder. buildAdvanceUnicodeScalar ( 1 )
1089+ }
1090+ return
1091+ }
1092+ // non inverted CCC
1093+ // Custom character class: p0 | p1 | ... | pn
1094+ // Very similar to alternation, but we don't keep backtracking save points
1095+ // save next_p1
1096+ // <code for p0>
1097+ // clear
1098+ // branch done
1099+ // next_p1:
1100+ // save next_p2
1101+ // <code for p1>
1102+ // clear
1103+ // branch done
1104+ // next_p2:
1105+ // save next_p...
1106+ // <code for p2>
1107+ // clear
1108+ // branch done
1109+ // ...
1110+ // next_pn:
1111+ // <code for pn>
1112+ // done:
1113+ let done = builder. makeAddress ( )
1114+ for member in filteredMembers. dropLast ( ) {
1115+ let next = builder. makeAddress ( )
1116+ builder. buildSave ( next)
1117+ try emitCCCMember ( member)
1118+ builder. buildClear ( )
1119+ builder. buildBranch ( to: done)
1120+ builder. label ( next)
1121+ }
1122+ try emitCCCMember ( filteredMembers. last!)
1123+ builder. label ( done)
8631124 }
8641125
8651126 mutating func emitConcatenation( _ children: [ DSLTree . Node ] ) throws {
@@ -996,6 +1257,12 @@ fileprivate extension Compiler.ByteCodeGen {
9961257}
9971258
9981259extension DSLTree . Node {
1260+ /// A Boolean value indicating whether this node advances the match position
1261+ /// on a successful match.
1262+ ///
1263+ /// For example, an alternation like `(a|b|c)` always advances the position
1264+ /// by a character, but `(a|b|)` has an empty branch, which matches without
1265+ /// advancing.
9991266 var guaranteesForwardProgress : Bool {
10001267 switch self {
10011268 case . orderedChoice( let children) :
@@ -1026,12 +1293,34 @@ extension DSLTree.Node {
10261293 case . consumer, . matcher:
10271294 // Allow zero width consumers and matchers
10281295 return false
1029- case . customCharacterClass:
1030- return true
1296+ case . customCharacterClass( let ccc ) :
1297+ return ccc . guaranteesForwardProgress
10311298 case . quantification( let amount, _, let child) :
10321299 let ( atLeast, _) = amount. ast. bounds
10331300 return atLeast ?? 0 > 0 && child. guaranteesForwardProgress
10341301 default : return false
10351302 }
10361303 }
10371304}
1305+
extension DSLTree.CustomCharacterClass {
  /// A Boolean value indicating whether this custom character class is
  /// considered to advance the match position.
  ///
  /// Trivia is allowed inside a custom character class, which could result in
  /// a class that matches nothing, i.e. `(?x)[ ]`; such a class reports
  /// `false`. Otherwise the first non-trivia member decides the result: set
  /// operations defer to their operands, and any other member reports `true`.
  var guaranteesForwardProgress: Bool {
    let firstMeaningfulMember = members.first { candidate in
      if case .trivia = candidate { return false }
      return true
    }
    guard let decisive = firstMeaningfulMember else {
      // Empty, or nothing but trivia.
      return false
    }

    switch decisive {
    case .intersection(let lhs, let rhs),
         .symmetricDifference(let lhs, let rhs):
      return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress
    case .subtraction(let lhs, _):
      // Only the left-hand operand is consulted for subtraction.
      return lhs.guaranteesForwardProgress
    default:
      return true
    }
  }
}
0 commit comments