Skip to content

Commit cdff26b

Browse files
committed
Add list-based compilation
This implements bytecode generation from a DSLList instead of a DSLTree. The change includes tests verifying that every regex pattern in the `MatchTests` file produces exactly the same bytecode when compiled from a list as when compiled from a tree.
1 parent fa993aa commit cdff26b

File tree

8 files changed

+882
-49
lines changed

8 files changed

+882
-49
lines changed

Sources/_StringProcessing/ByteCodeGen+DSLList.swift

Lines changed: 723 additions & 0 deletions
Large diffs are not rendered by default.

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ extension Compiler {
2323
var hasEmittedFirstMatchableAtom = false
2424

2525
private let compileOptions: _CompileOptions
26-
fileprivate var optimizationsEnabled: Bool {
26+
/// Whether optimizations are enabled for this code generator, i.e. the
/// compile options do not contain `.disableOptimizations`.
internal var optimizationsEnabled: Bool {
  let optimizationsDisabled = compileOptions.contains(.disableOptimizations)
  return !optimizationsDisabled
}
2929

@@ -61,7 +61,7 @@ extension Compiler.ByteCodeGen {
6161
}
6262
}
6363

64-
fileprivate extension Compiler.ByteCodeGen {
64+
extension Compiler.ByteCodeGen {
6565
mutating func emitAtom(_ a: DSLTree.Atom) throws {
6666
defer {
6767
if a.isMatchable {

Sources/_StringProcessing/Compiler.swift

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,20 @@ class Compiler {
4040
captureList: tree.captureList)
4141
return try codegen.emitRoot(tree.root)
4242
}
43+
44+
/// Compiles this compiler's tree into a program by first converting the
/// tree into a `DSLList` and then generating bytecode from that list.
///
/// - Returns: The program produced by `ByteCodeGen.emitRoot` for the list.
/// - Throws: Any error thrown by `ByteCodeGen.emitRoot`.
__consuming func emitViaList() throws -> MEProgram {
  // TODO: Handle global options
  let dslList = DSLList(tree: tree)
  // Deliberately no per-node debug printing here: this runs for every
  // compiled pattern, so writing to stdout would flood test and client logs.
  var codegen = ByteCodeGen(
    options: options,
    compileOptions: compileOptions,
    captureList: tree.captureList)
  return try codegen.emitRoot(dslList)
}
4357
}
4458

4559
/// Hashable wrapper for `Any.Type`.

Sources/_StringProcessing/Regex/ASTConversion.swift

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -20,21 +20,6 @@ extension AST {
2020
extension AST.Node {
2121
/// Converts an AST node to a `convertedRegexLiteral` node.
2222
var dslTreeNode: DSLTree.Node {
23-
// func wrap(_ node: DSLTree.Node) -> DSLTree.Node {
24-
// switch node {
25-
// case .convertedRegexLiteral(let child, _):
26-
// // FIXME: DSL can have one item concats
27-
//// assertionFailure("Double wrapping?")
28-
// return child
29-
// default:
30-
// break
31-
// }
32-
// // TODO: Should we do this for the
33-
// // single-concatenation child too, or should?
34-
// // we wrap _that_?
35-
// return node
36-
// }
37-
3823
// Convert the top-level node without wrapping
3924
func convert() throws -> DSLTree.Node {
4025
switch self {
@@ -105,9 +90,7 @@ extension AST.Node {
10590
}
10691
}
10792

108-
// FIXME: make total function again
10993
let converted = try! convert()
110-
// return wrap(converted)
11194
return converted
11295
}
11396
}

Sources/_StringProcessing/Regex/DSLList.swift

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
1+
//===----------------------------------------------------------------------===//
12
//
2-
// DSLList.swift
3-
// swift-experimental-string-processing
3+
// This source file is part of the Swift.org open source project
44
//
5-
// Created by Nate Cook on 9/25/25.
5+
// Copyright (c) 2025 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
67
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
711

812
struct DSLList {
913
var nodes: [DSLTree.Node]
@@ -16,8 +20,8 @@ struct DSLList {
1620
self.nodes = nodes
1721
}
1822

19-
init(root: DSLTree.Node) {
20-
self.nodes = Array(root)
23+
/// Creates a list containing the nodes of `tree`, in the order produced by
/// the tree's `depthFirst` sequence.
init(tree: DSLTree) {
  let traversal = tree.depthFirst
  self.nodes = Array(traversal)
}
2226
}
2327

@@ -78,8 +82,8 @@ extension DSLTree.Node {
7882
}
7983
}
8084

81-
extension DSLTree.Node: Sequence {
82-
struct Iterator: Sequence, IteratorProtocol {
85+
extension DSLTree {
86+
struct DepthFirst: Sequence, IteratorProtocol {
8387
typealias Element = DSLTree.Node
8488
private var stack: [Frame]
8589
private let getChildren: (Element) -> [Element]
@@ -104,11 +108,24 @@ extension DSLTree.Node: Sequence {
104108
for child in top.children.reversed() {
105109
stack.append(Frame(node: child, children: getChildren(child)))
106110
}
107-
return top.node
111+
112+
// Since we coalesce the children before adding them to the stack,
113+
// we need an exact matching number of children in the list's
114+
// concatenation node, so that it can provide the correct component
115+
// count. This will go away/change when .concatenation only stores
116+
// a count.
117+
return switch top.node {
118+
case .concatenation:
119+
.concatenation(top.node.coalescedChildren)
120+
default:
121+
top.node
122+
}
108123
}
109124
}
110125

111-
func makeIterator() -> Iterator {
112-
Iterator(root: self, getChildren: { $0.children })
126+
/// A `DepthFirst` sequence over this tree, starting at `root` and using
/// each node's `coalescedChildren` as the children to visit.
var depthFirst: DepthFirst {
  let childrenOf: (DSLTree.Node) -> [DSLTree.Node] = { node in
    node.coalescedChildren
  }
  return DepthFirst(root: root, getChildren: childrenOf)
}
114131
}

Sources/_StringProcessing/Regex/DSLTree.swift

Lines changed: 61 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -393,16 +393,72 @@ extension DSLTree.Node {
393393
switch self {
394394

395395
case let .orderedChoice(v): return v
396-
case let .concatenation(v): return v
397-
396+
case let .concatenation(v): return v
397+
398398
case let .capture(_, _, n, _): return [n]
399399
case let .nonCapturingGroup(_, n): return [n]
400400
case let .quantification(_, _, n): return [n]
401401
case let .ignoreCapturesInTypedOutput(n): return [n]
402+
case let .limitCaptureNesting(n): return [n]
403+
404+
case let .conditional(_, t, f): return [t,f]
405+
406+
case .trivia, .empty, .quotedLiteral,
407+
.consumer, .matcher, .characterPredicate,
408+
.customCharacterClass, .atom:
409+
return []
410+
411+
case let .absentFunction(abs):
412+
return abs.ast.children.map(\.dslTreeNode)
413+
}
414+
}
415+
416+
public var coalescedChildren: [DSLTree.Node] {
417+
// Before converting a concatenation in a tree to list form, we need to
418+
// flatten out any nested concatenations, and coalesce any adjacent
419+
// characters and scalars, forming quoted literals of their contents,
420+
// over which we can perform grapheme breaking.
421+
422+
func flatten(_ node: DSLTree.Node) -> [DSLTree.Node] {
423+
switch node {
424+
case .concatenation(let ch):
425+
return ch.flatMap(flatten)
426+
case .ignoreCapturesInTypedOutput(let n), .limitCaptureNesting(let n):
427+
return flatten(n)
428+
default:
429+
return [node]
430+
}
431+
}
402432

403-
case let .limitCaptureNesting(n):
404-
// This is a transparent wrapper
405-
return n.children
433+
switch self {
434+
case let .orderedChoice(v): return v
435+
case let .concatenation(v):
436+
let children = v
437+
.flatMap(flatten)
438+
.coalescing(with: "", into: DSLTree.Node.quotedLiteral) { str, node in
439+
switch node {
440+
case .atom(let a):
441+
guard let c = a.literalCharacterValue else { return false }
442+
str.append(c)
443+
return true
444+
case .quotedLiteral(let q):
445+
str += q
446+
return true
447+
case .trivia:
448+
// Trivia can be completely ignored if we've already coalesced
449+
// something.
450+
return !str.isEmpty
451+
default:
452+
return false
453+
}
454+
}
455+
return children
456+
457+
case let .capture(_, _, n, _): return [n]
458+
case let .nonCapturingGroup(_, n): return [n]
459+
case let .quantification(_, _, n): return [n]
460+
case let .ignoreCapturesInTypedOutput(n): return [n]
461+
case let .limitCaptureNesting(n): return [n]
406462

407463
case let .conditional(_, t, f): return [t,f]
408464

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,37 @@
1+
//===----------------------------------------------------------------------===//
12
//
2-
// DSLListTests.swift
3-
// swift-experimental-string-processing
3+
// This source file is part of the Swift.org open source project
44
//
5-
// Created by Nate Cook on 9/25/25.
5+
// Copyright (c) 2025 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
67
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
711

812
import Testing
913
@testable import _StringProcessing
1014

1115
@Suite
1216
struct DSLListTests {
13-
@Test(arguments: [(#/abc/#, 4), (#/a(?:b+)c*/#, 7)])
14-
func simple(regex: Regex<Substring>, nodeCount: Int) {
15-
let dslList = DSLList(root: regex.root)
17+
/// Checks that converting a regex literal's tree into a `DSLList` yields the
/// expected number of nodes. The trailing comment on each argument names the
/// nodes that make up the expected count.
@available(macOS 9999, *)
18+
@Test(arguments: [
19+
(#/a/#, 2), // literal, a
20+
(#/abcd+/#, 5), // literal, concat, abc, quant, d
21+
(#/a(?:b+)c*/#, 8), // literal, concat, a, noncap grp, quant, b, quant, c
22+
])
23+
func convertedNodeCount(regex: Regex<Substring>, nodeCount: Int) {
24+
let dslList = DSLList(tree: regex.program.tree)
1625
#expect(dslList.nodes.count == nodeCount)
17-
for (i, node) in dslList.nodes.enumerated() {
18-
print(i, node)
19-
}
26+
}
27+
28+
/// Verifies that compiling a regex's tree through the list-based path yields
/// the same instructions as compiling it through the tree-based path.
@Test(arguments: [#/a|b/#, #/a+b?c/#, #/abc/#, #/a(?:b+)c*/#, #/;[\r\n]/#, #/(?=(?:[1-9]|(?:a|b)))/#])
func compilationComparison(regex: Regex<Substring>) throws {
  // Each path gets its own compiler; the list path is emitted first.
  let viaList = try Compiler(tree: regex.program.tree).emitViaList()
  let viaTree = try Compiler(tree: regex.program.tree).emit()

  #expect(viaTree.instructions == viaList.instructions)
}
2137
}

Tests/RegexTests/MatchTests.swift

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,16 +37,34 @@ func _roundTripLiteral(
3737
return remadeRegex
3838
}
3939

40+
/// Compiles `regex`'s tree twice, once through the tree-based path and once
/// through the list-based path, and compares the resulting instructions.
///
/// - Parameter regex: The regex whose program tree is compiled.
/// - Returns: `true` if both compilations produce equal instructions.
/// - Throws: Any error thrown by either compilation.
func _validateListCompilation<T>(
  _ regex: Regex<T>
) throws -> Bool {
  // Tree-based compilation runs first, then list-based, each on a fresh
  // compiler instance.
  let fromTree = try Compiler(tree: regex.program.tree).emit()
  let fromList = try Compiler(tree: regex.program.tree).emitViaList()
  return fromTree.instructions == fromList.instructions
}
49+
4050
func _firstMatch(
4151
_ regexStr: String,
4252
input: String,
4353
validateOptimizations: Bool,
4454
semanticLevel: RegexSemanticLevel = .graphemeCluster,
45-
syntax: SyntaxOptions = .traditional
55+
syntax: SyntaxOptions = .traditional,
56+
file: StaticString = #file,
57+
line: UInt = #line
4658
) throws -> (String, [String?])? {
4759
var regex = try Regex(regexStr, syntax: syntax).matchingSemantics(semanticLevel)
4860
let result = try regex.firstMatch(in: input)
49-
61+
62+
if try !_validateListCompilation(regex) {
63+
XCTFail(
64+
"List compilation failed for '\(regexStr)'",
65+
file: file, line: line)
66+
}
67+
5068
func validateSubstring(_ substringInput: Substring) throws {
5169
// Sometimes the characters we add to a substring merge with existing
5270
// string members. This messes up cross-validation, so skip the test.
@@ -105,14 +123,18 @@ func _firstMatch(
105123
For input '\(input)'
106124
Original: '\(regexStr)'
107125
_literalPattern: '\(roundTripRegex?._literalPattern ?? "<no pattern>")'
108-
""")
126+
""",
127+
file: file,
128+
line: line)
109129
case let (_, rtMatch?):
110130
XCTFail("""
111131
Incorrectly matched as '\(rtMatch)'
112132
For input '\(input)'
113133
Original: '\(regexStr)'
114134
_literalPattern: '\(roundTripRegex!._literalPattern!)'
115-
""")
135+
""",
136+
file: file,
137+
line: line)
116138
}
117139
}
118140

@@ -184,7 +206,8 @@ func flatCaptureTest(
184206
input: test,
185207
validateOptimizations: validateOptimizations,
186208
semanticLevel: semanticLevel,
187-
syntax: syntax
209+
syntax: syntax,
210+
file: file, line: line
188211
) else {
189212
if expect == nil {
190213
continue
@@ -303,7 +326,8 @@ func firstMatchTest(
303326
input: input,
304327
validateOptimizations: validateOptimizations,
305328
semanticLevel: semanticLevel,
306-
syntax: syntax)?.0
329+
syntax: syntax,
330+
file: file, line: line)?.0
307331

308332
if xfail {
309333
XCTAssertNotEqual(found, match, file: file, line: line)

0 commit comments

Comments
 (0)