Files
Odin/core/text/regex/compiler/compiler.odin
gingerBill 842cfee0f3 Change Odin's LICENSE to zlib from BSD 3-clause
This change was made in order to allow things produced with Odin and using Odin's core library, to not require the LICENSE to also be distributed alongside the binary form.
2025-10-28 14:38:25 +00:00

557 lines
15 KiB
Odin

package regex_compiler
/*
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
Made available under Odin's license.
List of contributors:
Feoramund: Initial implementation.
*/
import "base:intrinsics"
import "core:text/regex/common"
import "core:text/regex/parser"
import "core:text/regex/tokenizer"
import "core:text/regex/virtual_machine"
import "core:unicode"
// Local aliases so this package can refer to tokenizer/parser/VM types
// without repeating the full package prefix.
Token :: tokenizer.Token
Token_Kind :: tokenizer.Token_Kind
Tokenizer :: tokenizer.Tokenizer
Rune_Class_Range :: parser.Rune_Class_Range
Rune_Class_Data :: parser.Rune_Class_Data
// Parse-tree node aliases.
Node :: parser.Node
Node_Rune :: parser.Node_Rune
Node_Rune_Class :: parser.Node_Rune_Class
Node_Wildcard :: parser.Node_Wildcard
Node_Concatenation :: parser.Node_Concatenation
Node_Alternation :: parser.Node_Alternation
Node_Repeat_Zero :: parser.Node_Repeat_Zero
Node_Repeat_Zero_Non_Greedy :: parser.Node_Repeat_Zero_Non_Greedy
Node_Repeat_One :: parser.Node_Repeat_One
Node_Repeat_One_Non_Greedy :: parser.Node_Repeat_One_Non_Greedy
Node_Repeat_N :: parser.Node_Repeat_N
Node_Optional :: parser.Node_Optional
Node_Optional_Non_Greedy :: parser.Node_Optional_Non_Greedy
Node_Group :: parser.Node_Group
Node_Anchor :: parser.Node_Anchor
Node_Word_Boundary :: parser.Node_Word_Boundary
Node_Match_All_And_Escape :: parser.Node_Match_All_And_Escape
Opcode :: virtual_machine.Opcode
// A compiled program is a flat stream of opcodes and inline operand bytes.
Program :: [dynamic]Opcode
// Encoded sizes (in program bytes) of a Jump instruction (opcode + one u16
// operand) and a Split instruction (opcode + two u16 operands). These are
// used throughout code generation for relative-offset arithmetic.
JUMP_SIZE :: size_of(Opcode) + 1 * size_of(u16)
SPLIT_SIZE :: size_of(Opcode) + 2 * size_of(u16)
// State threaded through `generate_code` while walking the parse tree.
Compiler :: struct {
flags: common.Flags, // option flags consulted during codegen (e.g. .Unicode, .Multiline, .No_Capture)
class_data: [dynamic]Rune_Class_Data, // deduplicated rune classes; the index into this array is the operand of the Rune_Class opcodes
}
// Errors that `compile` can return.
Error :: enum {
None,
Program_Too_Big, // final program reached common.MAX_PROGRAM_SIZE
Too_Many_Classes, // distinct rune classes reached common.MAX_CLASSES
}
// Reports whether two rune classes are structurally identical: the same
// runes in the same order and the same ranges in the same order.
// Both pointers must be non-nil.
classes_are_exact :: proc(q, w: ^Rune_Class_Data) -> bool #no_bounds_check {
	assert(q != nil)
	assert(w != nil)
	// The same underlying data is trivially equal.
	if q == w {
		return true
	}
	// Differing lengths can never compare equal.
	if len(q.runes) != len(w.runes) || len(q.ranges) != len(w.ranges) {
		return false
	}
	for i in 0..<len(q.runes) {
		if q.runes[i] != w.runes[i] {
			return false
		}
	}
	for i in 0..<len(q.ranges) {
		a, b := q.ranges[i], w.ranges[i]
		if a.lower != b.lower || a.upper != b.upper {
			return false
		}
	}
	return true
}
// Walks the parse tree and gathers every *distinct* Rune_Class_Data into
// `collection`, deduplicating with `classes_are_exact`. The index of each
// class in `collection` is what `generate_code` later emits as the operand
// of the Rune_Class / Rune_Class_Negated opcodes.
map_all_classes :: proc(tree: Node, collection: ^[dynamic]Rune_Class_Data) {
if tree == nil {
return
}
switch specific in tree {
// Leaf nodes that carry no class data: nothing to collect.
case ^Node_Rune: break
case ^Node_Wildcard: break
case ^Node_Anchor: break
case ^Node_Word_Boundary: break
case ^Node_Match_All_And_Escape: break
// Compound nodes: recurse into every child.
case ^Node_Concatenation:
for subnode in specific.nodes {
map_all_classes(subnode, collection)
}
case ^Node_Repeat_Zero:
map_all_classes(specific.inner, collection)
case ^Node_Repeat_Zero_Non_Greedy:
map_all_classes(specific.inner, collection)
case ^Node_Repeat_One:
map_all_classes(specific.inner, collection)
case ^Node_Repeat_One_Non_Greedy:
map_all_classes(specific.inner, collection)
case ^Node_Repeat_N:
map_all_classes(specific.inner, collection)
case ^Node_Optional:
map_all_classes(specific.inner, collection)
case ^Node_Optional_Non_Greedy:
map_all_classes(specific.inner, collection)
case ^Node_Group:
map_all_classes(specific.inner, collection)
case ^Node_Alternation:
map_all_classes(specific.left, collection)
map_all_classes(specific.right, collection)
case ^Node_Rune_Class:
// Append only if no structurally-equal class has been seen before.
unseen := true
for &value in collection {
if classes_are_exact(&specific.data, &value) {
unseen = false
break
}
}
if unseen {
append(collection, specific.data)
}
}
}
// Appends the raw bytes of `data` to the program, one Opcode slot per byte.
append_raw :: #force_inline proc(code: ^Program, data: $T) {
	// NOTE: This is system-dependent endian.
	bytes := transmute([size_of(T)]byte)data
	for b in bytes {
		append(code, Opcode(b))
	}
}
// Inserts the raw bytes of `data` into the program at `start`, one Opcode
// slot per byte, shifting everything after `start` forward.
inject_raw :: #force_inline proc(code: ^Program, start: int, data: $T) {
	// NOTE: This is system-dependent endian.
	bytes := transmute([size_of(T)]byte)data
	for i in 0..<len(bytes) {
		inject_at(code, start + i, Opcode(bytes[i]))
	}
}
// Recursively emits virtual machine bytecode for `node` into a fresh
// Program, following the classic Thompson-NFA constructions. Jump and
// Split operands are written here as *relative* i16 offsets measured from
// the instruction's own address; `compile` later rewrites them into
// absolute program counters.
@require_results
generate_code :: proc(c: ^Compiler, node: Node) -> (code: Program) {
if node == nil {
return
}
// NOTE: For Jump/Split arguments, we write as i16 and will reinterpret
// this later when relative jumps are turned into absolute jumps.
switch specific in node {
// Atomic Nodes:
case ^Node_Rune:
// Runes below MAX_LATIN1 (or all runes when not in Unicode mode) fit
// in a single-byte operand; larger runes need the multi-byte Rune form.
if .Unicode not_in c.flags || specific.data < unicode.MAX_LATIN1 {
append(&code, Opcode.Byte)
append(&code, cast(Opcode)specific.data)
} else {
append(&code, Opcode.Rune)
append_raw(&code, specific.data)
}
case ^Node_Rune_Class:
if specific.negating {
append(&code, Opcode.Rune_Class_Negated)
} else {
append(&code, Opcode.Rune_Class)
}
// The operand is the index of this class in the compiler's collected
// class data (populated earlier by `map_all_classes`).
index := -1
for &data, i in c.class_data {
if classes_are_exact(&data, &specific.data) {
index = i
break
}
}
assert(index != -1, "Unable to find collected Rune_Class_Data index.")
append(&code, Opcode(index))
case ^Node_Wildcard:
append(&code, Opcode.Wildcard)
case ^Node_Anchor:
if .Multiline in c.flags {
if specific.start {
append(&code, Opcode.Assert_Start_Multiline)
} else {
append(&code, Opcode.Multiline_Open)
append(&code, Opcode.Multiline_Close)
}
} else {
if specific.start {
append(&code, Opcode.Assert_Start)
} else {
append(&code, Opcode.Assert_End)
}
}
case ^Node_Word_Boundary:
if specific.non_word {
append(&code, Opcode.Assert_Non_Word_Boundary)
} else {
append(&code, Opcode.Assert_Word_Boundary)
}
// Compound Nodes:
case ^Node_Group:
// `(e)` => Save 2n; e; Save 2n+1   (only when capturing is enabled)
code = generate_code(c, specific.inner)
if specific.capture && .No_Capture not_in c.flags {
inject_at(&code, 0, Opcode.Save)
inject_at(&code, 1, Opcode(2 * specific.capture_id))
append(&code, Opcode.Save)
append(&code, Opcode(2 * specific.capture_id + 1))
}
case ^Node_Alternation:
// `e1|e2` => Split L1, L2; L1: e1; Jump L3; L2: e2; L3: ...
left := generate_code(c, specific.left)
right := generate_code(c, specific.right)
left_len := len(left)
// Avoiding duplicate allocation by reusing `left`.
code = left
inject_at(&code, 0, Opcode.Split)
inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE))
inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + left_len + JUMP_SIZE))
append(&code, Opcode.Jump)
append_raw(&code, i16(len(right) + JUMP_SIZE))
for opcode in right {
append(&code, opcode)
}
case ^Node_Concatenation:
// `e1 e2 ...` => e1; e2; ...
for subnode in specific.nodes {
subnode_code := generate_code(c, subnode)
for opcode in subnode_code {
append(&code, opcode)
}
}
case ^Node_Repeat_Zero:
// `e*` => L1: Split L2, L3; L2: e; Jump L1; L3: ...
code = generate_code(c, specific.inner)
original_len := len(code)
inject_at(&code, 0, Opcode.Split)
inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE))
inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + original_len + JUMP_SIZE))
append(&code, Opcode.Jump)
append_raw(&code, i16(-original_len - SPLIT_SIZE))
case ^Node_Repeat_Zero_Non_Greedy:
// `e*?` => same as `e*` but with the Split branch priority swapped,
// so the VM prefers exiting the loop first.
code = generate_code(c, specific.inner)
original_len := len(code)
inject_at(&code, 0, Opcode.Split)
inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE + original_len + JUMP_SIZE))
inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE))
append(&code, Opcode.Jump)
append_raw(&code, i16(-original_len - SPLIT_SIZE))
case ^Node_Repeat_One:
// `e+` => L1: e; Split L1, L2; L2: ...
code = generate_code(c, specific.inner)
original_len := len(code)
append(&code, Opcode.Split)
append_raw(&code, i16(-original_len))
append_raw(&code, i16(SPLIT_SIZE))
case ^Node_Repeat_One_Non_Greedy:
// `e+?` => as `e+` with the branch priority swapped (prefer leaving).
code = generate_code(c, specific.inner)
original_len := len(code)
append(&code, Opcode.Split)
append_raw(&code, i16(SPLIT_SIZE))
append_raw(&code, i16(-original_len))
case ^Node_Repeat_N:
inside := generate_code(c, specific.inner)
original_len := len(inside)
if specific.lower == specific.upper { // {N}
// e{N} ... evaluates to ... e^N
for i := 0; i < specific.upper; i += 1 {
for opcode in inside {
append(&code, opcode)
}
}
} else if specific.lower == -1 && specific.upper > 0 { // {,M}
// e{,M} ... evaluates to ... e?^M
for i := 0; i < specific.upper; i += 1 {
append(&code, Opcode.Split)
append_raw(&code, i16(SPLIT_SIZE))
append_raw(&code, i16(SPLIT_SIZE + original_len))
for opcode in inside {
append(&code, opcode)
}
}
} else if specific.lower >= 0 && specific.upper == -1 { // {N,}
// e{N,} ... evaluates to ... e^N e*
for i := 0; i < specific.lower; i += 1 {
for opcode in inside {
append(&code, opcode)
}
}
append(&code, Opcode.Split)
append_raw(&code, i16(SPLIT_SIZE))
append_raw(&code, i16(SPLIT_SIZE + original_len + JUMP_SIZE))
for opcode in inside {
append(&code, opcode)
}
append(&code, Opcode.Jump)
append_raw(&code, i16(-original_len - SPLIT_SIZE))
} else if specific.lower >= 0 && specific.upper > 0 {
// e{N,M} evaluates to ... e^N e?^(M-N)
for i := 0; i < specific.lower; i += 1 {
for opcode in inside {
append(&code, opcode)
}
}
for i := 0; i < specific.upper - specific.lower; i += 1 {
append(&code, Opcode.Split)
append_raw(&code, i16(SPLIT_SIZE + original_len))
append_raw(&code, i16(SPLIT_SIZE))
for opcode in inside {
append(&code, opcode)
}
}
} else {
// The parser should never hand us bounds outside the cases above.
panic("RegEx compiler received invalid repetition group.")
}
case ^Node_Optional:
// `e?` => Split L1, L2; L1: e; L2: ...
code = generate_code(c, specific.inner)
original_len := len(code)
inject_at(&code, 0, Opcode.Split)
inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE))
inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + original_len))
case ^Node_Optional_Non_Greedy:
// `e??` => as `e?` with the branch priority swapped (prefer skipping).
code = generate_code(c, specific.inner)
original_len := len(code)
inject_at(&code, 0, Opcode.Split)
inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE + original_len))
inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE))
case ^Node_Match_All_And_Escape:
append(&code, Opcode.Match_All_And_Escape)
}
return
}
// Compiles a parse tree into a VM program:
//
//  1. Collect and deduplicate rune classes (`map_all_classes`).
//  2. Generate bytecode with relative jumps (`generate_code`).
//  3. Optimize the program opening (Wait_For_* opcodes or a `.*?` prefix).
//  4. Wrap the program in capture group 0 saves (unless .No_Capture).
//  5. Optionally collapse jump-to-jump chains.
//  6. Rewrite all relative Jump/Split offsets into absolute addresses.
//
// Returns the program, the collected class data, and an error if the
// program or class count exceeds the VM's limits.
@require_results
compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data: [dynamic]Rune_Class_Data, err: Error) {
if tree == nil {
// Empty pattern: emit a minimal program that matches immediately.
if .No_Capture not_in flags {
append(&code, Opcode.Save); append(&code, Opcode(0x00))
append(&code, Opcode.Save); append(&code, Opcode(0x01))
append(&code, Opcode.Match)
} else {
append(&code, Opcode.Match_And_Exit)
}
return
}
c: Compiler
c.flags = flags
map_all_classes(tree, &class_data)
if len(class_data) >= common.MAX_CLASSES {
err = .Too_Many_Classes
return
}
c.class_data = class_data
code = generate_code(&c, tree)
// `pc_open` tracks how many prefix bytes have been injected at the start.
pc_open := 0
optimize_opening: {
// Check if the opening to the pattern is predictable.
// If so, use one of the optimized Wait opcodes.
iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
seek_loop: for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
#partial switch opcode {
case .Byte:
inject_at(&code, pc_open, Opcode.Wait_For_Byte)
pc_open += size_of(Opcode)
// The operand originally at pc+1 has shifted by the bytes
// injected so far, hence the `+ pc_open` adjustment.
inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
pc_open += size_of(u8)
break optimize_opening
case .Rune:
// Read the operand before injecting, while `pc` is still valid.
operand := intrinsics.unaligned_load(cast(^rune)&code[pc+1])
inject_at(&code, pc_open, Opcode.Wait_For_Rune)
pc_open += size_of(Opcode)
inject_raw(&code, pc_open, operand)
pc_open += size_of(rune)
break optimize_opening
case .Rune_Class:
inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class)
pc_open += size_of(Opcode)
inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
pc_open += size_of(u8)
break optimize_opening
case .Rune_Class_Negated:
inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class_Negated)
pc_open += size_of(Opcode)
inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
pc_open += size_of(u8)
break optimize_opening
case .Save:
// Skip over capture saves to find the first concrete opcode.
continue
case .Assert_Start, .Assert_Start_Multiline:
// Anchored patterns need no scanning prefix at all.
break optimize_opening
case:
// Unpredictable opening; fall through to the generic `.*?` prefix.
break seek_loop
}
}
// `.*?`
inject_at(&code, pc_open, Opcode.Split)
pc_open += size_of(byte)
inject_raw(&code, pc_open, i16(SPLIT_SIZE + size_of(byte) + JUMP_SIZE))
pc_open += size_of(i16)
inject_raw(&code, pc_open, i16(SPLIT_SIZE))
pc_open += size_of(i16)
inject_at(&code, pc_open, Opcode.Wildcard)
pc_open += size_of(byte)
inject_at(&code, pc_open, Opcode.Jump)
pc_open += size_of(byte)
inject_raw(&code, pc_open, i16(-size_of(byte) - SPLIT_SIZE))
pc_open += size_of(i16)
}
if .No_Capture not_in flags {
// `(` <generated code>
inject_at(&code, pc_open, Opcode.Save)
inject_at(&code, pc_open + size_of(byte), Opcode(0x00))
// `)`
append(&code, Opcode.Save); append(&code, Opcode(0x01))
append(&code, Opcode.Match)
} else {
append(&code, Opcode.Match_And_Exit)
}
if len(code) >= common.MAX_PROGRAM_SIZE {
err = .Program_Too_Big
return
}
// NOTE: No further opcode addition beyond this point, as we've already
// checked the program size. Removal or transformation is fine.
// Post-Compile Optimizations:
// * Jump Extension
//
// A:RelJmp(1) -> B:RelJmp(2) => A:RelJmp(2)
if .No_Optimization not_in flags {
// Iterate to a fixed point: collapsing one chain may expose another.
for passes_left := 1; passes_left > 0; passes_left -= 1 {
do_another_pass := false
iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
#partial switch opcode {
case .Jump:
jmp := cast(^i16)&code[pc+size_of(Opcode)]
jmp_value := intrinsics.unaligned_load(jmp)
if code[cast(i16)pc+jmp_value] == .Jump {
next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_value+size_of(Opcode)])
intrinsics.unaligned_store(jmp, jmp_value + next_jmp)
do_another_pass = true
}
case .Split:
// Both Split branches can independently point at a Jump.
jmp_x := cast(^i16)&code[pc+size_of(Opcode)]
jmp_x_value := intrinsics.unaligned_load(jmp_x)
if code[cast(i16)pc+jmp_x_value] == .Jump {
next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_x_value+size_of(Opcode)])
intrinsics.unaligned_store(jmp_x, jmp_x_value + next_jmp)
do_another_pass = true
}
jmp_y := cast(^i16)&code[pc+size_of(Opcode)+size_of(i16)]
jmp_y_value := intrinsics.unaligned_load(jmp_y)
if code[cast(i16)pc+jmp_y_value] == .Jump {
next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_y_value+size_of(Opcode)])
intrinsics.unaligned_store(jmp_y, jmp_y_value + next_jmp)
do_another_pass = true
}
}
}
if do_another_pass {
passes_left += 1
}
}
}
// * Relative Jump to Absolute Jump
//
// RelJmp{PC +/- N} => AbsJmp{M}
iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
// NOTE: The virtual machine implementation depends on this.
#partial switch opcode {
case .Jump:
// Absolute target = instruction address + stored relative offset.
jmp := cast(^u16)&code[pc+size_of(Opcode)]
intrinsics.unaligned_store(jmp, intrinsics.unaligned_load(jmp) + cast(u16)pc)
case .Split:
jmp_x := cast(^u16)&code[pc+size_of(Opcode)]
intrinsics.unaligned_store(jmp_x, intrinsics.unaligned_load(jmp_x) + cast(u16)pc)
jmp_y := cast(^u16)&code[pc+size_of(Opcode)+size_of(i16)]
intrinsics.unaligned_store(jmp_y, intrinsics.unaligned_load(jmp_y) + cast(u16)pc)
}
}
return
}