diff --git a/flake.lock b/flake.lock index dcf837fa..01cb7067 100644 --- a/flake.lock +++ b/flake.lock @@ -8,11 +8,11 @@ "rust-analyzer-src": "rust-analyzer-src" }, "locked": { - "lastModified": 1770708269, - "narHash": "sha256-OnZW86app7hHJJoB5lC9GNXY5QBBIESJB+sIdwEyld0=", + "lastModified": 1778493576, + "narHash": "sha256-/vvNyF8C2tNTkxtffGUQbcTJvf72cRw3qo8cyBh33pM=", "owner": "nix-community", "repo": "fenix", - "rev": "6b5325a017a9a9fe7e6252ccac3680cc7181cd63", + "rev": "5bf88a04d8678c7334f2f5072975f3b2cb0fe1ba", "type": "github" }, "original": { @@ -55,24 +55,6 @@ "type": "github" } }, - "flake-utils_2": { - "inputs": { - "systems": "systems_2" - }, - "locked": { - "lastModified": 1705309234, - "narHash": "sha256-uNRRNRKmJyCRC/8y1RqBkqWBLM034y4qN7EprSdmgyA=", - "owner": "numtide", - "repo": "flake-utils", - "rev": "1ef2e671c3b0c19053962c07dbda38332dcebf26", - "type": "github" - }, - "original": { - "owner": "numtide", - "repo": "flake-utils", - "type": "github" - } - }, "gitignore": { "inputs": { "nixpkgs": [ @@ -96,16 +78,16 @@ }, "nixpkgs": { "locked": { - "lastModified": 1768649915, - "narHash": "sha256-jc21hKogFnxU7KXSVTRmxC7u5D4RHwm9BAvDf5/Z1Uo=", + "lastModified": 1778003029, + "narHash": "sha256-q/nkKLDtHIyLjZpKhWk3cSK5IYsFqtMd6UtXF3ddjgA=", "owner": "nixos", "repo": "nixpkgs", - "rev": "3e3f3c7f9977dc123c23ee21e8085ed63daf8c37", + "rev": "0c88e1f2bdb93d5999019e99cb0e61e1fe2af4c5", "type": "github" }, "original": { "owner": "nixos", - "ref": "release-25.05", + "ref": "nixos-25.11", "repo": "nixpkgs", "type": "github" } @@ -122,11 +104,11 @@ "rust-analyzer-src": { "flake": false, "locked": { - "lastModified": 1770668050, - "narHash": "sha256-Q05yaIZtQrBKHpyWaPmyJmDRj0lojnVf8nUFE0vydcY=", + "lastModified": 1778424672, + "narHash": "sha256-v/CZ9tJT+ulSe3ZmjuG3lWABwOvITbT7EqF/2NAl3Hs=", "owner": "rust-lang", "repo": "rust-analyzer", - "rev": "9efc1f709f3c8134c3acac5d3592a8e4c184a0c6", + "rev": "e266f5cab8f6525d0bc2ddccc0006418c534b5e6", "type": "github" }, 
"original": { @@ -152,6 +134,7 @@ } }, "systems_2": { + "flake": false, "locked": { "lastModified": 1681028828, "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", @@ -169,17 +152,17 @@ "zigPkgs": { "inputs": { "flake-compat": "flake-compat", - "flake-utils": "flake-utils_2", "nixpkgs": [ "nixpkgs" - ] + ], + "systems": "systems_2" }, "locked": { - "lastModified": 1770598090, - "narHash": "sha256-k+82IDgTd9o5sxHIqGlvfwseKln3Ejx1edGtDltuPXo=", + "lastModified": 1778375309, + "narHash": "sha256-3+5C2LDX1lmupM6ktG6i50BRvRnN32WLinpxqa2g+HQ=", "owner": "mitchellh", "repo": "zig-overlay", - "rev": "142495696982c88edddc8e17e4da90d8164acadf", + "rev": "057bcab6a8e6a3a85e9293e150d35c63404e8fca", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index d306ae09..ffa96a97 100644 --- a/flake.nix +++ b/flake.nix @@ -2,7 +2,7 @@ description = "headless browser designed for AI and automation"; inputs = { - nixpkgs.url = "github:nixos/nixpkgs/release-25.05"; + nixpkgs.url = "github:nixos/nixpkgs/nixos-25.11"; zigPkgs.url = "github:mitchellh/zig-overlay"; zigPkgs.inputs.nixpkgs.follows = "nixpkgs"; diff --git a/src/Config.zig b/src/Config.zig index 2e5bcdb3..a4e9afff 100644 --- a/src/Config.zig +++ b/src/Config.zig @@ -143,6 +143,24 @@ fn waitScriptFileValidator(allocator: Allocator, args: *std.process.ArgIterator) }; } +fn injectScriptFileValidator( + allocator: Allocator, + args: *std.process.ArgIterator, + list: *std.ArrayList([]const u8), +) !void { + const path = args.next() orelse { + log.fatal(.app, "missing argument value", .{ .arg = "--inject-script-file" }); + return error.InvalidArgument; + }; + + const bytes = std.fs.cwd().readFileAllocOptions(allocator, path, std.math.maxInt(usize), null, .of(u8), null) catch |err| { + log.fatal(.app, "failed to read file", .{ .arg = "--inject-script-file", .path = path, .err = err }); + return error.InvalidArgument; + }; + + return list.append(allocator, bytes); +} + /// Definition for all the commands 
and its arguments. See @cli.zig for further. const Commands = cli.Builder(.{ .{ @@ -176,6 +194,14 @@ const Commands = cli.Builder(.{ }, }, .{ .name = "wait_selector", .type = ?[:0]const u8 }, + .{ + .name = "inject_script", + .type = []const u8, + .multiple = true, + .variants = .{ + .{ .name = "inject_script_file", .validator = injectScriptFileValidator }, + }, + }, .{ .name = "terminate_ms", .type = ?u32 }, }, .shared_options = CommonOptions, @@ -688,6 +714,15 @@ pub fn printUsageAndExit(self: *const Config, success: bool) void { \\--wait-script-file \\ Like --wait-script, but reads the script from a file. \\ + \\--inject-script JavaScript to execute as the document's is + \\ parsed, before any other scripts in the page run. + \\ Can be passed multiple times; scripts run in order. + \\ + \\--inject-script-file + \\ Like --inject-script, but reads the script from a file. + \\ Can be passed multiple times; can be mixed with + \\ --inject-script and runs in CLI order. + \\ \\--terminate-ms Hard deadline in milliseconds. After this time elapses, \\ JavaScript execution is forcibly terminated (e.g. for \\ pages with endless scripts). 
Unlike --wait-ms, which diff --git a/src/browser/Frame.zig b/src/browser/Frame.zig index 29c70aaf..78d64a83 100644 --- a/src/browser/Frame.zig +++ b/src/browser/Frame.zig @@ -1815,26 +1815,12 @@ pub fn notifyNetworkAlmostIdle(self: *Frame) void { }); } -// called from the parser -pub fn appendNew(self: *Frame, parent: *Node, child: Node.NodeOrText) !void { - const node = switch (child) { - .node => |n| n, - .text => |txt| blk: { - // If we're appending this adjacently to a text node, we should merge - if (parent.lastChild()) |sibling| { - if (sibling.is(CData.Text)) |tn| { - const cdata = tn._proto; - const existing = cdata.getData().str(); - cdata._data = try String.concat(self.arena, &.{ existing, txt }); - return; - } - } - break :blk try self.createTextNode(txt); - }, - }; - - lp.assert(node._parent == null, "Frame.appendNew", .{}); - try self._insertNodeRelative(true, parent, node, .append, .{ +// called from the parser. Text-node merging is the parser's responsibility +// (see Parser.appendTextChunk in src/browser/parser/Parser.zig); this is the +// "insert this fully-formed node as a new last child of parent" entry point. +pub fn appendNew(self: *Frame, parent: *Node, child: *Node) !void { + lp.assert(child._parent == null, "Frame.appendNew", .{}); + try self._insertNodeRelative(true, parent, child, .append, .{ // this opts has no meaning since we're passing `true` as the first // parameter, which indicates this comes from the parser, and has its // own special processing. Still, set it to be clear. @@ -2139,12 +2125,35 @@ pub fn createElementNS(self: *Frame, namespace: Element.Namespace, name: []const attribute_iterator, .{ ._proto = undefined }, ), - asUint("head") => return self.createHtmlElementT( - Element.Html.Head, - namespace, - attribute_iterator, - .{ ._proto = undefined }, - ), + asUint("head") => { + // Inject user-provided scripts. 
+ const inject_scripts = self._session.inject_scripts; + const should_inject_scripts = from_parser and self._parse_mode == .document and inject_scripts.len > 0; + + if (should_inject_scripts) { + var ls: JS.Local.Scope = undefined; + self.js.localScope(&ls); + defer ls.deinit(); + + for (inject_scripts) |inject_script| { + var try_catch: JS.TryCatch = undefined; + try_catch.init(&ls.local); + defer try_catch.deinit(); + + ls.local.eval(inject_script, "inject_script") catch |err| { + const caught = try_catch.caughtOrError(self.call_arena, err); + log.err(.app, "inject script error", .{ .err = caught }); + }; + } + } + + return self.createHtmlElementT( + Element.Html.Head, + namespace, + attribute_iterator, + .{ ._proto = undefined }, + ); + }, asUint("body") => return self.createHtmlElementT( Element.Html.Body, namespace, @@ -4121,6 +4130,12 @@ test "WebApi: Integration" { try testing.htmlRunner("integration", .{}); } +test "WebApi: inject_script" { + try testing.htmlRunner("inject_script.html", .{ + .inject_script = "window.__injected = true; window.__injectValue = 42;", + }); +} + test "Page: isSameOrigin" { var arena = std.heap.ArenaAllocator.init(testing.allocator); defer arena.deinit(); diff --git a/src/browser/Session.zig b/src/browser/Session.zig index faf86ad2..2d27a396 100644 --- a/src/browser/Session.zig +++ b/src/browser/Session.zig @@ -57,6 +57,8 @@ navigation: Navigation, storage_shed: storage.Shed, notification: *Notification, cookie_jar: storage.Cookie.Jar, +/// User-provided scripts to inject into header. +inject_scripts: []const []const u8 = &.{}, // Shared allocator. Used by Session itself and borrowed by Pages. 
arena_pool: *ArenaPool, diff --git a/src/browser/js/bridge.zig b/src/browser/js/bridge.zig index 366f83af..9761540b 100644 --- a/src/browser/js/bridge.zig +++ b/src/browser/js/bridge.zig @@ -935,6 +935,9 @@ pub const PageJsApis = flattenTypes(&.{ @import("../webapi/CryptoKey.zig"), @import("../webapi/Selection.zig"), @import("../webapi/ImageData.zig"), + @import("../webapi/XPathResult.zig"), + @import("../webapi/XPathExpression.zig"), + @import("../webapi/XPathEvaluator.zig"), }); // APIs available on Worker context globals (constructors like URL, Headers, etc.) diff --git a/src/browser/parser/Parser.zig b/src/browser/parser/Parser.zig index 44756aed..fe6e1219 100644 --- a/src/browser/parser/Parser.zig +++ b/src/browser/parser/Parser.zig @@ -23,6 +23,7 @@ const h5e = @import("html5ever.zig"); const Frame = @import("../Frame.zig"); const Node = @import("../webapi/Node.zig"); const Element = @import("../webapi/Element.zig"); +const CData = @import("../webapi/CData.zig"); pub const AttributeIterator = h5e.AttributeIterator; @@ -39,6 +40,18 @@ pub const ParsedNode = struct { data: ?*anyopaque, }; +// html5ever's tokenizer flushes the script-data character buffer on every '<' +// (script-data-less-than-sign-state transition), which produces a separate +// AppendText callback per chunk. Merging via String.concat in the previous +// implementation was O(N^2/chunk_size) on the page-lifetime arena, blowing +// memory on inline JS that contains embedded HTML strings (issue #2397). +// Instead, we keep a single Parser-level buf and accumulate same-parent +// chunks into it, committing once on flush. +const PendingText = struct { + parent: *Node, + text_node: *CData, +}; + const Parser = @This(); frame: *Frame, @@ -46,6 +59,17 @@ err: ?Error, container: ParsedNode, arena: Allocator, strings: std.StringHashMapUnmanaged(void), +pending_text: ?PendingText, +// One buffer reused across every text run in this parser. 
clearRetainingCapacity +// on flush keeps the largest capacity ever needed, so total dead memory on the +// parser arena is bounded to one peak-run-sized allocation regardless of how +// many text runs the parse contains. Matters for Streaming, whose arena is the +// page-lifetime frame.arena (individual frees are no-ops there). +// +// Single-chunk text runs leave this buf empty: the chunk lives only in +// CData._data via createTextNode. The buf is seeded from _data.str() on the +// second chunk of a run, so the common case stays at one copy. +buf: std.ArrayList(u8), pub fn init(arena: Allocator, node: *Node, frame: *Frame) Parser { return .{ @@ -57,6 +81,62 @@ pub fn init(arena: Allocator, node: *Node, frame: *Frame) Parser { .data = null, .node = node, }, + .pending_text = null, + .buf = .empty, + }; +} + +pub fn flushPendingText(self: *Parser) !void { + const pt = self.pending_text orelse return; + self.pending_text = null; + // Single-chunk run: data already lives on _data via createTextNode. + if (self.buf.items.len == 0) return; + defer self.buf.clearRetainingCapacity(); + pt.text_node._data = try lp.String.init( + self.frame.arena, + self.buf.items, + .{ .dupe = true }, + ); +} + +fn appendTextChunk(self: *Parser, parent: *Node, txt: []const u8) !void { + if (self.pending_text) |pt| { + if (pt.parent == parent and parent.lastChild() == pt.text_node.asNode()) { + // Second+ chunk of the same run. If buf is still empty, promote + // from the single-chunk fast path by seeding from _data first. + if (self.buf.items.len == 0) { + const existing = pt.text_node.getData().str(); + try self.buf.ensureTotalCapacity(self.arena, existing.len + txt.len); + self.buf.appendSliceAssumeCapacity(existing); + } + try self.buf.appendSlice(self.arena, txt); + return; + } + try self.flushPendingText(); + } + + if (parent.lastChild()) |sibling| { + if (sibling.is(CData.Text)) |tn| { + // Existing text sibling without a matching pending_text. 
Seed the + // buf from its _data and register pending so subsequent chunks + // accumulate cheaply. + const cdata = tn._proto; + const existing = cdata.getData().str(); + try self.buf.ensureTotalCapacity(self.arena, existing.len + txt.len); + self.buf.appendSliceAssumeCapacity(existing); + self.buf.appendSliceAssumeCapacity(txt); + self.pending_text = .{ .parent = parent, .text_node = cdata }; + return; + } + } + + // Fresh text run: the first chunk lives on _data only. buf stays empty + // until (and unless) a second chunk arrives. + const new_text = try self.frame.createTextNode(txt); + try self.frame.appendNew(parent, new_text); + self.pending_text = .{ + .parent = parent, + .text_node = new_text.is(CData.Text).?._proto, }; } @@ -101,6 +181,9 @@ pub fn parse(self: *Parser, html: []const u8) void { appendBeforeSiblingCallback, appendBasedOnParentNodeCallback, ); + self.flushPendingText() catch |err| { + if (self.err == null) self.err = .{ .err = err, .source = .append }; + }; } /// Parse HTML with encoding conversion. Converts from charset to UTF-8 before parsing. 
@@ -127,6 +210,9 @@ pub fn parseWithEncoding(self: *Parser, html: []const u8, charset: []const u8) v appendBeforeSiblingCallback, appendBasedOnParentNodeCallback, ); + self.flushPendingText() catch |err| { + if (self.err == null) self.err = .{ .err = err, .source = .append }; + }; } pub fn parseXML(self: *Parser, xml: []const u8) void { @@ -150,6 +236,9 @@ pub fn parseXML(self: *Parser, xml: []const u8) void { appendBeforeSiblingCallback, appendBasedOnParentNodeCallback, ); + self.flushPendingText() catch |err| { + if (self.err == null) self.err = .{ .err = err, .source = .append }; + }; } pub fn parseFragment(self: *Parser, html: []const u8) void { @@ -173,6 +262,9 @@ pub fn parseFragment(self: *Parser, html: []const u8) void { appendBeforeSiblingCallback, appendBasedOnParentNodeCallback, ); + self.flushPendingText() catch |err| { + if (self.err == null) self.err = .{ .err = err, .source = .append }; + }; } pub const Streaming = struct { @@ -233,8 +325,16 @@ pub const Streaming = struct { } } - pub fn done(self: *Streaming) void { - h5e.html5ever_streaming_parser_finish(self.handle.?); + pub fn done(self: *Streaming) !void { + // Null the handle before finish() so a flushPendingText failure can't + // leave a finished-but-still-referenced handle behind for deinit to + // double-free. flushPendingText doesn't touch the html5ever handle — + // it only reads pending_text and writes to a text node's _data — so + // running it after finish is safe. + const handle = self.handle.?; + self.handle = null; + h5e.html5ever_streaming_parser_finish(handle); + try self.parser.flushPendingText(); } }; @@ -252,6 +352,9 @@ fn popCallback(ctx: *anyopaque, node_ref: *anyopaque) callconv(.c) void { } fn _popCallback(self: *Parser, node: *Node) !void { + // Flush before any nodeComplete so Build.complete (and any custom-element + // callbacks reachable from it) observe the final text data. 
+ try self.flushPendingText(); try self.frame.nodeComplete(node); } @@ -340,7 +443,7 @@ fn _appendDoctypeToDocument(self: *Parser, name: []const u8, public_id: []const }); // Append it to the document - try frame.appendNew(self.container.node, .{ .node = doctype.asNode() }); + try frame.appendNew(self.container.node, doctype.asNode()); } fn addAttrsIfMissingCallback(ctx: *anyopaque, target_ref: *anyopaque, attributes: h5e.AttributeIterator) callconv(.c) void { @@ -402,6 +505,10 @@ fn _appendCallback(self: *Parser, parent: *Node, node_or_text: h5e.NodeOrText) ! // child node is guaranteed not to belong to another parent switch (node_or_text.toUnion()) { .node => |cpn| { + // Inserting a non-text child terminates any pending text run; flush + // before the insertion so that connectedCallback (etc.) sees the + // final data on the preceding text sibling. + try self.flushPendingText(); const child = getNode(cpn); if (child._parent) |previous_parent| { // html5ever says this can't happen, but we might be screwing up @@ -414,9 +521,9 @@ fn _appendCallback(self: *Parser, parent: *Node, node_or_text: h5e.NodeOrText) ! } self.frame.removeNode(previous_parent, child, .{ .will_be_reconnected = parent.isConnected() }); } - try self.frame.appendNew(parent, .{ .node = child }); + try self.frame.appendNew(parent, child); }, - .text => |txt| try self.frame.appendNew(parent, .{ .text = txt }), + .text => |txt| try self.appendTextChunk(parent, txt), } } @@ -427,6 +534,11 @@ fn removeFromParentCallback(ctx: *anyopaque, target_ref: *anyopaque) callconv(.c }; } fn _removeFromParentCallback(self: *Parser, node: *Node) !void { + // Removing a node mid-parse can detach the pending text node or its + // parent; either way the pending invariant breaks. Flush first so the + // accumulated bytes land on a still-attached text node (and pending_text + // is cleared before any subsequent chunk targets a fresh node). 
+ try self.flushPendingText(); const parent = node.parentNode() orelse return; _ = try parent.removeChild(node, self.frame); } @@ -438,6 +550,10 @@ fn reparentChildrenCallback(ctx: *anyopaque, node_ref: *anyopaque, new_parent_re }; } fn _reparentChildrenCallback(self: *Parser, node: *Node, new_parent: *Node) !void { + // Reparenting can move the pending text node out from under us — the + // node's _parent changes but pending_text.parent does not. Flush so the + // accumulator commits before the tree is rearranged. + try self.flushPendingText(); try self.frame.appendAllChildren(node, new_parent); } @@ -448,6 +564,10 @@ fn appendBeforeSiblingCallback(ctx: *anyopaque, sibling_ref: *anyopaque, node_or }; } fn _appendBeforeSiblingCallback(self: *Parser, sibling: *Node, node_or_text: h5e.NodeOrText) !void { + // Foster parenting / before-sibling insertions interrupt any pending text + // run (the new node lands at a different position from the pending text's + // tail). Flush before reading the parent's structure. + try self.flushPendingText(); const parent = sibling.parentNode() orelse return error.NoParent; const node: *Node = switch (node_or_text.toUnion()) { .node => |cpn| blk: { diff --git a/src/browser/tests/cdata/raw_text_chunked.html b/src/browser/tests/cdata/raw_text_chunked.html new file mode 100644 index 00000000..8a5503a4 --- /dev/null +++ b/src/browser/tests/cdata/raw_text_chunked.html @@ -0,0 +1,128 @@ + + + + + + + + + + + + + +A page <with> many <tags> in <the> title for testing + + diff --git a/src/browser/tests/cdp/perform_search_xpath.html b/src/browser/tests/cdp/perform_search_xpath.html new file mode 100644 index 00000000..e30ca1c1 --- /dev/null +++ b/src/browser/tests/cdp/perform_search_xpath.html @@ -0,0 +1,8 @@ + + +
+

1

+

2

+
+

3

+ diff --git a/src/browser/tests/document/document.html b/src/browser/tests/document/document.html index ede2b507..eb69f5d8 100644 --- a/src/browser/tests/document/document.html +++ b/src/browser/tests/document/document.html @@ -380,6 +380,53 @@ testing.expectEqual(0, nd.childElementCount); + + diff --git a/src/browser/tests/element/html/link.html b/src/browser/tests/element/html/link.html index 57d9e28b..9f4dd6a8 100644 --- a/src/browser/tests/element/html/link.html +++ b/src/browser/tests/element/html/link.html @@ -1,6 +1,32 @@ + + + + + @@ -21,9 +21,9 @@ const container = $('#container'); // Invalid nth patterns - testing.expectError("Error: InvalidNthPattern", () => container.querySelector(':nth-child(foo)')); - testing.expectError("Error: InvalidNthPattern", () => container.querySelector(':nth-child(-)')); - testing.expectError("Error: InvalidNthPattern", () => container.querySelector(':nth-child(+)')); + testing.expectError("SyntaxError", () => container.querySelector(':nth-child(foo)')); + testing.expectError("SyntaxError", () => container.querySelector(':nth-child(-)')); + testing.expectError("SyntaxError", () => container.querySelector(':nth-child(+)')); } @@ -32,9 +32,9 @@ const container = $('#container'); // Unknown pseudo-classes - testing.expectError("Error: UnknownPseudoClass", () => container.querySelector(':unknown')); - testing.expectError("Error: UnknownPseudoClass", () => container.querySelector(':not-a-real-pseudo')); - testing.expectError("Error: UnknownPseudoClass", () => container.querySelector(':fake(test)')); + testing.expectError("SyntaxError", () => container.querySelector(':unknown')); + testing.expectError("SyntaxError", () => container.querySelector(':not-a-real-pseudo')); + testing.expectError("SyntaxError", () => container.querySelector(':fake(test)')); } @@ -53,8 +53,8 @@ const container = $('#container'); // Combinators with nothing after - testing.expectError("Error: InvalidSelector", () => container.querySelector('p >')); - 
testing.expectError("Error: InvalidSelector", () => container.querySelector('p +')); - testing.expectError("Error: InvalidSelector", () => container.querySelector('p ~')); + testing.expectError("SyntaxError", () => container.querySelector('p >')); + testing.expectError("SyntaxError", () => container.querySelector('p +')); + testing.expectError("SyntaxError", () => container.querySelector('p ~')); } diff --git a/src/browser/tests/inject_script.html b/src/browser/tests/inject_script.html new file mode 100644 index 00000000..f7917a54 --- /dev/null +++ b/src/browser/tests/inject_script.html @@ -0,0 +1,12 @@ + + + + + + + + + diff --git a/src/browser/tests/page/meta.html b/src/browser/tests/page/meta.html index 3c03f403..98fb1688 100644 --- a/src/browser/tests/page/meta.html +++ b/src/browser/tests/page/meta.html @@ -30,7 +30,8 @@ testing.expectEqual('undefined', typeof plainDoc.scripts); testing.expectEqual('undefined', typeof plainDoc.links); testing.expectEqual('undefined', typeof plainDoc.forms); - testing.expectEqual('undefined', typeof plainDoc.location); + // location lives on Document (returns null for non-HTMLDocument). + testing.expectEqual(null, plainDoc.location); // Both should have common Document properties testing.expectEqual('string', typeof document.URL); diff --git a/src/browser/tests/xpath/document_evaluate.html b/src/browser/tests/xpath/document_evaluate.html new file mode 100644 index 00000000..2c4fdc58 --- /dev/null +++ b/src/browser/tests/xpath/document_evaluate.html @@ -0,0 +1,123 @@ + + + +

Hello

+
+

First

+

Second

+

Third

+
+ x + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/browser/tests/xpath/xpath_conformance.html b/src/browser/tests/xpath/xpath_conformance.html new file mode 100644 index 00000000..7080ecb6 --- /dev/null +++ b/src/browser/tests/xpath/xpath_conformance.html @@ -0,0 +1,201 @@ + + + + XPath conformance + + + +

Hello World

+

First paragraph with emphasis.

+

Second paragraph.

+ + + + + + + + +
NameAge
Alice30
Bob25
Carol40
+
+
AB
+
Click me
+
Other link
+
+
+ + + + + + +
+ +
+
+

One

+

Two

+

Three

+
+ + + + diff --git a/src/browser/tests/xpath/xpath_evaluator.html b/src/browser/tests/xpath/xpath_evaluator.html new file mode 100644 index 00000000..6cb6a886 --- /dev/null +++ b/src/browser/tests/xpath/xpath_evaluator.html @@ -0,0 +1,103 @@ + + + +

Hello

+

One

+

Two

+ + + + + + + + + + + + + + + + + + + diff --git a/src/browser/tests/xpath/xpath_perf.html b/src/browser/tests/xpath/xpath_perf.html new file mode 100644 index 00000000..21cac3fc --- /dev/null +++ b/src/browser/tests/xpath/xpath_perf.html @@ -0,0 +1,171 @@ + + + + XPath perf benchmark + + + + + + + + + + diff --git a/src/browser/tests/xpath/xpath_result.html b/src/browser/tests/xpath/xpath_result.html new file mode 100644 index 00000000..f7674e7b --- /dev/null +++ b/src/browser/tests/xpath/xpath_result.html @@ -0,0 +1,193 @@ + + + +

Hello

+

One

+

Two

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/browser/webapi/DOMImplementation.zig b/src/browser/webapi/DOMImplementation.zig index 777a9571..280db6a8 100644 --- a/src/browser/webapi/DOMImplementation.zig +++ b/src/browser/webapi/DOMImplementation.zig @@ -78,7 +78,7 @@ pub fn createDocument(_: *const DOMImplementation, namespace_: ?[]const u8, qual // Create and append root element if qualified_name provided if (qualified_name) |qname| { if (qname.len > 0) { - const namespace = if (namespace_) |ns| Node.Element.Namespace.parse(ns) else .xml; + const namespace = Node.Element.Namespace.parse(namespace_); const root = try frame.createElementNS(namespace, qname, null); _ = try document.asNode().appendChild(root, frame); } diff --git a/src/browser/webapi/Document.zig b/src/browser/webapi/Document.zig index 411eb678..ca9dc7a7 100644 --- a/src/browser/webapi/Document.zig +++ b/src/browser/webapi/Document.zig @@ -35,6 +35,8 @@ const DOMImplementation = @import("DOMImplementation.zig"); const StyleSheetList = @import("css/StyleSheetList.zig"); const FontFaceSet = @import("css/FontFaceSet.zig"); const Selection = @import("Selection.zig"); +const XPathResult = @import("XPathResult.zig"); +const XPathExpression = @import("XPathExpression.zig"); pub const XMLDocument = @import("XMLDocument.zig"); pub const HTMLDocument = @import("HTMLDocument.zig"); @@ -119,7 +121,18 @@ pub fn asEventTarget(self: *Document) *@import("EventTarget.zig") { } pub fn getURL(self: *const Document, frame: *const Frame) [:0]const u8 { - return self._url orelse frame.url; + return self._url orelse (self._frame orelse frame).url; +} + +pub fn getLocation(self: *const Document) ?*Location { + if (self._type != .html) return null; + const doc_frame = self._frame orelse return null; + return doc_frame.window._location; +} + +pub fn setLocation(self: *Document, url: [:0]const u8, frame: *Frame) !void { + if (self._type != .html) return; + return frame.scheduleNavigation(url, 
.{ .reason = .script, .kind = .{ .push = null } }, .{ .script = self._frame }); } pub fn getContentType(self: *const Document) []const u8 { @@ -277,11 +290,11 @@ pub fn getSelection(self: *Document) *Selection { } pub fn querySelector(self: *Document, input: String, frame: *Frame) !?*Element { - return Selector.querySelector(self.asNode(), input.str(), frame); + return Selector.querySelector(self.asNode(), input.str(), frame) catch |err| Selector.mapErrorToDOM(err); } pub fn querySelectorAll(self: *Document, input: String, frame: *Frame) !*Selector.List { - return Selector.querySelectorAll(self.asNode(), input.str(), frame); + return Selector.querySelectorAll(self.asNode(), input.str(), frame) catch |err| Selector.mapErrorToDOM(err); } pub fn getImplementation(self: *Document, frame: *Frame) !*DOMImplementation { @@ -412,6 +425,44 @@ pub fn createNodeIterator(_: *const Document, root: *Node, what_to_show: ?js.Val return DOMNodeIterator.init(root, try whatToShow(what_to_show), filter, frame); } +pub fn evaluate( + self: *Document, + expression: []const u8, + context_node: ?*Node, + resolver: ?js.Function, + result_type: ?u16, + result: ?*XPathResult, + frame: *Frame, +) !*XPathResult { + // resolver/result are no-ops in HTML mode (decision #2). + // Null/missing context_node falls back to the document — matches the + // polyfill (decision #2). Firefox throws TypeError on a *missing* + // arg, but the bridge can't distinguish "missing" from "explicit + // null" here, so polyfill parity wins for the ambiguity. 
+ _ = resolver; + _ = result; + return XPathResult.fromExpression( + expression, + context_node orelse self.asNode(), + result_type orelse XPathResult.ANY_TYPE, + frame, + ); +} + +pub fn createExpression( + _: *const Document, + expression: []const u8, + resolver: ?js.Function, + frame: *Frame, +) !*XPathExpression { + _ = resolver; + return XPathExpression.init(expression, frame); +} + +pub fn createNSResolver(_: *const Document, node: *Node) ?*Node { + return node; +} + fn whatToShow(value_: ?js.Value) !u32 { const value = value_ orelse return 4294967295; // show all when undefined if (value.isUndefined()) { @@ -465,15 +516,21 @@ pub fn getFonts(self: *Document, frame: *Frame) !*FontFaceSet { return fonts; } -pub fn adoptNode(_: *const Document, node: *Node, frame: *Frame) !*Node { +pub fn adoptNode(self: *Document, node: *Node, frame: *Frame) !*Node { if (node._type == .document) { return error.NotSupported; } + const old_owner = node.ownerDocument(frame) orelse frame.document; + if (node._parent) |parent| { frame.removeNode(parent, node, .{ .will_be_reconnected = false }); } + if (old_owner != self) { + try frame.adoptNodeTree(node, old_owner, self); + } + return node; } @@ -666,7 +723,13 @@ fn writeInternal(self: *Document, text: []const []const u8, append_newline: bool if (self._script_created_parser) |*parser| { parser.read(html) catch |err| { log.warn(.dom, "document.write parser error", .{ .err = err }); - // was already closed + // html5ever's handle was destroyed inside read(), but the + // pending text buffer (if any) still wants to land on its + // text node's _data — flushPendingText doesn't depend on + // the handle, so attempt a final flush before dropping. 
+ parser.parser.flushPendingText() catch |flush_err| { + log.warn(.dom, "flush after parser panic", .{ .err = flush_err }); + }; self._script_created_parser = null; }; } @@ -795,12 +858,12 @@ pub fn close(self: *Document, frame: *Frame) !void { return; } - // done() calls html5ever_streaming_parser_finish which frees the parser - // We must NOT call deinit() after done() as that would be a double-free - self._script_created_parser.?.done(); - // Just null out the handle since done() already freed it - self._script_created_parser.?.handle = null; - self._script_created_parser = null; + // done() finishes html5ever's handle and runs the final flushPendingText. + // Even if flushPendingText errors, the handle is already finished and we + // must not retain the Streaming — defer so the error path also drops it. + // (Streaming.done nulls its own handle, so dropping the struct is safe.) + defer self._script_created_parser = null; + try self._script_created_parser.?.done(); frame.documentIsComplete(); } @@ -1027,6 +1090,7 @@ pub const JsApi = struct { pub const onselectionchange = bridge.accessor(Document.getOnSelectionChange, Document.setOnSelectionChange, .{}); pub const URL = bridge.accessor(Document.getURL, null, .{}); + pub const location = bridge.accessor(Document.getLocation, Document.setLocation, .{}); pub const documentURI = bridge.accessor(Document.getURL, null, .{}); pub const documentElement = bridge.accessor(Document.getDocumentElement, null, .{}); pub const scrollingElement = bridge.accessor(Document.getDocumentElement, null, .{}); @@ -1051,6 +1115,9 @@ pub const JsApi = struct { pub const createEvent = bridge.function(Document.createEvent, .{ .dom_exception = true }); pub const createTreeWalker = bridge.function(Document.createTreeWalker, .{}); pub const createNodeIterator = bridge.function(Document.createNodeIterator, .{}); + pub const evaluate = bridge.function(Document.evaluate, .{ .dom_exception = true }); + pub const createExpression = 
bridge.function(Document.createExpression, .{ .dom_exception = true }); + pub const createNSResolver = bridge.function(Document.createNSResolver, .{}); pub const getElementById = bridge.function(_getElementById, .{}); fn _getElementById(self: *Document, value_: ?js.Value, frame: *Frame) !?*Element { const value = value_ orelse return null; @@ -1111,3 +1178,7 @@ const testing = @import("../../testing.zig"); test "WebApi: Document" { try testing.htmlRunner("document", .{}); } + +test "WebApi: Document.evaluate" { + try testing.htmlRunner("xpath/document_evaluate.html", .{}); +} diff --git a/src/browser/webapi/DocumentFragment.zig b/src/browser/webapi/DocumentFragment.zig index 186bc68a..b55050f2 100644 --- a/src/browser/webapi/DocumentFragment.zig +++ b/src/browser/webapi/DocumentFragment.zig @@ -84,11 +84,11 @@ pub fn getElementById(self: *DocumentFragment, id: []const u8) ?*Element { } pub fn querySelector(self: *DocumentFragment, selector: []const u8, frame: *Frame) !?*Element { - return Selector.querySelector(self.asNode(), selector, frame); + return Selector.querySelector(self.asNode(), selector, frame) catch |err| Selector.mapErrorToDOM(err); } pub fn querySelectorAll(self: *DocumentFragment, input: []const u8, frame: *Frame) !*Selector.List { - return Selector.querySelectorAll(self.asNode(), input, frame); + return Selector.querySelectorAll(self.asNode(), input, frame) catch |err| Selector.mapErrorToDOM(err); } pub fn getChildren(self: *DocumentFragment, frame: *Frame) !collections.NodeLive(.child_elements) { diff --git a/src/browser/webapi/Element.zig b/src/browser/webapi/Element.zig index 4de1a732..058875a6 100644 --- a/src/browser/webapi/Element.zig +++ b/src/browser/webapi/Element.zig @@ -1071,15 +1071,15 @@ pub fn getChildElementCount(self: *Element) usize { } pub fn matches(self: *Element, selector: []const u8, frame: *Frame) !bool { - return Selector.matches(self, selector, frame); + return Selector.matches(self, selector, frame) catch |err| 
Selector.mapErrorToDOM(err); } pub fn querySelector(self: *Element, selector: []const u8, frame: *Frame) !?*Element { - return Selector.querySelector(self.asNode(), selector, frame); + return Selector.querySelector(self.asNode(), selector, frame) catch |err| Selector.mapErrorToDOM(err); } pub fn querySelectorAll(self: *Element, input: []const u8, frame: *Frame) !*Selector.List { - return Selector.querySelectorAll(self.asNode(), input, frame); + return Selector.querySelectorAll(self.asNode(), input, frame) catch |err| Selector.mapErrorToDOM(err); } pub fn getAnimations(_: *const Element) []*Animation { diff --git a/src/browser/webapi/HTMLDocument.zig b/src/browser/webapi/HTMLDocument.zig index 41782cc8..19e462a1 100644 --- a/src/browser/webapi/HTMLDocument.zig +++ b/src/browser/webapi/HTMLDocument.zig @@ -196,15 +196,6 @@ pub fn getCurrentScript(self: *const HTMLDocument) ?*Element.Html.Script { return self._proto._current_script; } -pub fn getLocation(self: *const HTMLDocument) ?*@import("Location.zig") { - const frame = self._proto._frame orelse return null; - return frame.window._location; -} - -pub fn setLocation(self: *HTMLDocument, url: [:0]const u8, frame: *Frame) !void { - return frame.scheduleNavigation(url, .{ .reason = .script, .kind = .{ .push = null } }, .{ .script = self._proto._frame }); -} - pub fn getDir(self: *HTMLDocument) []const u8 { const el = self._proto.getDocumentElement() orelse return ""; const html = el.is(Element.Html) orelse return ""; @@ -311,7 +302,6 @@ pub const JsApi = struct { pub const applets = bridge.accessor(HTMLDocument.getApplets, null, .{}); pub const plugins = bridge.accessor(HTMLDocument.getEmbeds, null, .{}); pub const currentScript = bridge.accessor(HTMLDocument.getCurrentScript, null, .{}); - pub const location = bridge.accessor(HTMLDocument.getLocation, HTMLDocument.setLocation, .{}); pub const all = bridge.accessor(HTMLDocument.getAll, null, .{}); pub const cookie = bridge.accessor(HTMLDocument.getCookie, 
HTMLDocument.setCookie, .{}); pub const doctype = bridge.accessor(HTMLDocument.getDocType, null, .{}); diff --git a/src/browser/webapi/Node.zig b/src/browser/webapi/Node.zig index 7df1fd6a..c26411ed 100644 --- a/src/browser/webapi/Node.zig +++ b/src/browser/webapi/Node.zig @@ -166,7 +166,7 @@ pub fn findAdjacentNodes(self: *Node, position: []const u8) !struct { *Node, ?*N // Returned if: // * position is not one of the four listed values. // * The input is XML that is not well-formed. - return error.Syntax; + return error.SyntaxError; } pub fn firstChild(self: *const Node) ?*Node { diff --git a/src/browser/webapi/Performance.zig b/src/browser/webapi/Performance.zig index 22d37dd0..516d02cb 100644 --- a/src/browser/webapi/Performance.zig +++ b/src/browser/webapi/Performance.zig @@ -184,6 +184,11 @@ pub fn clearMeasures(self: *Performance, measure_name: ?[]const u8) void { } } +pub fn setResourceTimingBufferSize(self: *Performance, max_size: u32) void { + _ = self; + _ = max_size; +} + pub fn getEntries(self: *const Performance) []*Entry { return self._entries.items; } @@ -281,6 +286,7 @@ pub const JsApi = struct { pub const measure = bridge.function(Performance.measure, .{ .dom_exception = true }); pub const clearMarks = bridge.function(Performance.clearMarks, .{}); pub const clearMeasures = bridge.function(Performance.clearMeasures, .{}); + pub const setResourceTimingBufferSize = bridge.function(Performance.setResourceTimingBufferSize, .{ .noop = true }); pub const getEntries = bridge.function(Performance.getEntries, .{}); pub const getEntriesByType = bridge.function(Performance.getEntriesByType, .{}); pub const getEntriesByName = bridge.function(Performance.getEntriesByName, .{}); diff --git a/src/browser/webapi/Worker.zig b/src/browser/webapi/Worker.zig index 67d24526..5d571fa6 100644 --- a/src/browser/webapi/Worker.zig +++ b/src/browser/webapi/Worker.zig @@ -416,5 +416,8 @@ pub const JsApi = struct { const testing = @import("../../testing.zig"); test "WebApi: 
Worker" { - try testing.htmlRunner("worker", .{}); + // Worker tests chain a worker-script fetch with a dynamic-import fetch + // and a cross-context postMessage. The default 2 s assertion budget can + // blow up on TSAN CI; give it more room. + try testing.htmlRunner("worker", .{ .timeout_ms = 8000 }); } diff --git a/src/browser/webapi/XPathEvaluator.zig b/src/browser/webapi/XPathEvaluator.zig new file mode 100644 index 00000000..7cae18b9 --- /dev/null +++ b/src/browser/webapi/XPathEvaluator.zig @@ -0,0 +1,97 @@ +// Copyright (C) 2023-2026 Lightpanda (Selecy SAS) +// +// Francis Bouvier +// Pierre Tachoire +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see <https://www.gnu.org/licenses/>. + +//! WHATWG `XPathEvaluator` — a stateless factory for XPath evaluation. +//! Mirrors `Document.evaluate` / `Document.createExpression` / +//! `Document.createNSResolver` so an explicit +//! `new XPathEvaluator()` instance can be used in place of the +//! document. + +const std = @import("std"); + +const js = @import("../js/js.zig"); +const Frame = @import("../Frame.zig"); + +const Node = @import("Node.zig"); +const XPathResult = @import("XPathResult.zig"); +const XPathExpression = @import("XPathExpression.zig"); + +const XPathEvaluator = @This(); + +// Padding to avoid zero-size struct identity_map collisions (matches +// the convention in ResizeObserver.zig). 
+_pad: bool = false, + +pub fn init() XPathEvaluator { + return .{}; +} + +pub fn evaluate( + _: *const XPathEvaluator, + expression: []const u8, + context_node: *Node, + resolver: ?js.Function, + requested_type: ?u16, + result: ?*XPathResult, + frame: *Frame, +) !*XPathResult { + // Namespace resolver is accepted-and-ignored (HTML mode — decision #2). + // Result reuse is also a no-op; XPathResult.fromExpression always + // allocates a fresh instance. + _ = resolver; + _ = result; + return XPathResult.fromExpression(expression, context_node, requested_type orelse XPathResult.ANY_TYPE, frame); +} + +pub fn createExpression( + _: *const XPathEvaluator, + expression: []const u8, + resolver: ?js.Function, + frame: *Frame, +) !*XPathExpression { + _ = resolver; + return XPathExpression.init(expression, frame); +} + +pub fn createNSResolver(_: *const XPathEvaluator, node: *Node) ?*Node { + // HTML-mode passthrough — the WHATWG IDL accepts a Node and returns + // an `XPathNSResolver`, but in practice the input node is reused. 
+ return node; +} + +pub const JsApi = struct { + pub const bridge = js.Bridge(XPathEvaluator); + + pub const Meta = struct { + pub const name = "XPathEvaluator"; + pub const prototype_chain = bridge.prototypeChain(); + pub var class_id: bridge.ClassId = undefined; + pub const empty_with_no_proto = true; + }; + + pub const constructor = bridge.constructor(XPathEvaluator.init, .{}); + pub const evaluate = bridge.function(XPathEvaluator.evaluate, .{ .dom_exception = true }); + pub const createExpression = bridge.function(XPathEvaluator.createExpression, .{ .dom_exception = true }); + pub const createNSResolver = bridge.function(XPathEvaluator.createNSResolver, .{}); +}; + +const testing = @import("../../testing.zig"); + +test "WebApi: XPathEvaluator + XPathExpression" { + try testing.htmlRunner("xpath/xpath_evaluator.html", .{}); +} diff --git a/src/browser/webapi/XPathExpression.zig b/src/browser/webapi/XPathExpression.zig new file mode 100644 index 00000000..d801ac5a --- /dev/null +++ b/src/browser/webapi/XPathExpression.zig @@ -0,0 +1,105 @@ +// Copyright (C) 2023-2026 Lightpanda (Selecy SAS) +// +// Francis Bouvier +// Pierre Tachoire +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see <https://www.gnu.org/licenses/>. + +//! WHATWG `XPathExpression` — a parsed XPath expression cached for +//! repeated evaluation. The parsed AST lives in this object's per- +//! 
instance arena (long-lived); each `evaluate()` call gets a fresh +//! arena for its own result data so multiple evaluations don't grow +//! the AST arena. + +const std = @import("std"); +const lp = @import("lightpanda"); + +const js = @import("../js/js.zig"); +const Page = @import("../Page.zig"); +const Frame = @import("../Frame.zig"); + +const Node = @import("Node.zig"); +const XPathResult = @import("XPathResult.zig"); + +const xpath = struct { + const Ast = @import("../xpath/ast.zig"); + const Parser = @import("../xpath/Parser.zig"); + const Evaluator = @import("../xpath/Evaluator.zig"); +}; + +const Allocator = std.mem.Allocator; + +const XPathExpression = @This(); + +_rc: lp.RC(u8) = .{}, +_arena: Allocator, +_expr: *const xpath.Ast.Expr, + +pub fn init(expression: []const u8, frame: *Frame) !*XPathExpression { + const arena = try frame.getArena(.tiny, "XPathExpression"); + errdefer frame.releaseArena(arena); + + // The AST borrows string slices from its input (literals, names, + // var refs, function names). `expression` is materialized in the JS + // call_arena and is reclaimed when the top-level call returns, so + // dupe into our long-lived arena before parsing. + const owned = try arena.dupe(u8, expression); + const expr = try xpath.Parser.parse(arena, owned); + const xe = try arena.create(XPathExpression); + xe.* = .{ ._arena = arena, ._expr = expr }; + return xe; +} + +pub fn evaluate( + self: *XPathExpression, + context_node: *Node, + requested_type: ?u16, + result: ?*XPathResult, + frame: *Frame, +) !*XPathResult { + // The `result` reuse parameter (WHATWG: optional XPathResult to + // populate) is accepted-and-ignored: we always allocate fresh, + // which matches every modern browser's effective behavior. 
+ _ = result; + + const arena = try frame.getArena(.medium, "XPathResult"); + errdefer frame.releaseArena(arena); + + const eval_result = try xpath.Evaluator.evaluate(arena, self._expr, context_node, frame); + return XPathResult.fromResult(arena, requested_type orelse XPathResult.ANY_TYPE, eval_result); +} + +pub fn deinit(self: *XPathExpression, page: *Page) void { + page.releaseArena(self._arena); +} + +pub fn acquireRef(self: *XPathExpression) void { + self._rc.acquire(); +} + +pub fn releaseRef(self: *XPathExpression, page: *Page) void { + self._rc.release(self, page); +} + +pub const JsApi = struct { + pub const bridge = js.Bridge(XPathExpression); + + pub const Meta = struct { + pub const name = "XPathExpression"; + pub const prototype_chain = bridge.prototypeChain(); + pub var class_id: bridge.ClassId = undefined; + }; + + pub const evaluate = bridge.function(XPathExpression.evaluate, .{ .dom_exception = true }); +}; diff --git a/src/browser/webapi/XPathResult.zig b/src/browser/webapi/XPathResult.zig new file mode 100644 index 00000000..c029b48e --- /dev/null +++ b/src/browser/webapi/XPathResult.zig @@ -0,0 +1,288 @@ +// Copyright (C) 2023-2026 Lightpanda (Selecy SAS) +// +// Francis Bouvier +// Pierre Tachoire +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see <https://www.gnu.org/licenses/>. + +//! 
WHATWG `XPathResult` (full surface, all 10 type constants — decision +//! #4). 
Wraps the evaluator's `result.Result` for JS consumption: +//! coerces to the requested result type at construction, exposes the +//! type-tagged accessors, and serves the iterator/snapshot APIs. +//! +//! Lifetime model: each `XPathResult` owns a per-instance arena +//! (`getArena(.medium, ...)`) that holds both the struct and the result +//! data (node-set slice, formatted strings). The arena is released in +//! `deinit` once the JS wrapper's refcount hits zero. +//! +//! Type-mismatch accessor calls return `error.InvalidStateError` — +//! translated to a `DOMException` by `bridge.function(.., .{ +//! .dom_exception = true })`. The WHATWG IDL technically specifies +//! `TypeError` for type mismatches, but `InvalidStateError` is what +//! decision #4 captures and what most legacy XPath consumers expect. + +const std = @import("std"); +const lp = @import("lightpanda"); + +const js = @import("../js/js.zig"); +const Page = @import("../Page.zig"); +const Frame = @import("../Frame.zig"); + +const Node = @import("Node.zig"); + +// XPath runtime helpers. Aliased to keep the cross-directory imports +// readable when both modules expose a `Result` type. +const xpath = struct { + const result = @import("../xpath/result.zig"); + const Parser = @import("../xpath/Parser.zig"); + const Evaluator = @import("../xpath/Evaluator.zig"); +}; + +const Allocator = std.mem.Allocator; + +const XPathResult = @This(); + +// WHATWG type constants. ANY_TYPE is a request flag — at construction +// it resolves to one of the four concrete categories (NUMBER, STRING, +// BOOLEAN, UNORDERED_NODE_ITERATOR) depending on what the expression +// produced. 
+pub const ANY_TYPE: u16 = 0; +pub const NUMBER_TYPE: u16 = 1; +pub const STRING_TYPE: u16 = 2; +pub const BOOLEAN_TYPE: u16 = 3; +pub const UNORDERED_NODE_ITERATOR_TYPE: u16 = 4; +pub const ORDERED_NODE_ITERATOR_TYPE: u16 = 5; +pub const UNORDERED_NODE_SNAPSHOT_TYPE: u16 = 6; +pub const ORDERED_NODE_SNAPSHOT_TYPE: u16 = 7; +pub const ANY_UNORDERED_NODE_TYPE: u16 = 8; +pub const FIRST_ORDERED_NODE_TYPE: u16 = 9; + +const Value = union(enum) { + number: f64, + string: []const u8, + boolean: bool, + nodes: []const *Node, +}; + +_rc: lp.RC(u8) = .{}, +_arena: Allocator, +_type: u16, +_value: Value, +_iter_pos: usize = 0, + +// ----- constructors ----- + +/// One-shot: parse + evaluate + wrap. Used by `Document.evaluate` and +/// `XPathEvaluator.evaluate`. Allocates a per-instance arena for the +/// AST + result data + the struct itself. +pub fn fromExpression( + expression: []const u8, + context_node: *Node, + requested_type: u16, + frame: *Frame, +) !*XPathResult { + const arena = try frame.getArena(.medium, "XPathResult"); + errdefer frame.releaseArena(arena); + + // The AST borrows string slices from its input (literals, names, + // var refs, function names). `expression` is materialized in the JS + // call_arena and is reclaimed when the top-level call returns, so + // dupe into our long-lived arena before parsing. + const owned = try arena.dupe(u8, expression); + const expr = try xpath.Parser.parse(arena, owned); + const result = try xpath.Evaluator.evaluate(arena, expr, context_node, frame); + return fromResult(arena, requested_type, result); +} + +/// Wrap an already-evaluated `result.Result` into an XPathResult. The +/// caller hands over ownership of `arena` — the XPathResult will release +/// it on deinit. Used by `XPathExpression.evaluate` (which has its own +/// AST cache and only allocates a fresh result arena). 
+pub fn fromResult( + arena: Allocator, + requested_type: u16, + result: xpath.result.Result, +) !*XPathResult { + const value: Value = switch (requested_type) { + ANY_TYPE => switch (result) { + .number => |n| .{ .number = n }, + .string => |s| .{ .string = s }, + .boolean => |b| .{ .boolean = b }, + .node_set => |ns| .{ .nodes = ns }, + }, + NUMBER_TYPE => .{ .number = try xpath.result.toNumber(arena, result) }, + STRING_TYPE => .{ .string = try xpath.result.toString(arena, result) }, + BOOLEAN_TYPE => .{ .boolean = xpath.result.toBoolean(result) }, + UNORDERED_NODE_ITERATOR_TYPE, + ORDERED_NODE_ITERATOR_TYPE, + UNORDERED_NODE_SNAPSHOT_TYPE, + ORDERED_NODE_SNAPSHOT_TYPE, + ANY_UNORDERED_NODE_TYPE, + FIRST_ORDERED_NODE_TYPE, + => switch (result) { + .node_set => |ns| .{ .nodes = ns }, + // Requesting a node-set type for a non-node-set expression. + // WHATWG specifies TypeError, but DOMException.fromError has + // no TypeError mapping (would surface as a plain JS Error); + // unify on InvalidStateError per the project plan. 
+ else => return error.InvalidStateError, + }, + else => return error.InvalidStateError, + }; + + const final_type: u16 = if (requested_type == ANY_TYPE) switch (value) { + .number => NUMBER_TYPE, + .string => STRING_TYPE, + .boolean => BOOLEAN_TYPE, + .nodes => UNORDERED_NODE_ITERATOR_TYPE, + } else requested_type; + + const xr = try arena.create(XPathResult); + xr.* = .{ + ._arena = arena, + ._type = final_type, + ._value = value, + }; + return xr; +} + +// ----- lifecycle ----- + +pub fn deinit(self: *XPathResult, page: *Page) void { + page.releaseArena(self._arena); +} + +pub fn acquireRef(self: *XPathResult) void { + self._rc.acquire(); +} + +pub fn releaseRef(self: *XPathResult, page: *Page) void { + self._rc.release(self, page); +} + +// ----- accessors ----- + +fn getResultType(self: *const XPathResult) u16 { + return self._type; +} + +fn getNumberValue(self: *const XPathResult) !f64 { + if (self._type != NUMBER_TYPE) return error.InvalidStateError; + return self._value.number; +} + +fn getStringValue(self: *const XPathResult) ![]const u8 { + if (self._type != STRING_TYPE) return error.InvalidStateError; + return self._value.string; +} + +fn getBooleanValue(self: *const XPathResult) !bool { + if (self._type != BOOLEAN_TYPE) return error.InvalidStateError; + return self._value.boolean; +} + +fn getSingleNodeValue(self: *const XPathResult) !?*Node { + if (self._type != ANY_UNORDERED_NODE_TYPE and self._type != FIRST_ORDERED_NODE_TYPE) { + return error.InvalidStateError; + } + return if (self._value.nodes.len == 0) null else self._value.nodes[0]; +} + +fn getSnapshotLength(self: *const XPathResult) !u32 { + if (self._type != UNORDERED_NODE_SNAPSHOT_TYPE and self._type != ORDERED_NODE_SNAPSHOT_TYPE) { + return error.InvalidStateError; + } + return @intCast(self._value.nodes.len); +} + +/// Live mutation tracking on the iterator isn't implemented — we hold a +/// frozen pointer slice, so the iterator is never "invalidated" by DOM +/// edits during traversal. 
Always returns false; matches the polyfill, +/// which is snapshot-only. +fn getInvalidIteratorState(_: *const XPathResult) bool { + return false; +} + +// ----- methods ----- + +pub fn iterateNext(self: *XPathResult) !?*Node { + if (self._type != UNORDERED_NODE_ITERATOR_TYPE and self._type != ORDERED_NODE_ITERATOR_TYPE) { + return error.InvalidStateError; + } + const pos = self._iter_pos; + if (pos >= self._value.nodes.len) return null; + const node = self._value.nodes[pos]; + self._iter_pos = pos + 1; + return node; +} + +pub fn snapshotItem(self: *const XPathResult, index: u32) !?*Node { + if (self._type != UNORDERED_NODE_SNAPSHOT_TYPE and self._type != ORDERED_NODE_SNAPSHOT_TYPE) { + return error.InvalidStateError; + } + if (index >= self._value.nodes.len) return null; + return self._value.nodes[index]; +} + +// ----- JS bridge ----- + +pub const JsApi = struct { + pub const bridge = js.Bridge(XPathResult); + + pub const Meta = struct { + pub const name = "XPathResult"; + pub const prototype_chain = bridge.prototypeChain(); + pub var class_id: bridge.ClassId = undefined; + }; + + // Type constants — both static (on the constructor) and instance + // properties per the WHATWG IDL. `template = true` makes them + // class-level so `XPathResult.ORDERED_NODE_SNAPSHOT_TYPE` works. 
+ pub const ANY_TYPE = bridge.property(XPathResult.ANY_TYPE, .{ .template = true }); + pub const NUMBER_TYPE = bridge.property(XPathResult.NUMBER_TYPE, .{ .template = true }); + pub const STRING_TYPE = bridge.property(XPathResult.STRING_TYPE, .{ .template = true }); + pub const BOOLEAN_TYPE = bridge.property(XPathResult.BOOLEAN_TYPE, .{ .template = true }); + pub const UNORDERED_NODE_ITERATOR_TYPE = bridge.property(XPathResult.UNORDERED_NODE_ITERATOR_TYPE, .{ .template = true }); + pub const ORDERED_NODE_ITERATOR_TYPE = bridge.property(XPathResult.ORDERED_NODE_ITERATOR_TYPE, .{ .template = true }); + pub const UNORDERED_NODE_SNAPSHOT_TYPE = bridge.property(XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE, .{ .template = true }); + pub const ORDERED_NODE_SNAPSHOT_TYPE = bridge.property(XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, .{ .template = true }); + pub const ANY_UNORDERED_NODE_TYPE = bridge.property(XPathResult.ANY_UNORDERED_NODE_TYPE, .{ .template = true }); + pub const FIRST_ORDERED_NODE_TYPE = bridge.property(XPathResult.FIRST_ORDERED_NODE_TYPE, .{ .template = true }); + + pub const resultType = bridge.accessor(XPathResult.getResultType, null, .{}); + pub const numberValue = bridge.accessor(XPathResult.getNumberValue, null, .{ .dom_exception = true }); + pub const stringValue = bridge.accessor(XPathResult.getStringValue, null, .{ .dom_exception = true }); + pub const booleanValue = bridge.accessor(XPathResult.getBooleanValue, null, .{ .dom_exception = true }); + pub const singleNodeValue = bridge.accessor(XPathResult.getSingleNodeValue, null, .{ .dom_exception = true }); + pub const snapshotLength = bridge.accessor(XPathResult.getSnapshotLength, null, .{ .dom_exception = true }); + pub const invalidIteratorState = bridge.accessor(XPathResult.getInvalidIteratorState, null, .{}); + + pub const iterateNext = bridge.function(XPathResult.iterateNext, .{ .dom_exception = true }); + pub const snapshotItem = bridge.function(XPathResult.snapshotItem, .{ .dom_exception = true 
}); +}; + +const testing = @import("../../testing.zig"); + +test "WebApi: XPathResult" { + try testing.htmlRunner("xpath/xpath_result.html", .{}); +} + +test "WebApi: XPath conformance" { + try testing.htmlRunner("xpath/xpath_conformance.html", .{}); +} + +// This uses console.warn, uncomment if you want to run it +// test "WebApi: XPath perf" { +// try testing.htmlRunner("xpath/xpath_perf.html", .{}); +// } diff --git a/src/browser/webapi/element/html/Link.zig b/src/browser/webapi/element/html/Link.zig index 8f561355..1b83dacf 100644 --- a/src/browser/webapi/element/html/Link.zig +++ b/src/browser/webapi/element/html/Link.zig @@ -71,8 +71,16 @@ pub fn setAs(self: *Link, value: []const u8, frame: *Frame) !void { return self.asElement().setAttributeSafe(comptime .wrap("as"), .wrap(value), frame); } +pub fn getMedia(self: *Link) []const u8 { + return self.asElement().getAttributeSafe(comptime .wrap("media")) orelse return ""; +} + +pub fn setMedia(self: *Link, value: []const u8, frame: *Frame) !void { + return self.asElement().setAttributeSafe(comptime .wrap("media"), .wrap(value), frame); +} + pub fn getCrossOrigin(self: *const Link) ?[]const u8 { - return self.asConstElement().getAttributeSafe(comptime .wrap("crossOrigin")); + return self.asConstElement().getAttributeSafe(comptime .wrap("crossorigin")); } pub fn setCrossOrigin(self: *Link, value: []const u8, frame: *Frame) !void { @@ -80,7 +88,7 @@ pub fn setCrossOrigin(self: *Link, value: []const u8, frame: *Frame) !void { if (std.ascii.eqlIgnoreCase(value, "use-credentials")) { normalized = "use-credentials"; } - return self.asElement().setAttributeSafe(comptime .wrap("crossOrigin"), .wrap(normalized), frame); + return self.asElement().setAttributeSafe(comptime .wrap("crossorigin"), .wrap(normalized), frame); } pub fn linkAddedCallback(self: *Link, frame: *Frame) !void { @@ -120,6 +128,7 @@ pub const JsApi = struct { pub const as = bridge.accessor(Link.getAs, Link.setAs, .{}); pub const rel = 
bridge.accessor(Link.getRel, Link.setRel, .{}); + pub const media = bridge.accessor(Link.getMedia, Link.setMedia, .{}); pub const href = bridge.accessor(Link.getHref, Link.setHref, .{}); pub const crossOrigin = bridge.accessor(Link.getCrossOrigin, Link.setCrossOrigin, .{}); pub const relList = bridge.accessor(_getRelList, null, .{ .null_as_undefined = true }); diff --git a/src/browser/webapi/selector/Selector.zig b/src/browser/webapi/selector/Selector.zig index 2591ce6c..7322e02e 100644 --- a/src/browser/webapi/selector/Selector.zig +++ b/src/browser/webapi/selector/Selector.zig @@ -28,6 +28,22 @@ pub const List = @import("List.zig"); const String = lp.String; const Allocator = std.mem.Allocator; +// translate a Selector error to a DOMException known type. +pub fn mapErrorToDOM(err: anyerror) anyerror { + return switch (err) { + error.InvalidSelector, + error.InvalidAttributeSelector, + error.InvalidIDSelector, + error.InvalidClassSelector, + error.UnknownPseudoClass, + error.InvalidTagSelector, + error.InvalidPseudoClass, + error.InvalidNthPattern, + => error.SyntaxError, + else => err, + }; +} + pub fn parseLeaky(arena: Allocator, input: []const u8) !Parsed { if (input.len == 0) { return error.SyntaxError; diff --git a/src/browser/xpath/Evaluator.zig b/src/browser/xpath/Evaluator.zig new file mode 100644 index 00000000..00788023 --- /dev/null +++ b/src/browser/xpath/Evaluator.zig @@ -0,0 +1,987 @@ +// Copyright (C) 2023-2026 Lightpanda (Selecy SAS) +// +// Francis Bouvier +// Pierre Tachoire +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see <https://www.gnu.org/licenses/>. + +//! XPath 1.0 evaluator — runs an `ast.Expr` against a context node and +//! produces a `Result`. The evaluator allocates intermediate values +//! (node-set slices, formatted numbers, materialized attribute nodes) +//! into the caller's arena. The context `Frame` is needed for +//! `getElementById` and to materialize attributes (the attribute axis +//! returns full `Attribute` nodes so the result is `*Node`-uniform). +//! +//! Document-order sort happens once at the public boundary +//! (`evaluate()`); intermediate step results stay in axis order so +//! reverse-axis positional predicates evaluate against proximity. + +const std = @import("std"); +const lp = @import("lightpanda"); + +const Node = @import("../webapi/Node.zig"); + +const ast = @import("ast.zig"); +const Parser = @import("Parser.zig"); +const result = @import("result.zig"); +const functions = @import("functions.zig"); + +const Frame = lp.Frame; +const Element = Node.Element; +const Document = Node.Document; +const Allocator = std.mem.Allocator; + +const Evaluator = @This(); + +pub const Error = error{ + OutOfMemory, + WriteFailed, + // Surfaces from Attribute materialization (`Entry.toAttribute` → + // `String.dupe` enforces a length limit). The polyfill never hits + // this since JS strings are unbounded, but Lightpanda's `String` + // type caps at u32::MAX bytes — propagate so callers can surface + // a DOM exception. + StringTooLarge, + UnknownFunction, + UnionRequiresNodeSets, +}; + +arena: Allocator, +frame: *Frame, + +/// Public entry. Returns the AST's value; node-sets are sorted into +/// document order before return per XPath spec §3.3. 
+pub fn evaluate(arena: Allocator, expr: *const ast.Expr, context_node: *Node, frame: *Frame) Error!result.Result {
+    var ev = Evaluator{ .arena = arena, .frame = frame };
+    // The top-level expression has no enclosing node list, so it runs
+    // with context position 1 and context size 1.
+    const res = try ev.evalExpr(expr, context_node, 1, 1);
+    if (res == .node_set) {
+        // Node-sets travel through the evaluator in axis / insertion
+        // order and are put into document order here, at the public
+        // boundary. `sortDocOrder` returns early for fewer than two
+        // nodes, so the static empty slice is never written through the
+        // `@constCast`.
+        // NOTE(review): assumes every node-set with two or more nodes is
+        // backed by writable arena memory — confirm for node-sets
+        // returned by `functions.call`.
+        sortDocOrder(@constCast(res.node_set));
+    }
+    return res;
+}
+
+pub const SearchError = Error || Parser.Error;
+
+/// Convenience for `DOM.performSearch`: parse + evaluate and unwrap the
+/// node-set. Top-level scalar expressions yield an empty slice
+/// (decision #3 — these APIs are for finding nodes, not arbitrary
+/// computation).
+pub fn searchAll(arena: Allocator, root: *Node, expression: []const u8, frame: *Frame) SearchError![]const *Node {
+    const expr = try Parser.parse(arena, expression);
+    return switch (try evaluate(arena, expr, root, frame)) {
+        .node_set => |ns| ns,
+        else => &.{},
+    };
+}
+
+// ----- AST evaluation -----
+
+// Evaluates one expression node. `ctx` is the context node; `pos` and
+// `size` are the 1-based context position and context size that
+// `position()` / `last()` report.
+fn evalExpr(self: *Evaluator, expr: *const ast.Expr, ctx: *Node, pos: usize, size: usize) Error!result.Result {
+    return switch (expr.*) {
+        .number => |n| .{ .number = n },
+        .literal => |s| .{ .string = s },
+        .var_ref => .{ .string = "" }, // decision #3 stub
+        .neg => |inner| blk: {
+            const v = try self.evalExpr(inner, ctx, pos, size);
+            const n = try result.toNumber(self.arena, v);
+            break :blk .{ .number = -n };
+        },
+        .binop => |bo| try self.evalBinop(bo, ctx, pos, size),
+        .path => |p| try self.evalPath(p, ctx),
+        .filter_path => |fp| try self.evalFilterPath(fp, ctx, pos, size),
+        .filter => |f| try self.evalFilter(f, ctx, pos, size),
+        .fn_call => |fc| try self.evalFnCall(fc, ctx, pos, size),
+    };
+}
+
+// Evaluates a location path. Tries the two fast paths first, then falls
+// back to step-by-step evaluation starting from the document (absolute
+// paths) or from `ctx` (relative paths). The returned node-set is not
+// necessarily in document order.
+fn evalPath(self: *Evaluator, path: ast.Path, ctx: *Node) Error!result.Result {
+    if (try self.tryIdLookupFastPath(path, ctx)) |res| return res;
+    if (try self.tryFusedDescendantFastPath(path, ctx)) |res| return res;
+
+    const start: *Node = if (path.absolute) blk: {
+        if (ctx._type == .document) break :blk ctx;
+        // No owner document: start from the context node itself.
+        const owner = ctx.ownerDocument(self.frame) orelse break :blk ctx;
+        break :blk owner.asNode();
+    } else ctx;
+
+    const current = try self.arena.alloc(*Node, 1);
+    current[0] = start;
+    var current_set: []const *Node = current;
+
+    for (path.steps) |step| {
+        const r = try self.evalStep(current_set, step);
+        current_set = r.node_set;
+    }
+    return .{ .node_set = current_set };
+}
+
+// Recognize the very common `//tag[@id='x']` and `.//tag[@id='x']`
+// shapes (and their wildcard `//*[@id='x']` variants) and serve them
+// directly from `frame.getElementByIdFromNode`. Accepts the literal on
+// either side of `=`.
+//
+// Mirrors the same tradeoff `webapi/selector/List.zig:optimizeSelector`
+// already makes for `querySelector(All)`: the id-map only stores the
+// first element per ID in document order, so duplicate IDs (invalid
+// HTML, but possible) yield one match here where a strict tree walk
+// would find all. Acceptable because Capybara/Selenium hot paths
+// assume unique IDs and CSS has shipped this compromise for years.
+//
+// Falls through to the general path for any deviation: extra steps,
+// extra predicates, non-eq predicate, non-literal RHS, the inability
+// to resolve a search root, or the id resolving to the search root
+// itself.
+//
+// Returns null when the fast path does not apply, otherwise a node-set
+// holding zero or one node.
+fn tryIdLookupFastPath(self: *Evaluator, path: ast.Path, ctx: *Node) Error!?result.Result {
+    // Two acceptable AST shapes:
+    //   //tag[@id='x']   parses to: ds::node() / child::tag[pred]
+    //   .//tag[@id='x']  parses to: self::node() / ds::node() / child::tag[pred]
+    const target = matchDescendantPathShape(path) orelse return null;
+
+    if (target.axis != .child) return null;
+    if (target.predicates.len != 1) return null;
+
+    // Tag name (null = wildcard "*"). type_test (e.g. `node()`,
+    // `text()`) doesn't qualify because getElementByIdFromNode only
+    // returns elements.
+    const tag_name: ?[]const u8 = switch (target.node_test) {
+        .name => |n| if (std.mem.eql(u8, n, "*")) null else n,
+        .type_test => return null,
+    };
+
+    const id_value = matchAttrEqLiteral(target.predicates[0], "id") orelse return null;
+
+    // Resolve search root the same way the general path does.
+    const search_root: *Node = if (path.absolute) blk: {
+        if (ctx._type == .document) break :blk ctx;
+        const owner = ctx.ownerDocument(self.frame) orelse return null;
+        break :blk owner.asNode();
+    } else ctx;
+
+    const id_element = self.frame.getElementByIdFromNode(search_root, id_value) orelse {
+        return .{ .node_set = &.{} };
+    };
+    const id_node = id_element.asNode();
+
+    // The matched shape ends in a `child::` step, so the search root
+    // itself can never be part of the result: `.//div[@id='x']`
+    // evaluated on `div#x` selects only nodes below it. Hand this rare
+    // case to the exact walk instead of answering from the id-map, so a
+    // genuine descendant carrying a duplicate id is still found.
+    if (search_root == id_node) return null;
+
+    // Relative paths must filter to descendants of the context.
+    // getElementByIdFromNode is doc-wide.
+    if (!search_root.contains(id_node)) {
+        return .{ .node_set = &.{} };
+    }
+
+    // Tag check (case-insensitive per decision #2). Element tag names
+    // are stored lowercase via `getTagNameLower`; lowercase the AST
+    // name once and compare.
+    if (tag_name) |tag| {
+        const lowered = try std.ascii.allocLowerString(self.arena, tag);
+        if (!std.mem.eql(u8, lowered, id_element.getTagNameLower())) {
+            return .{ .node_set = &.{} };
+        }
+    }
+
+    const out = try self.arena.alloc(*Node, 1);
+    out[0] = id_node;
+    return .{ .node_set = out };
+}
+
+// Generalization of `tryIdLookupFastPath` to non-ID predicates. Same
+// AST shape (`//tag[preds]` / `.//tag[preds]`), but instead of
+// dispatching to `getElementByIdFromNode`, walks the descendants of
+// the search root once in document order, applying the node test and
+// any "safe" non-positional predicates inline. Skips the general path's
+// per-step axis materialization, the per-step `filtered`/`current`
+// ArrayLists, and the dedup hash map (single-context forward walk
+// already preserves doc order).
+//
+// Hits the bulk of the benchmark's remaining cost: `//div`, `//*`,
+// `//*[@class='x']`, `//div[@class='x']`, `//div[contains(@class,'x')]`.
+//
+// "Safe" predicates: not numeric at the top level (number, neg,
+// arithmetic binop, or a fn-call returning a number), and free of
+// `position()`/`last()` anywhere in the predicate AST. Numeric predicates
+// would need `position()` context which the fused walk doesn't track,
+// and a `position()`/`last()` reference inside a sub-path's own step is
+// rejected conservatively even though it's local to that sub-axis.
+fn tryFusedDescendantFastPath(self: *Evaluator, path: ast.Path, ctx: *Node) Error!?result.Result {
+    const target = matchDescendantPathShape(path) orelse return null;
+    if (target.axis != .child) return null;
+
+    for (target.predicates) |p| {
+        if (!isSafeNonPositionalPredicate(p)) return null;
+    }
+
+    const lowered_name: ?[]const u8 = switch (target.node_test) {
+        .name => |n| if (std.mem.eql(u8, n, "*")) null else try std.ascii.allocLowerString(self.arena, n),
+        .type_test => null,
+    };
+
+    const search_root: *Node = if (path.absolute) blk: {
+        if (ctx._type == .document) break :blk ctx;
+        const owner = ctx.ownerDocument(self.frame) orelse return null;
+        break :blk owner.asNode();
+    } else ctx;
+
+    var out: std.ArrayList(*Node) = .empty;
+    try self.fusedDescend(search_root, target, lowered_name, &out);
+    return .{ .node_set = out.items };
+}
+
+// Pre-order walk below `parent`: every child that passes the node test
+// and all predicates of `target` is appended to `out`, then its own
+// subtree is visited. `parent` itself is never appended.
+fn fusedDescend(
+    self: *Evaluator,
+    parent: *Node,
+    target: ast.Step,
+    lowered_name: ?[]const u8,
+    out: *std.ArrayList(*Node),
+) Error!void {
+    var it = parent.childrenIterator();
+    while (it.next()) |c| {
+        if (matchTest(c, target.node_test, target.axis, lowered_name)) {
+            var ok = true;
+            for (target.predicates) |pred| {
+                // Position / size are synthetic. Safe because the
+                // predicate-safety gate already rejected any expression
+                // that depends on either.
+                const val = try self.evalExpr(pred, c, 1, 1);
+                if (!result.toBoolean(val)) {
+                    ok = false;
+                    break;
+                }
+            }
+            if (ok) try out.append(self.arena, c);
+        }
+        try self.fusedDescend(c, target, lowered_name, out);
+    }
+}
+
+// Returns the final step of a path shaped like
+// `descendant-or-self::node()/X` (2 steps) or
+// `self::node()/descendant-or-self::node()/X` (3 steps); null for any
+// other shape.
+fn matchDescendantPathShape(path: ast.Path) ?ast.Step {
+    return switch (path.steps.len) {
+        2 => blk: {
+            if (!isDescendantOrSelfNode(path.steps[0])) break :blk null;
+            break :blk path.steps[1];
+        },
+        3 => blk: {
+            if (!isSelfNode(path.steps[0])) break :blk null;
+            if (!isDescendantOrSelfNode(path.steps[1])) break :blk null;
+            break :blk path.steps[2];
+        },
+        else => null,
+    };
+}
+
+// True when the predicate can be evaluated without real position/size
+// context: not numeric at the top level and no `position()`/`last()`
+// anywhere inside it.
+fn isSafeNonPositionalPredicate(expr: *const ast.Expr) bool {
+    if (isNumericTopLevel(expr)) return false;
+    if (containsPositionOrLast(expr)) return false;
+    return true;
+}
+
+// True when the expression's own result is a number: a number literal,
+// a negation, an arithmetic binop, or a call to a number-returning
+// function.
+fn isNumericTopLevel(expr: *const ast.Expr) bool {
+    return switch (expr.*) {
+        .number, .neg => true,
+        .binop => |bo| switch (bo.op) {
+            .add, .sub, .mul, .div, .mod => true,
+            else => false,
+        },
+        .fn_call => |fc| isNumericFnName(fc.name),
+        else => false,
+    };
+}
+
+// True for the function names listed below, which are treated as
+// returning a number.
+fn isNumericFnName(name: []const u8) bool {
+    const numeric = [_][]const u8{
+        "position",      "last",    "count", "sum",
+        "floor",         "ceiling", "round", "number",
+        "string-length",
+    };
+    for (numeric) |n| {
+        if (std.mem.eql(u8, name, n)) return true;
+    }
+    return false;
+}
+
+// Recursively searches the expression (including step predicates of
+// nested paths and function arguments) for a `position()` or `last()`
+// call.
+fn containsPositionOrLast(expr: *const ast.Expr) bool {
+    return switch (expr.*) {
+        .number, .literal, .var_ref => false,
+        .neg => |inner| containsPositionOrLast(inner),
+        .binop => |bo| containsPositionOrLast(bo.left) or containsPositionOrLast(bo.right),
+        .filter => |f| containsPositionOrLast(f.expr) or containsPositionOrLast(f.predicate),
+        .filter_path => |fp| containsPositionOrLast(fp.filter) or stepsContainPositionOrLast(fp.steps),
+        .path => |p| stepsContainPositionOrLast(p.steps),
+        .fn_call => |fc| std.mem.eql(u8, fc.name, "position") or
+            std.mem.eql(u8, fc.name, "last") or
+            argsContainPositionOrLast(fc.args),
+    };
+}
+
+// True when any predicate of any step references `position()`/`last()`.
+fn stepsContainPositionOrLast(steps: []const ast.Step) bool {
+    for (steps) |s| {
+        for (s.predicates) |p| {
+            if (containsPositionOrLast(p)) return true;
+        }
+    }
+    return false;
+}
+
+// True when any argument expression references `position()`/`last()`.
+fn argsContainPositionOrLast(args: []const *ast.Expr) bool {
+    for (args) |a| {
+        if (containsPositionOrLast(a)) return true;
+    }
+    return false;
+}
+
+// Matches exactly `descendant-or-self::node()` with no predicates.
+fn isDescendantOrSelfNode(s: ast.Step) bool {
+    if (s.axis != .descendant_or_self) return false;
+    if (s.predicates.len != 0) return false;
+    return switch (s.node_test) {
+        .type_test => |k| k == .node,
+        .name => false,
+    };
+}
+
+// Matches exactly `self::node()` with no predicates.
+fn isSelfNode(s: ast.Step) bool {
+    if (s.axis != .self) return false;
+    if (s.predicates.len != 0) return false;
+    return switch (s.node_test) {
+        .type_test => |k| k == .node,
+        .name => false,
+    };
+}
+
+// If `expr` is `@attr_name = 'literal'` (literal on either side),
+// returns the literal; otherwise null.
+fn matchAttrEqLiteral(expr: *const ast.Expr, attr_name: []const u8) ?[]const u8 {
+    if (expr.* != .binop) return null;
+    const bo = expr.binop;
+    if (bo.op != .eq) return null;
+    if (isAttrPath(bo.left, attr_name) and bo.right.* == .literal) return bo.right.literal;
+    if (isAttrPath(bo.right, attr_name) and bo.left.* == .literal) return bo.left.literal;
+    return null;
+}
+
+// True when `expr` is the relative single-step path
+// `attribute::attr_name` with no predicates. The name comparison is
+// byte-exact.
+fn isAttrPath(expr: *const ast.Expr, attr_name: []const u8) bool {
+    if (expr.* != .path) return false;
+    const p = expr.path;
+    if (p.absolute) return false;
+    if (p.steps.len != 1) return false;
+    const s = p.steps[0];
+    if (s.axis != .attribute) return false;
+    if (s.predicates.len != 0) return false;
+    return switch (s.node_test) {
+        .name => |n| std.mem.eql(u8, n, attr_name),
+        .type_test => false,
+    };
+}
+
+// Evaluates `filter/steps...`. A non-node-set base is returned as is,
+// without applying the steps.
+fn evalFilterPath(self: *Evaluator, fp: ast.FilterPath, ctx: *Node, pos: usize, size: usize) Error!result.Result {
+    const base = try self.evalExpr(fp.filter, ctx, pos, size);
+    if (base != .node_set) return base;
+
+    var current: []const *Node = base.node_set;
+    for (fp.steps) |step| {
+        const r = try self.evalStep(current, step);
+        current = r.node_set;
+    }
+    return .{ .node_set = current };
+}
+
+// Evaluates `(expr)[predicate]`. A non-node-set base is returned as is.
+//
+// XPath 1.0 §3.3: a predicate on a filter expression numbers the nodes
+// with respect to the child axis, i.e. in document order. The base
+// node-set arrives in axis / insertion order (proximity order for
+// reverse axes), so a private copy is sorted before positions are
+// handed out. Without this `(ancestor::div)[1]` would select the
+// nearest ancestor instead of the outermost one.
+fn evalFilter(self: *Evaluator, f: ast.Filter, ctx: *Node, pos: usize, size: usize) Error!result.Result {
+    const base = try self.evalExpr(f.expr, ctx, pos, size);
+    if (base != .node_set) return base;
+
+    // Copy first: the base slice is const and may be shared.
+    const ordered = try self.arena.dupe(*Node, base.node_set);
+    sortDocOrder(ordered);
+
+    var out: std.ArrayList(*Node) = .empty;
+    const sz = ordered.len;
+    for (ordered, 0..) |n, idx| {
+        const k = idx + 1;
+        const val = try self.evalExpr(f.predicate, n, k, sz);
+        if (predicateMatches(val, k)) try out.append(self.arena, n);
+    }
+    return .{ .node_set = out.items };
+}
+
+// ----- step + axis -----
+
+// Applies one location step to every context node and returns the
+// de-duplicated union of the results, in first-seen order (not
+// necessarily document order).
+fn evalStep(self: *Evaluator, ctx_nodes: []const *Node, step: ast.Step) Error!result.Result {
+    var dedup: std.AutoArrayHashMapUnmanaged(*Node, void) = .empty;
+
+    // Pre-lowercase the name test once per step. matchNameTest does
+    // case-insensitive matching (decision #2); without this hoist, every
+    // axis node would pay the per-byte case-fold inside `eqlIgnoreCase`.
+    const lowered_name: ?[]const u8 = switch (step.node_test) {
+        .name => |n| if (std.mem.eql(u8, n, "*")) null else try std.ascii.allocLowerString(self.arena, n),
+        .type_test => null,
+    };
+
+    for (ctx_nodes) |ctx| {
+        const axis_nodes = try self.axisNodes(ctx, step.axis);
+
+        var filtered: std.ArrayList(*Node) = .empty;
+        for (axis_nodes) |n| {
+            if (matchTest(n, step.node_test, step.axis, lowered_name)) {
+                try filtered.append(self.arena, n);
+            }
+        }
+
+        // Each predicate filters the survivors of the previous one and
+        // renumbers positions from 1, in axis order.
+        var current: []const *Node = filtered.items;
+        for (step.predicates) |pred| {
+            var next: std.ArrayList(*Node) = .empty;
+            const sz = current.len;
+            for (current, 0..) |n, idx| {
+                const k = idx + 1;
+                const val = try self.evalExpr(pred, n, k, sz);
+                if (predicateMatches(val, k)) try next.append(self.arena, n);
+            }
+            current = next.items;
+        }
+
+        for (current) |n| try dedup.put(self.arena, n, {});
+    }
+
+    return .{ .node_set = dedup.keys() };
+}
+
+// Materializes the nodes on `axis` relative to `node`, in axis order
+// (forward axes in document order, reverse axes nearest first).
+fn axisNodes(self: *Evaluator, node: *Node, axis: ast.Axis) Error![]const *Node {
+    var out: std.ArrayList(*Node) = .empty;
+    switch (axis) {
+        .child => {
+            var it = node.childrenIterator();
+            while (it.next()) |c| try out.append(self.arena, c);
+        },
+        .descendant => try self.appendDescendants(node, &out),
+        .descendant_or_self => {
+            try out.append(self.arena, node);
+            try self.appendDescendants(node, &out);
+        },
+        .self => try out.append(self.arena, node),
+        .parent => {
+            if (node.parentNode()) |p| try out.append(self.arena, p);
+        },
+        // Reverse axes — proximity order (nearest first). Final node-set
+        // is sorted to document order at the public boundary.
+        .ancestor => {
+            var p = node.parentNode();
+            while (p) |n| : (p = n.parentNode()) try out.append(self.arena, n);
+        },
+        .ancestor_or_self => {
+            try out.append(self.arena, node);
+            var p = node.parentNode();
+            while (p) |n| : (p = n.parentNode()) try out.append(self.arena, n);
+        },
+        .following_sibling => {
+            var s = node.nextSibling();
+            while (s) |n| : (s = n.nextSibling()) try out.append(self.arena, n);
+        },
+        .preceding_sibling => {
+            var s = node.previousSibling();
+            while (s) |n| : (s = n.previousSibling()) try out.append(self.arena, n);
+        },
+        .following => try self.appendFollowing(node, &out),
+        .preceding => try self.appendPreceding(node, &out),
+        .attribute => try self.appendAttributes(node, &out),
+        .namespace, .unknown => {}, // decision #3 stubs
+    }
+    return out.items;
+}
+
+// Appends every descendant of `node` in pre-order (document order).
+fn appendDescendants(self: *Evaluator, node: *Node, out: *std.ArrayList(*Node)) Error!void {
+    var it = node.childrenIterator();
+    while (it.next()) |c| {
+        try out.append(self.arena, c);
+        try self.appendDescendants(c, out);
+    }
+}
+
+// `following` axis: for `start` and each of its ancestors, append every
+// later sibling together with that sibling's subtree.
+fn appendFollowing(self: *Evaluator, start: *Node, out: *std.ArrayList(*Node)) Error!void {
+    var n: ?*Node = start;
+    while (n) |cur| : (n = cur.parentNode()) {
+        var s = cur.nextSibling();
+        while (s) |sn| : (s = sn.nextSibling()) {
+            try out.append(self.arena, sn);
+            try self.appendDescendants(sn, out);
+        }
+    }
+}
+
+fn appendPrecedingSubtree(self: *Evaluator, n: *Node, out: *std.ArrayList(*Node)) Error!void {
+    // Reverse document order: deepest-last children first, then self.
+    var c = n.lastChild();
+    while (c) |child| : (c = child.previousSibling()) {
+        try self.appendPrecedingSubtree(child, out);
+    }
+    try out.append(self.arena, n);
+}
+
+// `preceding` axis: for `start` and each ancestor that has a parent,
+// append the subtrees of its earlier siblings in reverse document
+// order. Ancestors themselves are never appended.
+fn appendPreceding(self: *Evaluator, start: *Node, out: *std.ArrayList(*Node)) Error!void {
+    var n: ?*Node = start;
+    while (n) |cur| {
+        const parent = cur.parentNode() orelse break;
+        var s = cur.previousSibling();
+        while (s) |sn| : (s = sn.previousSibling()) {
+            try self.appendPrecedingSubtree(sn, out);
+        }
+        n = parent;
+    }
+}
+
+// `attribute` axis: appends one node per attribute of `node`; nothing
+// is appended when `node` is not an element.
+fn appendAttributes(self: *Evaluator, node: *Node, out: *std.ArrayList(*Node)) Error!void {
+    const el = node.is(Element) orelse return;
+    var it = el.attributeIterator();
+    while (it.next()) |entry| {
+        // Memoize via frame._attribute_lookup so repeated XPath queries
+        // (Capybara/Selenium polling) reuse the same *Attribute instead
+        // of leaking fresh ones into page-lifetime storage on every call.
+        // Same pattern as Attribute.List.getAttribute / NamedNodeMap.getAtIndex.
+        const gop = try self.frame._attribute_lookup.getOrPut(self.frame.arena, @intFromPtr(entry));
+        if (!gop.found_existing) {
+            // NOTE(review): if `toAttribute` fails, the slot inserted by
+            // `getOrPut` stays in the map without a value — confirm the
+            // frame is not reused after such an error.
+            gop.value_ptr.* = try entry.toAttribute(el, self.frame);
+        }
+        try out.append(self.arena, gop.value_ptr.*._proto);
+    }
+}
+
+// ----- node test matching -----
+
+// Applies a node test to `node`. `lowered_name` is the pre-lowercased
+// name for name tests (null for `*` and for type tests).
+fn matchTest(node: *Node, test_: ast.NodeTest, axis: ast.Axis, lowered_name: ?[]const u8) bool {
+    return switch (test_) {
+        .type_test => |kind| switch (kind) {
+            .node => true,
+            // XPath 1.0 §5.7: the data model has no CDATASection node —
+            // CDATA content is part of the text node value. Match both
+            // Text (3) and CDATASection (4) DOM node types.
+            .text => node.getNodeType() == 3 or node.getNodeType() == 4,
+            .comment => node.getNodeType() == 8,
+            .processing_instruction => node.getNodeType() == 7,
+        },
+        .name => |name| matchNameTest(node, name, axis, lowered_name),
+    };
+}
+
+fn matchNameTest(node: *Node, name: []const u8, axis: ast.Axis, lowered_name: ?[]const u8) bool {
+    // `lowered_name` is non-null iff `name != "*"`. Element tag names
+    // (`getTagNameLower`) and html5ever-stored attribute names are already
+    // lowercase, so a plain `mem.eql` against the pre-lowered test name
+    // replaces the per-call `eqlIgnoreCase`.
+    if (axis == .attribute) {
+        if (std.mem.eql(u8, name, "*")) return node._type == .attribute;
+        const attr = switch (node._type) {
+            .attribute => |a| a,
+            else => return false,
+        };
+        return std.mem.eql(u8, attr._name.str(), lowered_name.?);
+    }
+    const el = node.is(Element) orelse return false;
+    if (std.mem.eql(u8, name, "*")) return true;
+    return std.mem.eql(u8, el.getTagNameLower(), lowered_name.?);
+}
+
+// ----- binop -----
+
+// Evaluates a binary operator. `or` / `and` short-circuit; comparisons
+// go through `xCmp`; arithmetic converts both sides to numbers; union
+// requires two node-sets and returns them merged in document order.
+fn evalBinop(self: *Evaluator, bo: ast.BinOp, ctx: *Node, pos: usize, size: usize) Error!result.Result {
+    switch (bo.op) {
+        .or_ => {
+            const l = try self.evalExpr(bo.left, ctx, pos, size);
+            if (result.toBoolean(l)) return .{ .boolean = true };
+            const r = try self.evalExpr(bo.right, ctx, pos, size);
+            return .{ .boolean = result.toBoolean(r) };
+        },
+        .and_ => {
+            const l = try self.evalExpr(bo.left, ctx, pos, size);
+            if (!result.toBoolean(l)) return .{ .boolean = false };
+            const r = try self.evalExpr(bo.right, ctx, pos, size);
+            return .{ .boolean = result.toBoolean(r) };
+        },
+        .eq, .neq, .lt, .gt, .lte, .gte => {
+            const l = try self.evalExpr(bo.left, ctx, pos, size);
+            const r = try self.evalExpr(bo.right, ctx, pos, size);
+            return .{ .boolean = try self.xCmp(l, r, bo.op) };
+        },
+        .add, .sub, .mul, .div, .mod => {
+            const l = try self.evalExpr(bo.left, ctx, pos, size);
+            const r = try self.evalExpr(bo.right, ctx, pos, size);
+            const ln = try result.toNumber(self.arena, l);
+            const rn = try result.toNumber(self.arena, r);
+            const v: f64 = switch (bo.op) {
+                .add => ln + rn,
+                .sub => ln - rn,
+                .mul => ln * rn,
+                .div => ln / rn,
+                // JS `%` and Zig `@rem` agree on sign for finite values
+                // and propagate NaN (XPath §3.5).
+                .mod => @rem(ln, rn),
+                else => unreachable,
+            };
+            return .{ .number = v };
+        },
+        .union_ => {
+            const l = try self.evalExpr(bo.left, ctx, pos, size);
+            const r = try self.evalExpr(bo.right, ctx, pos, size);
+            if (l != .node_set or r != .node_set) return error.UnionRequiresNodeSets;
+            var seen: std.AutoArrayHashMapUnmanaged(*Node, void) = .empty;
+            for (l.node_set) |n| try seen.put(self.arena, n, {});
+            for (r.node_set) |n| try seen.put(self.arena, n, {});
+            const nodes = seen.keys();
+            sortDocOrder(@constCast(nodes));
+            return .{ .node_set = nodes };
+        },
+    }
+}
+
+// ----- comparison (XPath spec §3.4) -----
+
+// Compares two results with one of `=`, `!=`, `<`, `>`, `<=`, `>=`.
+//
+// Node-set operands are existential: the comparison is true when some
+// node's string-value (or pair of nodes) satisfies it. Relational
+// operators always compare numerically, including when one side is a
+// boolean.
+fn xCmp(self: *Evaluator, left: result.Result, right: result.Result, op: ast.BinOpKind) Error!bool {
+    const is_eq = (op == .eq or op == .neq);
+    const l_is_set = (left == .node_set);
+    const r_is_set = (right == .node_set);
+
+    if (l_is_set and r_is_set) {
+        // Cache right-side string-values once. Without this, each left node
+        // would pay |right| allocations — O(N×M) for a set×set comparison
+        // (e.g. `//foo = //bar` on a large page).
+        const right_strings = try self.arena.alloc([]const u8, right.node_set.len);
+        for (right.node_set, 0..) |r, i| {
+            right_strings[i] = try result.stringValueOf(self.arena, r);
+        }
+        for (left.node_set) |l| {
+            const lv = try result.stringValueOf(self.arena, l);
+            for (right_strings) |rv| {
+                const matched = if (is_eq)
+                    cmpString(lv, rv, op)
+                else
+                    cmpNumber(result.stringToNumber(lv), result.stringToNumber(rv), op);
+                if (matched) return true;
+            }
+        }
+        return false;
+    }
+
+    if (l_is_set or r_is_set) {
+        const ns = if (l_is_set) left.node_set else right.node_set;
+        const other = if (l_is_set) right else left;
+        const ns_left = l_is_set;
+
+        if (other == .boolean) {
+            // The node-set converts to a boolean (non-empty = true).
+            const ns_b = ns.len > 0;
+            const a, const b = if (ns_left) .{ ns_b, other.boolean } else .{ other.boolean, ns_b };
+            if (is_eq) return cmpBool(a, b, op);
+            // `<`, `>`, `<=`, `>=` compare numbers: both booleans are
+            // converted (true = 1, false = 0). `cmpBool` only handles
+            // `=` / `!=` and must not be reached with a relational
+            // operator.
+            const an: f64 = @floatFromInt(@intFromBool(a));
+            const bn: f64 = @floatFromInt(@intFromBool(b));
+            return cmpNumber(an, bn, op);
+        }
+
+        for (ns) |n| {
+            const sv = try result.stringValueOf(self.arena, n);
+            const matched = switch (other) {
+                .number => |num| blk: {
+                    const sv_num = result.stringToNumber(sv);
+                    const a, const b = if (ns_left) .{ sv_num, num } else .{ num, sv_num };
+                    break :blk cmpNumber(a, b, op);
+                },
+                .string => |s| blk: {
+                    if (is_eq) {
+                        const a, const b = if (ns_left) .{ sv, s } else .{ s, sv };
+                        break :blk cmpString(a, b, op);
+                    }
+                    const sv_num = result.stringToNumber(sv);
+                    const s_num = result.stringToNumber(s);
+                    const a, const b = if (ns_left) .{ sv_num, s_num } else .{ s_num, sv_num };
+                    break :blk cmpNumber(a, b, op);
+                },
+                .boolean, .node_set => unreachable, // handled above
+            };
+            if (matched) return true;
+        }
+        return false;
+    }
+
+    // Neither is a node-set.
+    if (is_eq) {
+        if (left == .boolean or right == .boolean) {
+            return cmpBool(result.toBoolean(left), result.toBoolean(right), op);
+        }
+        if (left == .number or right == .number) {
+            const ln = try result.toNumber(self.arena, left);
+            const rn = try result.toNumber(self.arena, right);
+            return cmpNumber(ln, rn, op);
+        }
+        const ls = try result.toString(self.arena, left);
+        const rs = try result.toString(self.arena, right);
+        return cmpString(ls, rs, op);
+    }
+    // Non-eq with no node-set: both → number.
+    const ln = try result.toNumber(self.arena, left);
+    const rn = try result.toNumber(self.arena, right);
+    return cmpNumber(ln, rn, op);
+}
+
+// Byte-wise string (in)equality. Only `.eq` / `.neq` are valid.
+fn cmpString(a: []const u8, b: []const u8, op: ast.BinOpKind) bool {
+    const equal = std.mem.eql(u8, a, b);
+    return switch (op) {
+        .eq => equal,
+        .neq => !equal,
+        else => unreachable, // <, > etc. always coerce to number first
+    };
+}
+
+fn cmpNumber(a: f64, b: f64, op: ast.BinOpKind) bool {
+    // Native f64 comparison gives correct NaN semantics:
+    // NaN == X is false, NaN != X is true, NaN < X (etc.) is false.
+    return switch (op) {
+        .eq => a == b,
+        .neq => a != b,
+        .lt => a < b,
+        .gt => a > b,
+        .lte => a <= b,
+        .gte => a >= b,
+        else => unreachable,
+    };
+}
+
+// Boolean (in)equality. Only `.eq` / `.neq` are valid; callers convert
+// to numbers for relational operators.
+fn cmpBool(a: bool, b: bool, op: ast.BinOpKind) bool {
+    return switch (op) {
+        .eq => a == b,
+        .neq => a != b,
+        else => unreachable,
+    };
+}
+
+// ----- function calls -----
+
+fn evalFnCall(self: *Evaluator, fc: ast.FnCall, ctx: *Node, pos: usize, size: usize) Error!result.Result {
+    // position()/last() stay here — they need the (pos, size) closure
+    // that functions.call doesn't see. Keeping them inline avoids
+    // pushing per-call context through Functions' signature.
+    if (std.mem.eql(u8, fc.name, "position")) return .{ .number = @floatFromInt(pos) };
+    if (std.mem.eql(u8, fc.name, "last")) return .{ .number = @floatFromInt(size) };
+
+    // Eagerly evaluate args. Matches the polyfill's `evaluate(args[i], ...)`
+    // pattern; lazy short-circuit isn't needed because `or`/`and` are
+    // binops handled in evalBinop, not function calls.
+    const eval_args = try self.arena.alloc(result.Result, fc.args.len);
+    for (fc.args, 0..) |a, i| eval_args[i] = try self.evalExpr(a, ctx, pos, size);
+
+    return functions.call(self.arena, fc.name, eval_args, ctx, self.frame);
+}
+
+// ----- helpers -----
+
+// Decides whether a predicate result keeps the node at 1-based
+// `position`.
+fn predicateMatches(val: result.Result, position: usize) bool {
+    return switch (val) {
+        // Numeric predicate value selects only the node at that position
+        // (1-based). Non-integer numbers never match.
+        .number => |n| n == @as(f64, @floatFromInt(position)),
+        else => result.toBoolean(val),
+    };
+}
+
+/// Sorts `nodes` in place into document order. Slices with fewer than
+/// two nodes are left untouched (and never written to).
+pub fn sortDocOrder(nodes: []*Node) void {
+    if (nodes.len <= 1) return;
+    std.mem.sort(*Node, nodes, {}, lessThanDocOrder);
+}
+
+// Sort predicate: true when `a` precedes `b` in document order.
+fn lessThanDocOrder(_: void, a: *Node, b: *Node) bool {
+    if (a == b) return false;
+    const pos = a.compareDocumentPosition(b);
+    // FOLLOWING (0x04) — b comes after a in document order.
+    return (pos & 0x04) != 0;
+}
+
+// ---------------------------------------------------------------------
+// Tests — pure-logic only. DOM-dependent evaluation lands as HTML
+// fixtures in Phase 9 (tests/xpath/*.html); Lightpanda has no in-Zig
+// way to construct a Frame + Document tree without the JS runtime.
+// ---------------------------------------------------------------------
+
+const testing = std.testing;
+const Tokenizer = @import("Tokenizer.zig");
+
+// Test-only helper: parses `src` and evaluates it with the given context
+// position / size. Frame and context node are placeholder addresses that
+// are never dereferenced by scalar-only expressions; any expression that
+// touches the DOM would crash.
+fn evalDetachedForTest(scratch: Allocator, src: []const u8, pos: usize, size: usize) !result.Result {
+    const parsed = try Parser.parse(scratch, src);
+    var detached = Evaluator{ .arena = scratch, .frame = @ptrFromInt(0x1000) };
+    const placeholder_ctx: *Node = @ptrFromInt(0x2000);
+    return detached.evalExpr(parsed, placeholder_ctx, pos, size);
+}
+
+test "Evaluator: cmpNumber NaN semantics" {
+    const nan = std.math.nan(f64);
+
+    // NaN is never equal to anything, itself included.
+    try testing.expect(!cmpNumber(nan, nan, .eq));
+    try testing.expect(cmpNumber(nan, nan, .neq));
+
+    // Every ordering comparison against NaN is false.
+    inline for (.{ .lt, .gt, .lte, .gte }) |op| {
+        try testing.expect(!cmpNumber(nan, 0, op));
+    }
+
+    // Ordinary finite values.
+    try testing.expect(cmpNumber(0, 0, .eq));
+    try testing.expect(cmpNumber(1, 2, .lt));
+    try testing.expect(cmpNumber(2, 1, .gt));
+    try testing.expect(cmpNumber(1, 1, .lte));
+    try testing.expect(cmpNumber(1, 1, .gte));
+}
+
+test "Evaluator: cmpString" {
+    try testing.expectEqual(true, cmpString("a", "a", .eq));
+    try testing.expectEqual(false, cmpString("a", "b", .eq));
+    try testing.expectEqual(true, cmpString("a", "b", .neq));
+    try testing.expectEqual(false, cmpString("a", "a", .neq));
+}
+
+test "Evaluator: cmpBool" {
+    try testing.expectEqual(true, cmpBool(true, true, .eq));
+    try testing.expectEqual(false, cmpBool(true, false, .eq));
+    try testing.expectEqual(true, cmpBool(true, false, .neq));
+}
+
+test "Evaluator: predicateMatches numeric vs boolean" {
+    // Number: matches only its own 1-based position.
+    try testing.expect(predicateMatches(.{ .number = 1 }, 1));
+    try testing.expect(!predicateMatches(.{ .number = 2 }, 1));
+    // Non-integer never matches.
+    try testing.expect(!predicateMatches(.{ .number = 1.5 }, 1));
+    // Boolean: any truthy value passes regardless of position.
+    try testing.expect(predicateMatches(.{ .boolean = true }, 7));
+    try testing.expect(!predicateMatches(.{ .boolean = false }, 1));
+    // String: nonempty truthy.
+    try testing.expect(predicateMatches(.{ .string = "x" }, 99));
+    try testing.expect(!predicateMatches(.{ .string = "" }, 1));
+    // Empty node-set: falsy.
+    try testing.expect(!predicateMatches(.{ .node_set = &.{} }, 1));
+}
+
+test "Evaluator: scalar arithmetic via parsed expressions" {
+    var arena = std.heap.ArenaAllocator.init(testing.allocator);
+    defer arena.deinit();
+    const scratch = arena.allocator();
+
+    inline for (.{
+        .{ "1 + 2", 3 },
+        .{ "5 - 3", 2 },
+        .{ "4 * 2", 8 },
+        .{ "10 div 4", 2.5 },
+        .{ "10 mod 3", 1 },
+        .{ "-5", -5 },
+        .{ "1 + 2 * 3", 7 },
+    }) |case| {
+        // Pure arithmetic never reaches the Frame or the context node,
+        // so the detached evaluator is sufficient.
+        const got = try evalDetachedForTest(scratch, case[0], 1, 1);
+        try testing.expect(got == .number);
+        try testing.expectEqual(@as(f64, case[1]), got.number);
+    }
+}
+
+test "Evaluator: scalar comparison via parsed expressions" {
+    var arena = std.heap.ArenaAllocator.init(testing.allocator);
+    defer arena.deinit();
+    const scratch = arena.allocator();
+
+    inline for (.{
+        .{ "1 = 1", true },
+        .{ "1 = 2", false },
+        .{ "1 != 2", true },
+        .{ "1 < 2", true },
+        .{ "2 < 1", false },
+        .{ "1 <= 1", true },
+        .{ "2 >= 2", true },
+        .{ "'abc' = 'abc'", true },
+        .{ "'abc' != 'abd'", true },
+    }) |case| {
+        const got = try evalDetachedForTest(scratch, case[0], 1, 1);
+        try testing.expect(got == .boolean);
+        try testing.expectEqual(case[1], got.boolean);
+    }
+}
+
+test "Evaluator: position() and last() reflect context" {
+    var arena = std.heap.ArenaAllocator.init(testing.allocator);
+    defer arena.deinit();
+    const scratch = arena.allocator();
+
+    // Context position 3 of 5.
+    const at = try evalDetachedForTest(scratch, "position()", 3, 5);
+    try testing.expectEqual(@as(f64, 3), at.number);
+
+    const total = try evalDetachedForTest(scratch, "last()", 3, 5);
+    try testing.expectEqual(@as(f64, 5), total.number);
+
+    // Logical short-circuit: last() never evaluates if first
+    // operand is true.
+    const either = try evalDetachedForTest(scratch, "1 = 1 or last() > 0", 1, 1);
+    try testing.expect(either.boolean);
+}
+
+test "Evaluator: short-circuit and/or" {
+    var arena = std.heap.ArenaAllocator.init(testing.allocator);
+    defer arena.deinit();
+    const scratch = arena.allocator();
+
+    inline for (.{
+        .{ "1 = 2 or 1 = 1", true },
+        .{ "1 = 1 and 1 = 2", false },
+        .{ "1 = 1 and 2 = 2", true },
+        .{ "1 = 2 and 1 = 1", false },
+        .{ "1 = 2 or 2 = 1", false },
+    }) |case| {
+        const got = try evalDetachedForTest(scratch, case[0], 1, 1);
+        try testing.expect(got == .boolean);
+        try testing.expectEqual(case[1], got.boolean);
+    }
+}
+
+test "Evaluator: unary minus" {
+    var arena = std.heap.ArenaAllocator.init(testing.allocator);
+    defer arena.deinit();
+    const scratch = arena.allocator();
+
+    const got = try evalDetachedForTest(scratch, "-(3 + 2)", 1, 1);
+    try testing.expectEqual(@as(f64, -5), got.number);
+}
+
+test "Evaluator: division by zero produces infinity / NaN per IEEE" {
+    var arena = std.heap.ArenaAllocator.init(testing.allocator);
+    defer arena.deinit();
+    const a = arena.allocator();
+    const ctx_dummy: *Node = @ptrFromInt(0x2000);
+
+    {
+        const expr = try
Parser.parse(a, "1 div 0"); + var ev = Evaluator{ .arena = a, .frame = @ptrFromInt(0x1000) }; + const r = try ev.evalExpr(expr, ctx_dummy, 1, 1); + try testing.expect(std.math.isPositiveInf(r.number)); + } + { + const expr = try Parser.parse(a, "0 div 0"); + var ev = Evaluator{ .arena = a, .frame = @ptrFromInt(0x1000) }; + const r = try ev.evalExpr(expr, ctx_dummy, 1, 1); + try testing.expect(std.math.isNan(r.number)); + } +} + +test "Evaluator: searchAll on scalar expression returns empty (decision #3)" { + var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + const a = arena.allocator(); + + // Synthetic frame/root pointers are safe here because pure-scalar + // expressions (binop, literal, true(), comparison) never reach into + // the Frame or the context node. Adding a DOM-touching expression + // (e.g. `id('x')`) to this list would crash on dereference. + inline for (.{ "1 + 2", "'hello'", "true()", "1 = 1" }) |expr| { + const nodes = try searchAll(a, @ptrFromInt(0x2000), expr, @ptrFromInt(0x1000)); + try testing.expectEqual(@as(usize, 0), nodes.len); + } +} diff --git a/src/browser/xpath/Parser.zig b/src/browser/xpath/Parser.zig new file mode 100644 index 00000000..60c82df5 --- /dev/null +++ b/src/browser/xpath/Parser.zig @@ -0,0 +1,957 @@ +// Copyright (C) 2023-2026 Lightpanda (Selecy SAS) +// +// Francis Bouvier +// Pierre Tachoire +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. 
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

//! XPath 1.0 expression parser.
//!
//! Recursive descent over a fully-tokenized stream, producing an
//! `ast.Expr` tree allocated on the caller's arena. The AST borrows
//! string/name slices from `input` and is valid for as long as the
//! arena and input outlive it.

const std = @import("std");

const Tokenizer = @import("Tokenizer.zig");
const ast = @import("ast.zig");

const Token = Tokenizer.Token;
const Allocator = std.mem.Allocator;

const Parser = @This();

/// Every error `parse` can return.
pub const Error = error{
    OutOfMemory,
    UnexpectedToken,
    ExpectedNodeTest,
    ExpectedPrimaryExpr,
    MaxDepthExceeded,
};

/// Cap recursive descent to keep adversarial input (e.g. `(((((...)))))`,
/// `------5`) from blowing the stack. Real XPath expressions never come
/// close to this; browsers typically allow several hundred.
const max_depth: u16 = 64;

/// Allocator every AST node and step list is created on.
arena: Allocator,
/// Full token stream; `parse` always terminates it with `.eof`.
tokens: []const Token,
/// Index into `tokens` of the token `peek` returns next.
pos: usize = 0,
/// Current nesting level, checked against `max_depth`.
depth: u16 = 0,

/// Tokenizes `input` in full, then parses it as a single expression.
///
/// All allocations (token list and AST) go through `arena`. Returns
/// `error.UnexpectedToken` when tokens remain after a complete
/// expression, and propagates any error raised by the grammar rules.
pub fn parse(arena: Allocator, input: []const u8) Error!*ast.Expr {
    var token_list: std.ArrayList(Token) = .empty;
    // Reserve one slot per four input bytes (never fewer than 8) so
    // typical expressions tokenize without regrowing the list; `append`
    // still grows it if the estimate turns out too small.
    try token_list.ensureTotalCapacity(arena, @max(8, input.len / 4));
    var tokenizer = Tokenizer{ .input = input };
    while (true) {
        const tok = tokenizer.next();
        try token_list.append(arena, tok);
        if (tok == .eof) break;
    }

    var parser = Parser{
        .arena = arena,
        .tokens = token_list.items,
    };
    const expr = try parser.parseExpr();
    // A well-formed expression consumes everything up to the terminator.
    if (parser.peek() != .eof) return error.UnexpectedToken;
    return expr;
}

// --- token cursor helpers ---

/// Returns the current token without consuming it. Shares the bounds
/// check of `lookahead`, so a cursor at or past the end yields `.eof`.
fn peek(self: *const Parser) Token {
    return self.lookahead(0);
}

/// Returns the token `offset` positions past the cursor without
/// consuming anything; `.eof` when that position is out of range.
fn lookahead(self: *const Parser, offset: usize) Token {
    const idx = self.pos + offset;
    if (idx >= self.tokens.len) return .eof;
    return self.tokens[idx];
}

/// Consumes and returns the current token. The cursor never moves past
/// `.eof`, so repeated calls at the end keep returning `.eof` and later
/// `peek` calls stay in bounds.
fn advance(self: *Parser) Token {
    const tok = self.peek();
    if (tok != .eof) self.pos += 1;
    return tok;
}

/// True when the current token has the given tag.
fn at(self: *const Parser, tag: std.meta.Tag(Token)) bool {
    return self.peek() == tag;
}

/// Consumes the current token if it has the given tag; reports whether
/// it did.
fn match(self: *Parser, tag: std.meta.Tag(Token)) bool {
    if (self.at(tag)) {
        _ = self.advance();
        return true;
    }
    return false;
}

/// Consumes and returns the current token if it has the given tag,
/// otherwise fails with `error.UnexpectedToken` without consuming.
fn expect(self: *Parser, tag: std.meta.Tag(Token)) Error!Token {
    if (!self.at(tag)) return error.UnexpectedToken;
    return self.advance();
}

/// Consumes the current token if it is a `.name` whose text equals
/// `keyword` (used for `or` / `and`); reports whether it did.
fn matchKeyword(self: *Parser, keyword: []const u8) bool {
    const tok = self.peek();
    if (tok == .name and std.mem.eql(u8, tok.name, keyword)) {
        _ = self.advance();
        return true;
    }
    return false;
}

/// Allocates an `ast.Expr` on the arena and initializes it with `value`.
fn makeExpr(self: *Parser, value: ast.Expr) Error!*ast.Expr {
    const expr = try self.arena.create(ast.Expr);
    expr.* = value;
    return expr;
}

/// Allocates a binary-operator node `left op right`.
fn makeBinop(self: *Parser, op: ast.BinOpKind, left: *ast.Expr, right: *ast.Expr) Error!*ast.Expr {
    return try self.makeExpr(.{ .binop = .{ .op = op, .left = left, .right = right } });
}

// --- operator-precedence chain ---
//
// Or → And → Equality → Relational → Additive → Mult → Unary → Union → Path

/// Entry point for a full expression. Each call counts as one nesting
/// level and fails with `error.MaxDepthExceeded` once `max_depth`
/// levels are already active.
fn parseExpr(self: *Parser) Error!*ast.Expr {
    if (self.depth >= max_depth) return error.MaxDepthExceeded;
    self.depth += 1;
    defer self.depth -= 1;
    return self.parseOrExpr();
}

/// Left-associative chain of `or` over and-expressions.
fn parseOrExpr(self: *Parser) Error!*ast.Expr {
    var left = try self.parseAndExpr();
    while (self.matchKeyword("or")) {
        const right = try self.parseAndExpr();
        left = try self.makeBinop(.or_, left, right);
    }
    return left;
}

/// Left-associative chain of `and` over equality expressions.
fn parseAndExpr(self: *Parser) Error!*ast.Expr {
    var left = try self.parseEqualityExpr();
    while (self.matchKeyword("and")) {
        const right = try self.parseEqualityExpr();
        left = try self.makeBinop(.and_, left, right);
    }
    return left;
}

/// Left-associative chain of the operators `equalityOp` recognizes.
fn parseEqualityExpr(self: *Parser) Error!*ast.Expr {
    var left = try self.parseRelationalExpr();
    while (equalityOp(self.peek())) |op| {
        _ = self.advance();
        const right = try self.parseRelationalExpr();
        left = try self.makeBinop(op, left, right);
    }
    return left;
}

/// Left-associative chain of the operators `relationalOp` recognizes.
fn parseRelationalExpr(self: *Parser) Error!*ast.Expr {
    var left = try self.parseAdditiveExpr();
    while (relationalOp(self.peek())) |op| {
        _ = self.advance();
        const right = try self.parseAdditiveExpr();
        left = try self.makeBinop(op, left, right);
    }
    return left;
}

/// Left-associative chain of the operators `additiveOp` recognizes.
fn parseAdditiveExpr(self: *Parser) Error!*ast.Expr {
    var left = try self.parseMultExpr();
    while (additiveOp(self.peek())) |op| {
        _ = self.advance();
        const right = try self.parseMultExpr();
        left = try self.makeBinop(op, left, right);
    }
    return left;
}

// After a complete unary expression, `*` is multiply; `div`/`mod` are
// operator-position keywords (tokenized as Name).
+fn parseMultExpr(self: *Parser) Error!*ast.Expr { + var left = try self.parseUnaryExpr(); + while (multOp(self.peek())) |op| { + _ = self.advance(); + const right = try self.parseUnaryExpr(); + left = try self.makeBinop(op, left, right); + } + return left; +} + +fn parseUnaryExpr(self: *Parser) Error!*ast.Expr { + if (self.match(.minus)) { + if (self.depth >= max_depth) return error.MaxDepthExceeded; + self.depth += 1; + defer self.depth -= 1; + const operand = try self.parseUnaryExpr(); + return try self.makeExpr(.{ .neg = operand }); + } + return self.parseUnionExpr(); +} + +fn parseUnionExpr(self: *Parser) Error!*ast.Expr { + var left = try self.parsePathExpr(); + while (self.match(.pipe)) { + const right = try self.parsePathExpr(); + left = try self.makeBinop(.union_, left, right); + } + return left; +} + +// --- path expressions --- + +fn parsePathExpr(self: *Parser) Error!*ast.Expr { + const t = self.peek(); + + if (t == .slash or t == .double_slash) { + return self.parseAbsPath(); + } + + // Filter-vs-relative-path disambiguation: a primary expression + // starts with `(`, string, number, `$`, or a `name(` where the + // name is *not* a node-type test (`node`/`text`/`comment`/`processing-instruction`). 
+ const is_filter = switch (t) { + .lparen, .string, .number, .dollar => true, + .name => |name| self.lookahead(1) == .lparen and !isNodeTypeName(name), + else => false, + }; + + if (is_filter) { + var primary = try self.parsePrimaryExpr(); + while (self.match(.lbracket)) { + const pred = try self.parseExpr(); + _ = try self.expect(.rbracket); + primary = try self.makeExpr(.{ .filter = .{ .expr = primary, .predicate = pred } }); + } + if (self.peek() == .slash or self.peek() == .double_slash) { + const dsl = self.advance() == .double_slash; + var steps: std.ArrayList(ast.Step) = .empty; + if (dsl) try steps.append(self.arena, descendantOrSelfStep()); + try self.parseRelStepsInto(&steps); + return try self.makeExpr(.{ .filter_path = .{ + .filter = primary, + .steps = steps.items, + } }); + } + return primary; + } + + return self.parseRelPath(); +} + +fn parseAbsPath(self: *Parser) Error!*ast.Expr { + var steps: std.ArrayList(ast.Step) = .empty; + if (self.match(.double_slash)) { + try steps.append(self.arena, descendantOrSelfStep()); + try self.parseRelStepsInto(&steps); + } else { + _ = try self.expect(.slash); + // `/` alone is the document root — no step required. 
+ if (self.canStartStep()) try self.parseRelStepsInto(&steps); + } + return try self.makeExpr(.{ .path = .{ + .absolute = true, + .steps = steps.items, + } }); +} + +fn parseRelPath(self: *Parser) Error!*ast.Expr { + var steps: std.ArrayList(ast.Step) = .empty; + try self.parseRelStepsInto(&steps); + return try self.makeExpr(.{ .path = .{ + .absolute = false, + .steps = steps.items, + } }); +} + +fn parseRelStepsInto(self: *Parser, steps: *std.ArrayList(ast.Step)) Error!void { + try steps.append(self.arena, try self.parseStep()); + while (self.peek() == .slash or self.peek() == .double_slash) { + if (self.advance() == .double_slash) { + try steps.append(self.arena, descendantOrSelfStep()); + } + try steps.append(self.arena, try self.parseStep()); + } +} + +fn canStartStep(self: *const Parser) bool { + return switch (self.peek()) { + .name, .star, .dot, .double_dot, .at => true, + else => false, + }; +} + +fn parseStep(self: *Parser) Error!ast.Step { + // Abbreviated steps `.` and `..` carry no axis, node-test, or + // predicates — predicates after `.` are a parse error per polyfill. 
+ if (self.match(.dot)) return abbreviatedStep(.self); + if (self.match(.double_dot)) return abbreviatedStep(.parent); + + var axis: ast.Axis = .child; + if (self.match(.at)) { + axis = .attribute; + } else if (self.peek() == .name and self.lookahead(1) == .double_colon) { + const axis_name = self.advance().name; + _ = self.advance(); // `::` + axis = parseAxisName(axis_name); + } + + const node_test = try self.parseNodeTest(); + + var preds: std.ArrayList(*ast.Expr) = .empty; + while (self.match(.lbracket)) { + const pred = try self.parseExpr(); + _ = try self.expect(.rbracket); + try preds.append(self.arena, pred); + } + + return .{ .axis = axis, .node_test = node_test, .predicates = preds.items }; +} + +fn parseNodeTest(self: *Parser) Error!ast.NodeTest { + if (self.match(.star)) return .{ .name = "*" }; + if (self.peek() != .name) return error.ExpectedNodeTest; + + const name = self.peek().name; + if (typeTestKind(name)) |type_test| { + if (self.lookahead(1) == .lparen) { + _ = self.advance(); // name + _ = self.advance(); // `(` + // `processing-instruction("target")` consumes the literal but ignores it (decision #3 stub). 
+ if (type_test == .processing_instruction and self.peek() == .string) { + _ = self.advance(); + } + _ = try self.expect(.rparen); + return .{ .type_test = type_test }; + } + } + _ = self.advance(); + return .{ .name = name }; +} + +fn parsePrimaryExpr(self: *Parser) Error!*ast.Expr { + switch (self.peek()) { + .string => |s| { + _ = self.advance(); + return try self.makeExpr(.{ .literal = s }); + }, + .number => |n| { + _ = self.advance(); + return try self.makeExpr(.{ .number = n }); + }, + .dollar => { + _ = self.advance(); + const name_tok = try self.expect(.name); + return try self.makeExpr(.{ .var_ref = name_tok.name }); + }, + .lparen => { + _ = self.advance(); + const e = try self.parseExpr(); + _ = try self.expect(.rparen); + return e; + }, + .name => |name| { + _ = self.advance(); + _ = try self.expect(.lparen); + var args: std.ArrayList(*ast.Expr) = .empty; + if (self.peek() != .rparen) { + try args.append(self.arena, try self.parseExpr()); + while (self.match(.comma)) { + try args.append(self.arena, try self.parseExpr()); + } + } + _ = try self.expect(.rparen); + return try self.makeExpr(.{ .fn_call = .{ .name = name, .args = args.items } }); + }, + else => return error.ExpectedPrimaryExpr, + } +} + +// --- pure helpers --- + +fn equalityOp(t: Token) ?ast.BinOpKind { + return switch (t) { + .eq => .eq, + .neq => .neq, + else => null, + }; +} + +fn relationalOp(t: Token) ?ast.BinOpKind { + return switch (t) { + .lt => .lt, + .gt => .gt, + .lte => .lte, + .gte => .gte, + else => null, + }; +} + +fn additiveOp(t: Token) ?ast.BinOpKind { + return switch (t) { + .plus => .add, + .minus => .sub, + else => null, + }; +} + +fn multOp(t: Token) ?ast.BinOpKind { + return switch (t) { + .star => .mul, + .name => |name| blk: { + if (std.mem.eql(u8, name, "div")) break :blk .div; + if (std.mem.eql(u8, name, "mod")) break :blk .mod; + break :blk null; + }, + else => null, + }; +} + +fn descendantOrSelfStep() ast.Step { + return .{ + .axis = .descendant_or_self, + 
.node_test = .{ .type_test = .node }, + .predicates = &.{}, + }; +} + +fn abbreviatedStep(axis: ast.Axis) ast.Step { + return .{ + .axis = axis, + .node_test = .{ .type_test = .node }, + .predicates = &.{}, + }; +} + +fn isNodeTypeName(name: []const u8) bool { + return typeTestKind(name) != null; +} + +const type_test_lookup = std.StaticStringMap(ast.TypeTest).initComptime(.{ + .{ "node", .node }, + .{ "text", .text }, + .{ "comment", .comment }, + .{ "processing-instruction", .processing_instruction }, +}); + +fn typeTestKind(name: []const u8) ?ast.TypeTest { + return type_test_lookup.get(name); +} + +const axis_lookup = std.StaticStringMap(ast.Axis).initComptime(.{ + .{ "child", .child }, + .{ "descendant", .descendant }, + .{ "descendant-or-self", .descendant_or_self }, + .{ "self", .self }, + .{ "parent", .parent }, + .{ "ancestor", .ancestor }, + .{ "ancestor-or-self", .ancestor_or_self }, + .{ "following-sibling", .following_sibling }, + .{ "preceding-sibling", .preceding_sibling }, + .{ "following", .following }, + .{ "preceding", .preceding }, + .{ "attribute", .attribute }, + .{ "namespace", .namespace }, +}); + +fn parseAxisName(name: []const u8) ast.Axis { + return axis_lookup.get(name) orelse .unknown; +} + +// --------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------- + +const testing = std.testing; + +fn parseFixture(input: []const u8) !struct { arena: std.heap.ArenaAllocator, expr: *ast.Expr } { + var arena = std.heap.ArenaAllocator.init(testing.allocator); + errdefer arena.deinit(); + const expr = try parse(arena.allocator(), input); + return .{ .arena = arena, .expr = expr }; +} + +test "XPath.Parser: number literal" { + var fx = try parseFixture("42"); + defer fx.arena.deinit(); + try testing.expectEqual(@as(f64, 42), fx.expr.number); +} + +test "XPath.Parser: string literal" { + var fx = try parseFixture("'hello'"); + defer fx.arena.deinit(); + try 
testing.expectEqualStrings("hello", fx.expr.literal); +} + +test "XPath.Parser: variable reference strips $" { + var fx = try parseFixture("$x"); + defer fx.arena.deinit(); + try testing.expectEqualStrings("x", fx.expr.var_ref); +} + +test "XPath.Parser: parenthesized expression unwraps" { + var fx = try parseFixture("(42)"); + defer fx.arena.deinit(); + try testing.expectEqual(@as(f64, 42), fx.expr.number); +} + +test "XPath.Parser: function call with no args" { + var fx = try parseFixture("position()"); + defer fx.arena.deinit(); + try testing.expectEqualStrings("position", fx.expr.fn_call.name); + try testing.expectEqual(@as(usize, 0), fx.expr.fn_call.args.len); +} + +test "XPath.Parser: function call with args" { + var fx = try parseFixture("substring('abc', 2, 1)"); + defer fx.arena.deinit(); + const fc = fx.expr.fn_call; + try testing.expectEqualStrings("substring", fc.name); + try testing.expectEqual(@as(usize, 3), fc.args.len); + try testing.expectEqualStrings("abc", fc.args[0].literal); + try testing.expectEqual(@as(f64, 2), fc.args[1].number); + try testing.expectEqual(@as(f64, 1), fc.args[2].number); +} + +test "XPath.Parser: arithmetic precedence — mul binds tighter than add" { + var fx = try parseFixture("1 + 2 * 3"); + defer fx.arena.deinit(); + // Expected AST: add(1, mul(2, 3)) + const top = fx.expr.binop; + try testing.expectEqual(ast.BinOpKind.add, top.op); + try testing.expectEqual(@as(f64, 1), top.left.number); + const mul = top.right.binop; + try testing.expectEqual(ast.BinOpKind.mul, mul.op); + try testing.expectEqual(@as(f64, 2), mul.left.number); + try testing.expectEqual(@as(f64, 3), mul.right.number); +} + +test "XPath.Parser: arithmetic left-associativity" { + var fx = try parseFixture("1 - 2 - 3"); + defer fx.arena.deinit(); + // Expected AST: sub(sub(1, 2), 3) + const top = fx.expr.binop; + try testing.expectEqual(ast.BinOpKind.sub, top.op); + try testing.expectEqual(@as(f64, 3), top.right.number); + const inner = top.left.binop; + try 
testing.expectEqual(ast.BinOpKind.sub, inner.op); + try testing.expectEqual(@as(f64, 1), inner.left.number); + try testing.expectEqual(@as(f64, 2), inner.right.number); +} + +test "XPath.Parser: div and mod are operator-position keywords" { + var fx = try parseFixture("7 div 2"); + defer fx.arena.deinit(); + try testing.expectEqual(ast.BinOpKind.div, fx.expr.binop.op); + + var fx2 = try parseFixture("7 mod 2"); + defer fx2.arena.deinit(); + try testing.expectEqual(ast.BinOpKind.mod, fx2.expr.binop.op); +} + +test "XPath.Parser: comparison operators" { + inline for (.{ + .{ "1 = 2", ast.BinOpKind.eq }, + .{ "1 != 2", ast.BinOpKind.neq }, + .{ "1 < 2", ast.BinOpKind.lt }, + .{ "1 <= 2", ast.BinOpKind.lte }, + .{ "1 > 2", ast.BinOpKind.gt }, + .{ "1 >= 2", ast.BinOpKind.gte }, + }) |case| { + var fx = try parseFixture(case[0]); + defer fx.arena.deinit(); + try testing.expectEqual(case[1], fx.expr.binop.op); + } +} + +test "XPath.Parser: logical or/and short-circuit chain" { + var fx = try parseFixture("a or b and c"); + defer fx.arena.deinit(); + // Expected AST: or(path(a), and(path(b), path(c))) — and binds tighter + const top = fx.expr.binop; + try testing.expectEqual(ast.BinOpKind.or_, top.op); + try testing.expectEqual(ast.BinOpKind.and_, top.right.binop.op); +} + +test "XPath.Parser: unary minus" { + var fx = try parseFixture("-1"); + defer fx.arena.deinit(); + try testing.expectEqual(@as(f64, 1), fx.expr.neg.number); +} + +test "XPath.Parser: union" { + var fx = try parseFixture("a | b"); + defer fx.arena.deinit(); + try testing.expectEqual(ast.BinOpKind.union_, fx.expr.binop.op); +} + +test "XPath.Parser: absolute path / alone is document root" { + var fx = try parseFixture("/"); + defer fx.arena.deinit(); + const path = fx.expr.path; + try testing.expect(path.absolute); + try testing.expectEqual(@as(usize, 0), path.steps.len); +} + +test "XPath.Parser: absolute path /foo" { + var fx = try parseFixture("/foo"); + defer fx.arena.deinit(); + const path = 
fx.expr.path; + try testing.expect(path.absolute); + try testing.expectEqual(@as(usize, 1), path.steps.len); + try testing.expectEqualStrings("foo", path.steps[0].node_test.name); +} + +test "XPath.Parser: //foo expands to descendant-or-self::node()/foo" { + var fx = try parseFixture("//foo"); + defer fx.arena.deinit(); + const path = fx.expr.path; + try testing.expect(path.absolute); + try testing.expectEqual(@as(usize, 2), path.steps.len); + try testing.expectEqual(ast.Axis.descendant_or_self, path.steps[0].axis); + try testing.expectEqual(ast.TypeTest.node, path.steps[0].node_test.type_test); + try testing.expectEqualStrings("foo", path.steps[1].node_test.name); +} + +test "XPath.Parser: relative path child::foo/bar" { + var fx = try parseFixture("foo/bar"); + defer fx.arena.deinit(); + const path = fx.expr.path; + try testing.expect(!path.absolute); + try testing.expectEqual(@as(usize, 2), path.steps.len); + try testing.expectEqual(ast.Axis.child, path.steps[0].axis); + try testing.expectEqualStrings("foo", path.steps[0].node_test.name); + try testing.expectEqualStrings("bar", path.steps[1].node_test.name); +} + +test "XPath.Parser: abbreviated steps . and .." 
{ + var fx = try parseFixture("./.."); + defer fx.arena.deinit(); + const path = fx.expr.path; + try testing.expectEqual(@as(usize, 2), path.steps.len); + try testing.expectEqual(ast.Axis.self, path.steps[0].axis); + try testing.expectEqual(ast.Axis.parent, path.steps[1].axis); +} + +test "XPath.Parser: attribute axis @class" { + var fx = try parseFixture("@class"); + defer fx.arena.deinit(); + const step = fx.expr.path.steps[0]; + try testing.expectEqual(ast.Axis.attribute, step.axis); + try testing.expectEqualStrings("class", step.node_test.name); +} + +test "XPath.Parser: all 12 named axes parse correctly" { + inline for (.{ + .{ "child::a", ast.Axis.child }, + .{ "descendant::a", ast.Axis.descendant }, + .{ "descendant-or-self::a", ast.Axis.descendant_or_self }, + .{ "self::a", ast.Axis.self }, + .{ "parent::a", ast.Axis.parent }, + .{ "ancestor::a", ast.Axis.ancestor }, + .{ "ancestor-or-self::a", ast.Axis.ancestor_or_self }, + .{ "following-sibling::a", ast.Axis.following_sibling }, + .{ "preceding-sibling::a", ast.Axis.preceding_sibling }, + .{ "following::a", ast.Axis.following }, + .{ "preceding::a", ast.Axis.preceding }, + .{ "namespace::a", ast.Axis.namespace }, + }) |case| { + var fx = try parseFixture(case[0]); + defer fx.arena.deinit(); + try testing.expectEqual(case[1], fx.expr.path.steps[0].axis); + } +} + +test "XPath.Parser: unknown axis name maps to .unknown — polyfill parity" { + var fx = try parseFixture("wibble::a"); + defer fx.arena.deinit(); + try testing.expectEqual(ast.Axis.unknown, fx.expr.path.steps[0].axis); +} + +test "XPath.Parser: wildcard *" { + var fx = try parseFixture("*"); + defer fx.arena.deinit(); + try testing.expectEqualStrings("*", fx.expr.path.steps[0].node_test.name); +} + +test "XPath.Parser: namespace-prefixed name and wildcard" { + var fx = try parseFixture("svg:rect"); + defer fx.arena.deinit(); + try testing.expectEqualStrings("svg:rect", fx.expr.path.steps[0].node_test.name); + + var fx2 = try parseFixture("svg:*"); 
+ defer fx2.arena.deinit(); + try testing.expectEqualStrings("svg:*", fx2.expr.path.steps[0].node_test.name); +} + +test "XPath.Parser: node-type tests" { + inline for (.{ + .{ "node()", ast.TypeTest.node }, + .{ "text()", ast.TypeTest.text }, + .{ "comment()", ast.TypeTest.comment }, + .{ "processing-instruction()", ast.TypeTest.processing_instruction }, + }) |case| { + var fx = try parseFixture(case[0]); + defer fx.arena.deinit(); + try testing.expectEqual(case[1], fx.expr.path.steps[0].node_test.type_test); + } +} + +test "XPath.Parser: processing-instruction with literal target — consumed but ignored" { + var fx = try parseFixture("processing-instruction('xml-stylesheet')"); + defer fx.arena.deinit(); + try testing.expectEqual(ast.TypeTest.processing_instruction, fx.expr.path.steps[0].node_test.type_test); +} + +test "XPath.Parser: predicate on step" { + var fx = try parseFixture("p[1]"); + defer fx.arena.deinit(); + const step = fx.expr.path.steps[0]; + try testing.expectEqual(@as(usize, 1), step.predicates.len); + try testing.expectEqual(@as(f64, 1), step.predicates[0].number); +} + +test "XPath.Parser: multi-predicate step" { + var fx = try parseFixture("p[1][@x]"); + defer fx.arena.deinit(); + const step = fx.expr.path.steps[0]; + try testing.expectEqual(@as(usize, 2), step.predicates.len); +} + +test "XPath.Parser: filter expression with predicate parses as Filter, not Step" { + var fx = try parseFixture("(//a)[1]"); + defer fx.arena.deinit(); + // Top level is Filter wrapping a parenthesized path with one predicate. 
+ const filt = fx.expr.filter; + try testing.expectEqual(@as(f64, 1), filt.predicate.number); + try testing.expect(filt.expr.path.absolute); +} + +test "XPath.Parser: filter with multi-predicate nests" { + var fx = try parseFixture("(//a)[1][2]"); + defer fx.arena.deinit(); + const outer = fx.expr.filter; + try testing.expectEqual(@as(f64, 2), outer.predicate.number); + const inner = outer.expr.filter; + try testing.expectEqual(@as(f64, 1), inner.predicate.number); +} + +test "XPath.Parser: filter with location-path tail (filter_path)" { + var fx = try parseFixture("(//a)/b"); + defer fx.arena.deinit(); + const fp = fx.expr.filter_path; + try testing.expect(fp.filter.path.absolute); + try testing.expectEqual(@as(usize, 1), fp.steps.len); + try testing.expectEqualStrings("b", fp.steps[0].node_test.name); +} + +test "XPath.Parser: filter with // tail prepends descendant-or-self" { + var fx = try parseFixture("(//a)//b"); + defer fx.arena.deinit(); + const fp = fx.expr.filter_path; + try testing.expectEqual(@as(usize, 2), fp.steps.len); + try testing.expectEqual(ast.Axis.descendant_or_self, fp.steps[0].axis); + try testing.expectEqualStrings("b", fp.steps[1].node_test.name); +} + +test "XPath.Parser: function call followed by predicate" { + var fx = try parseFixture("id('x')[1]"); + defer fx.arena.deinit(); + const filt = fx.expr.filter; + try testing.expectEqual(@as(f64, 1), filt.predicate.number); + try testing.expectEqualStrings("id", filt.expr.fn_call.name); +} + +test "XPath.Parser: complex representative expression" { + var fx = try parseFixture("//div[@class='active']/p[position()<=last()-1]"); + defer fx.arena.deinit(); + const path = fx.expr.path; + try testing.expect(path.absolute); + try testing.expectEqual(@as(usize, 3), path.steps.len); + try testing.expectEqual(ast.Axis.descendant_or_self, path.steps[0].axis); + try testing.expectEqualStrings("div", path.steps[1].node_test.name); + try testing.expectEqual(@as(usize, 1), path.steps[1].predicates.len); + 
try testing.expectEqualStrings("p", path.steps[2].node_test.name); + try testing.expectEqual(@as(usize, 1), path.steps[2].predicates.len); +} + +fn expectParseError(input: []const u8, expected: anyerror) !void { + var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + try testing.expectError(expected, parse(arena.allocator(), input)); +} + +test "XPath.Parser: error on unbalanced paren" { + try expectParseError("(1", error.UnexpectedToken); +} + +test "XPath.Parser: error on unbalanced bracket" { + try expectParseError("p[1", error.UnexpectedToken); +} + +test "XPath.Parser: error on missing node test" { + try expectParseError("child::", error.ExpectedNodeTest); +} + +test "XPath.Parser: bare `+` falls through to step and reports missing node test" { + // Matches polyfill: + isn't a path/primary start, so the parser + // ends up in parseStep with no name to use as node test. + try expectParseError("+", error.ExpectedNodeTest); +} + +test "XPath.Parser: error on trailing tokens" { + try expectParseError("1 2", error.UnexpectedToken); +} + +test "XPath.Parser: empty string falls through to step and reports missing node test" { + try expectParseError("", error.ExpectedNodeTest); +} + +test "XPath.Parser: 91-case battery — every expression parses" { + // 91-case XPath 1.0 conformance battery covering every expression + // shape the public API surface accepts. Each entry must parse + // without error. 
+ const battery = [_][]const u8{ + "/html", + "/html/body", + "/", + "//h1", + "//ul/li", + "//ul//li", + ".", + ".//li", + "//section/*", + "//*[@id='heading']", + "//li[1]/following-sibling::li", + "//li[5]/preceding-sibling::li", + "//li/parent::ul", + "//li/ancestor::body", + "//li/ancestor-or-self::body", + "//li[3]/preceding::li", + "//li[1]/following::li", + "//ul/descendant::li", + "//ul/descendant-or-self::li", + "//section[1]/child::span", + "//*[@id='heading']/self::h1", + "//a[1]/attribute::href", + "//a[1]/@*", + "//li[1]", + "//li[last()]", + "//li[last() - 1]", + "//li[position() = 1]", + "//li[position() > 2]", + "//li[position() mod 2 = 1]", + "(//li)[1]", + "(//section)[2]", + "//li[3]/preceding-sibling::li[1]", + "//li[5]/ancestor::*[1]", + "//li[contains(concat(' ', @class, ' '), ' even ')][2]", + "//*[@id='heading' and @class='primary']", + "//*[@id='heading' or @id='p1']", + "//section[a]", + "//section[count(span) = 2]", + "//ul[count(li) = 5]", + "//tr[td[1]]", + "//tr[td/text() = 'Bob']", + "//*[starts-with(@id, 'link')]", + "//*[normalize-space() = 'Hello World']", + "//*[normalize-space(.) 
= 'Item 1']", + "//*[concat(@id, '-x') = 'heading-x']", + "//*[substring(@id, 1, 1) = 'p']", + "//*[substring(@id, 2, 1) = '1' and starts-with(@id, 'p')]", + "//p[translate(@id, 'p', 'q') = 'q1']", + "//*[substring-before(@id, '1') = 'p']", + "//*[substring-after(@id, 'lin') = 'k1']", + "//tr[number(td[2]) > 28]", + "//tr[floor(number(td[2]) div 10) = 3]", + "//tr[ceiling(number(td[2]) div 10) = 3]", + "//tr[round(number(td[2]) div 10) = 3]", + "//ul[sum(li/@data-len) = 0]", + "//p[boolean(@lang)]", + "//*[false()]", + "//*[name() = 'h1']", + "//*[local-name() = 'h1']", + "id('heading')", + "id('heading p1')", + "id(//em/parent::p/@id)", + "//h1 | //title", + "//h1 | //*[@id='p1']", + "//*[@id='heading'] | //*[@id='heading']", + "//li[position() + 1 = 3]", + "//li[position() - 1 = 0]", + "//li[position() * 2 = 4]", + "//li[position() div 2 = 1]", + "//li[(position() mod 2) = 0]", + "//tr[number(td[2]) = 30]", + "//tr[number(td[2]) != 30]", + "//tr[number(td[2]) < 30]", + "//tr[number(td[2]) <= 30]", + "//tr[number(td[2]) > 30]", + "//tr[number(td[2]) >= 30]", + "//tr[td[2] = 30]", + "//tr[td[2] = '30']", + "//comment()", + ".//a[contains(normalize-space(string(.)), 'Click me')]", + ".//input[(./@type = 'text')]", + ".//*[@id='heading']", + ".//li[contains(concat(' ', @class, ' '), ' even ')]", + "//*[@id='heading']/text()", + "//em/parent::p", + "//p[em]", + "//p[not(em)]", + "//section[a/@href = '/foo']", + "//ul/li[last()][position() = last()]", + "//ul[string(count(li)) = '5']", + "//body[count(//*[contains(@class, 'item')]) = 5]", + }; + try testing.expectEqual(@as(usize, 91), battery.len); + + for (battery) |expr| { + var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + _ = parse(arena.allocator(), expr) catch |err| { + std.debug.print("\n failed to parse: {s}\n error: {s}\n", .{ expr, @errorName(err) }); + return err; + }; + } +} + +test "XPath.Parser: deep parenthesization rejected past max_depth" { + var arena = 
std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + var buf: std.ArrayList(u8) = .empty; + defer buf.deinit(testing.allocator); + try buf.appendNTimes(testing.allocator, '(', max_depth + 1); + try buf.append(testing.allocator, '1'); + try buf.appendNTimes(testing.allocator, ')', max_depth + 1); + try testing.expectError(error.MaxDepthExceeded, parse(arena.allocator(), buf.items)); +} + +test "XPath.Parser: deep unary minus rejected past max_depth" { + var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + var buf: std.ArrayList(u8) = .empty; + defer buf.deinit(testing.allocator); + try buf.appendNTimes(testing.allocator, '-', max_depth + 1); + try buf.append(testing.allocator, '1'); + try testing.expectError(error.MaxDepthExceeded, parse(arena.allocator(), buf.items)); +} diff --git a/src/browser/xpath/Tokenizer.zig b/src/browser/xpath/Tokenizer.zig new file mode 100644 index 00000000..6dd8d279 --- /dev/null +++ b/src/browser/xpath/Tokenizer.zig @@ -0,0 +1,464 @@ +// Copyright (C) 2023-2026 Lightpanda (Selecy SAS) +// +// Francis Bouvier +// Pierre Tachoire +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +//! XPath 1.0 expression tokenizer. +//! +//! HTML-pragmatic behavior: lenient whitespace, case-preserving names, +//! no escape processing in string literals (use the other quote type +//! 
//! to embed), unknown characters silently skipped.
//!
//! The tokenizer borrows from the input slice and never allocates.
//! `next()` always returns a token; `.eof` is terminal and idempotent.

const std = @import("std");

const Tokenizer = @This();

/// One lexical unit of an XPath expression. Slice payloads point into the
/// tokenizer's `input`; nothing is copied.
pub const Token = union(enum) {
    /// Contents of a `'...'` or `"..."` literal without the surrounding
    /// quotes. No escape sequences are interpreted (the polyfill takes
    /// the raw substring).
    string: []const u8,

    /// Numeric literal in any of the forms `123`, `1.5`, `.5`, `5.`,
    /// stored as f64 to match the runtime number type.
    number: f64,

    /// Bare identifier: an element, function or axis name, one of the
    /// `or` / `and` / `div` / `mod` keywords, or a namespace-prefixed
    /// name (`prefix:local`, `prefix:*`). The colon and the optional
    /// wildcard are kept verbatim so the parser can split them.
    name: []const u8,

    slash, // `/`
    double_slash, // `//`
    dot, // `.`
    double_dot, // `..`
    at, // `@`
    lparen, // `(`
    rparen, // `)`
    lbracket, // `[`
    rbracket, // `]`
    comma, // `,`
    pipe, // `|`
    eq, // `=`
    neq, // `!=`
    lt, // `<`
    lte, // `<=`
    gt, // `>`
    gte, // `>=`
    plus, // `+`
    minus, // `-`
    star, // `*`
    dollar, // `$`
    double_colon, // `::`
    eof,
};

/// Expression text being scanned.
input: []const u8,
/// Byte offset of the next unread byte in `input`.
position: usize = 0,

/// True once `position` has reached (or passed) the end of `input`.
fn isEof(self: *const Tokenizer) bool {
    return self.input.len <= self.position;
}

// True iff the input has at least `n` bytes left after the current one
// — i.e. `byteAt(n)` will not read past the end.
+fn hasAtLeast(self: *const Tokenizer, n: usize) bool { + return self.position + n < self.input.len; +} + +fn byteAt(self: *const Tokenizer, offset: usize) u8 { + return self.input[self.position + offset]; +} + +fn skipWhitespace(self: *Tokenizer) void { + while (!self.isEof()) { + switch (self.input[self.position]) { + ' ', '\t', '\n', '\r' => self.position += 1, + else => return, + } + } +} + +fn isNameStart(c: u8) bool { + return (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z') or c == '_'; +} + +fn isNameContinue(c: u8) bool { + return isNameStart(c) or std.ascii.isDigit(c) or c == '-' or c == '.'; +} + +fn consumeString(self: *Tokenizer, quote: u8) Token { + self.position += 1; // opening quote + const start = self.position; + while (!self.isEof() and self.input[self.position] != quote) { + self.position += 1; + } + const value = self.input[start..self.position]; + // Closing quote skipped; at EOF we just emit what we have (polyfill parity). + if (!self.isEof()) self.position += 1; + return .{ .string = value }; +} + +fn consumeNumber(self: *Tokenizer) Token { + const start = self.position; + while (!self.isEof() and std.ascii.isDigit(self.input[self.position])) { + self.position += 1; + } + if (!self.isEof() and self.input[self.position] == '.') { + self.position += 1; + while (!self.isEof() and std.ascii.isDigit(self.input[self.position])) { + self.position += 1; + } + } + // Caller only enters consumeNumber on a digit or `.digit`, so the + // slice is always `\d+(\.\d*)?` or `\.\d+` — both accepted by + // parseFloat (verified against Zig 0.15.2). + const value = std.fmt.parseFloat(f64, self.input[start..self.position]) catch unreachable; + return .{ .number = value }; +} + +fn consumeName(self: *Tokenizer) Token { + const start = self.position; + while (!self.isEof() and isNameContinue(self.input[self.position])) { + self.position += 1; + } + + // Optional namespace prefix: `prefix:local` or `prefix:*`. 
A `::` + // is the axis separator and belongs to the next token, so peek + // for a single `:` not followed by another `:`. + if (!self.isEof() and self.input[self.position] == ':' and + (self.position + 1 >= self.input.len or self.input[self.position + 1] != ':')) + { + self.position += 1; // `:` + if (!self.isEof() and self.input[self.position] == '*') { + self.position += 1; + } else { + while (!self.isEof() and isNameContinue(self.input[self.position])) { + self.position += 1; + } + } + } + + return .{ .name = self.input[start..self.position] }; +} + +pub fn next(self: *Tokenizer) Token { + while (true) { + self.skipWhitespace(); + if (self.isEof()) return .eof; + + const c = self.byteAt(0); + + if (c == '"' or c == '\'') { + return self.consumeString(c); + } + + if (std.ascii.isDigit(c) or (c == '.' and self.hasAtLeast(1) and std.ascii.isDigit(self.byteAt(1)))) { + return self.consumeNumber(); + } + + if (self.hasAtLeast(1)) { + const c2 = self.byteAt(1); + switch (c) { + '/' => if (c2 == '/') { + self.position += 2; + return .double_slash; + }, + ':' => if (c2 == ':') { + self.position += 2; + return .double_colon; + }, + '!' => if (c2 == '=') { + self.position += 2; + return .neq; + }, + '<' => if (c2 == '=') { + self.position += 2; + return .lte; + }, + '>' => if (c2 == '=') { + self.position += 2; + return .gte; + }, + '.' => if (c2 == '.') { + self.position += 2; + return .double_dot; + }, + else => {}, + } + } + + const single: ?Token = switch (c) { + '(' => .lparen, + ')' => .rparen, + '[' => .lbracket, + ']' => .rbracket, + ',' => .comma, + '|' => .pipe, + '=' => .eq, + '<' => .lt, + '>' => .gt, + '+' => .plus, + '-' => .minus, + '*' => .star, + '$' => .dollar, + '/' => .slash, + '@' => .at, + '.' => .dot, + else => null, + }; + if (single) |tok| { + self.position += 1; + return tok; + } + + if (isNameStart(c)) { + return self.consumeName(); + } + + // Polyfill parity (decision #2): unknown characters are + // silently skipped, never an error. 
+ self.position += 1; + } +} + +const testing = std.testing; + +fn expectTokens(input: []const u8, expected: []const Token) !void { + var tokenizer = Tokenizer{ .input = input }; + for (expected) |exp| { + const got = tokenizer.next(); + try testing.expectEqualDeep(exp, got); + } +} + +test "XPath.Tokenizer: empty input emits EOF" { + try expectTokens("", &.{.eof}); +} + +test "XPath.Tokenizer: only whitespace emits EOF" { + try expectTokens(" \t\n\r ", &.{.eof}); +} + +test "XPath.Tokenizer: EOF idempotent past end" { + var t = Tokenizer{ .input = "" }; + try testing.expectEqual(Token.eof, t.next()); + try testing.expectEqual(Token.eof, t.next()); + try testing.expectEqual(Token.eof, t.next()); +} + +test "XPath.Tokenizer: single-char operators" { + try expectTokens("()[],|=<>+-*$/@.", &.{ + .lparen, .rparen, .lbracket, .rbracket, .comma, .pipe, + .eq, .lt, .gt, .plus, .minus, .star, + .dollar, .slash, .at, .dot, .eof, + }); +} + +test "XPath.Tokenizer: two-char operators" { + try expectTokens("// :: != <= >= ..", &.{ + .double_slash, .double_colon, .neq, .lte, .gte, .double_dot, .eof, + }); +} + +test "XPath.Tokenizer: two-char vs single-char disambiguation" { + try expectTokens("/a/b", &.{ + .slash, .{ .name = "a" }, .slash, .{ .name = "b" }, .eof, + }); + try expectTokens("//a", &.{ .double_slash, .{ .name = "a" }, .eof }); + try expectTokens("a +// Pierre Tachoire +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. 
+// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +//! XPath 1.0 AST. +//! +//! Slices and pointers are arena-owned by the Parser; the AST has no +//! destructor. + +pub const Expr = union(enum) { + /// Absolute or relative location path: `/foo/bar`, `//x`, `foo/bar`. + path: Path, + /// Filter expression followed by a location-path tail: + /// `(//a)/b`, `(expr)//c`. + filter_path: FilterPath, + /// Filter expression with a single predicate: `(expr)[n]`. + /// Multi-predicate filters nest: `(e)[1][2]` → filter(filter(e,1),2). + filter: Filter, + binop: BinOp, + /// Unary minus. The polyfill has no unary `+`. + neg: *Expr, + /// String literal, quotes stripped. + literal: []const u8, + /// Numeric literal, parsed to f64. + number: f64, + /// Variable reference. The leading `$` is stripped; per decision #3 + /// the evaluator always returns the empty string. + var_ref: []const u8, + fn_call: FnCall, +}; + +pub const Path = struct { + absolute: bool, + steps: []const Step, +}; + +pub const FilterPath = struct { + filter: *Expr, + steps: []const Step, +}; + +pub const Filter = struct { + expr: *Expr, + predicate: *Expr, +}; + +pub const BinOp = struct { + op: BinOpKind, + left: *Expr, + right: *Expr, +}; + +pub const BinOpKind = enum { + or_, + and_, + eq, + neq, + lt, + gt, + lte, + gte, + add, + sub, + mul, + div, + mod, + union_, +}; + +pub const FnCall = struct { + name: []const u8, + args: []const *Expr, +}; + +pub const Step = struct { + axis: Axis, + node_test: NodeTest, + predicates: []const *Expr, +}; + +pub const Axis = enum { + child, + descendant, + descendant_or_self, + self, + parent, + ancestor, + ancestor_or_self, + following_sibling, + preceding_sibling, + following, + preceding, + attribute, + namespace, + /// Polyfill parity (decision #2): unknown axis names parse to + /// this variant; the evaluator returns an empty node-set. 
+ unknown, +}; + +pub const NodeTest = union(enum) { + /// Element / attribute name. `"*"` is the wildcard. Namespaced forms + /// (`prefix:*`, `prefix:local`) are stored verbatim — the evaluator + /// does not split them, so they fall through to a literal `mem.eql` + /// against the node name (consistent with the `namespace::` axis stub + /// per decision #3). + /// TODO: real namespace support if the polyfill ever drops the stub. + name: []const u8, + /// `node()`, `text()`, `comment()`, `processing-instruction()`. + /// The optional target literal of `processing-instruction("foo")` + /// is consumed but not stored (decision #3 stub). + type_test: TypeTest, +}; + +pub const TypeTest = enum { + node, + text, + comment, + processing_instruction, +}; diff --git a/src/browser/xpath/functions.zig b/src/browser/xpath/functions.zig new file mode 100644 index 00000000..d8d42de4 --- /dev/null +++ b/src/browser/xpath/functions.zig @@ -0,0 +1,630 @@ +// Copyright (C) 2023-2026 Lightpanda (Selecy SAS) +// +// Francis Bouvier +// Pierre Tachoire +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +//! XPath 1.0 core function library — 25 functions covering the spec's +//! core function set. `position()` and `last()` live in +//! `Evaluator.evalFnCall` because they need the `(pos, size)` closure +//! that this module never sees. +//! +//! 
Args are pre-evaluated by the caller (`Evaluator.evalFnCall`). Eager +//! evaluation is fine here — short-circuit operators (`or`/`and`) are +//! binops, not function calls, so laziness isn't required. The +//! pre-evaluation contract also keeps functions.zig free of a circular +//! import on Evaluator.zig. +//! +//! Stubs per decision #3: +//! - `lang(string)` → always false +//! - `namespace-uri(...)` → always "" +//! - `name`/`local-name` → lowercased (HTML pragmatism) +//! +//! Allocations land in the caller's per-evaluation arena. + +const std = @import("std"); +const lp = @import("lightpanda"); + +const Node = @import("../webapi/Node.zig"); + +const result = @import("result.zig"); + +const Frame = lp.Frame; +const Element = Node.Element; +const Document = Node.Document; +const Allocator = std.mem.Allocator; + +pub const Error = error{ + OutOfMemory, + WriteFailed, + StringTooLarge, + UnknownFunction, +}; + +/// Dispatch a core-library function call. Returns `error.UnknownFunction` +/// if `name` doesn't match — the caller (Evaluator) handles +/// `position()` / `last()` inline before getting here, so this is the +/// last lookup stop. 
pub fn call(
    arena: Allocator,
    name: []const u8,
    args: []const result.Result,
    ctx: *Node,
    frame: *Frame,
) Error!result.Result {
    const builtin = std.meta.stringToEnum(Builtin, name) orelse return error.UnknownFunction;
    return switch (builtin) {
        // -- Node-set --
        .count => .{ .number = countFn(args) },
        .id => try idFn(arena, args, ctx, frame),
        .@"local-name" => .{ .string = try localNameFn(arena, args, ctx) },
        .name => .{ .string = try nameFn(arena, args, ctx) },
        .@"namespace-uri" => .{ .string = "" },

        // -- String --
        .string => .{ .string = try stringFn(arena, args, ctx) },
        .concat => .{ .string = try concatFn(arena, args) },
        .@"starts-with" => .{ .boolean = try startsWithFn(arena, args) },
        .contains => .{ .boolean = try containsFn(arena, args) },
        .@"substring-before" => .{ .string = try substringBeforeFn(arena, args) },
        .@"substring-after" => .{ .string = try substringAfterFn(arena, args) },
        .substring => .{ .string = try substringFn(arena, args) },
        .@"string-length" => .{ .number = try stringLengthFn(arena, args, ctx) },
        .@"normalize-space" => .{ .string = try normalizeSpaceFn(arena, args, ctx) },
        .translate => .{ .string = try translateFn(arena, args) },

        // -- Boolean --
        .boolean => .{ .boolean = args.len != 0 and result.toBoolean(args[0]) },
        .not => .{ .boolean = args.len == 0 or !result.toBoolean(args[0]) },
        .@"true" => .{ .boolean = true },
        .@"false" => .{ .boolean = false },
        .lang => .{ .boolean = false },

        // -- Number --
        .number => .{ .number = try numberFn(arena, args, ctx) },
        .sum => .{ .number = try sumFn(arena, args) },
        .floor => .{ .number = std.math.floor(try firstNumberOrNan(arena, args)) },
        .ceiling => .{ .number = std.math.ceil(try firstNumberOrNan(arena, args)) },
        .round => .{ .number = roundHalfToPosInf(try firstNumberOrNan(arena, args)) },
    };
}

/// Names handled by `call`, spelled exactly as they appear in XPath source
/// so that `std.meta.stringToEnum` performs the exact-match lookup.
const Builtin = enum {
    count,
    id,
    @"local-name",
    name,
    @"namespace-uri",
    string,
    concat,
    @"starts-with",
    contains,
    @"substring-before",
    @"substring-after",
    substring,
    @"string-length",
    @"normalize-space",
    translate,
    boolean,
    not,
    @"true",
    @"false",
    lang,
    number,
    sum,
    floor,
    ceiling,
    round,
};

/// Numeric value of the first argument, or NaN when there are no
/// arguments. `floor`, `ceiling` and `round` all map NaN to NaN, so the
/// zero-argument result of those functions is NaN.
fn firstNumberOrNan(arena: Allocator, args: []const result.Result) Error!f64 {
    if (args.len == 0) return std.math.nan(f64);
    return try result.toNumber(arena, args[0]);
}

// ----- node-set fns -----

/// `count(node-set)`: number of nodes; 0 when the argument is missing or
/// is not a node-set.
fn countFn(args: []const result.Result) f64 {
    if (args.len == 0) return 0;
    return switch (args[0]) {
        .node_set => |nodes| @as(f64, @floatFromInt(nodes.len)),
        else => 0,
    };
}

/// `id(object)`: elements whose id matches any whitespace-separated token
/// of the argument's string form, de-duplicated in first-seen order.
fn idFn(arena: Allocator, args: []const result.Result, ctx: *Node, frame: *Frame) Error!result.Result {
    const empty: result.Result = .{ .node_set = &.{} };
    if (args.len == 0) return empty;

    // Polyfill: a node-set argument contributes the string-value of each
    // node joined by ' '; any other argument is converted with `toString`.
    const ids: []const u8 = switch (args[0]) {
        .node_set => |nodes| joined: {
            var out = std.Io.Writer.Allocating.init(arena);
            for (nodes, 0..) |node, i| {
                if (i != 0) try out.writer.writeByte(' ');
                try out.writer.writeAll(try result.stringValueOf(arena, node));
            }
            break :joined out.written();
        },
        else => try result.toString(arena, args[0]),
    };

    // `ctx.ownerDocument || ctx` — a document node is its own lookup root.
    const doc = ctx.ownerDocument(frame) orelse (ctx.is(Document) orelse return empty);

    var found: std.AutoArrayHashMapUnmanaged(*Node, void) = .empty;
    var tokens = std.mem.tokenizeAny(u8, ids, &std.ascii.whitespace);
    while (tokens.next()) |id| {
        const el = doc.getElementById(id, frame) orelse continue;
        try found.put(arena, el.asNode(), {});
    }
    return .{ .node_set = found.keys() };
}

/// `local-name(node-set?)`: lowercase name without namespace prefix of the
/// first node (or of the context node when called without arguments).
fn localNameFn(arena: Allocator, args: []const result.Result, ctx: *Node) Error![]const u8 {
    const node = firstNodeOrCtx(args, ctx) orelse return "";
    // For an Element, `getLocalName` hands back a slice into `_tag_name`
    // (lowercase, prefix stripped) that outlives the per-evaluation
    // arena, so it is borrowed rather than copied.
    if (node.is(Element)) |el| return el.getLocalName();
    return lowerNodeName(arena, node);
}

/// `name(node-set?)`: like `local-name`, except that a namespaced element
/// keeps its prefix (`ns:foo` rather than `foo`).
fn nameFn(arena: Allocator, args: []const result.Result, ctx: *Node) Error![]const u8 {
    const node = firstNodeOrCtx(args, ctx) orelse return "";
    if (node.is(Element)) |el| return el.getTagNameLower();
    return lowerNodeName(arena, node);
}

/// Arena-allocated lowercase copy of a node's `getNodeName`.
fn lowerNodeName(arena: Allocator, node: *Node) Error![]const u8 {
    var scratch: [256]u8 = undefined;
    return std.ascii.allocLowerString(arena, node.getNodeName(&scratch));
}

/// Node a name function operates on: the context node when no argument
/// was given, the first node of a node-set argument, otherwise null
/// (empty node-set or non-node-set argument).
fn firstNodeOrCtx(args: []const result.Result, ctx: *Node) ?*Node {
    if (args.len == 0) return ctx;
    return switch (args[0]) {
        .node_set => |nodes| if (nodes.len == 0) null else nodes[0],
        else => null,
    };
}

// ----- string fns -----

/// `string(object?)`: string form of the argument, or the string-value of
/// the context node when called without arguments.
fn stringFn(arena: Allocator, args: []const result.Result, ctx: *Node) Error![]const u8 {
    return if (args.len == 0)
        try result.stringValueOf(arena, ctx)
    else
        try result.toString(arena, args[0]);
}

/// `concat(...)`: string forms of all arguments, joined without separator.
fn concatFn(arena: Allocator, args: []const result.Result) Error![]const u8 {
    var out = std.Io.Writer.Allocating.init(arena);
    for (args) |arg| {
        try out.writer.writeAll(try result.toString(arena, arg));
    }
    return out.written();
}

/// `starts-with(a, b)`; false when fewer than two arguments are given.
fn startsWithFn(arena: Allocator, args: []const result.Result) Error!bool {
    if (args.len < 2) return false;
    const haystack = try result.toString(arena, args[0]);
    const prefix = try result.toString(arena, args[1]);
    return std.mem.startsWith(u8, haystack, prefix);
}

/// `contains(a, b)`; false when fewer than two arguments are given.
fn containsFn(arena: Allocator, args: []const result.Result) Error!bool {
    if (args.len < 2) return false;
    const haystack = try result.toString(arena, args[0]);
    const needle = try result.toString(arena, args[1]);
    return std.mem.indexOf(u8, haystack, needle) != null;
}

/// `substring-before(a, b)`: the part of `a` before the first occurrence
/// of `b`, or "" when `b` does not occur or an argument is missing.
fn substringBeforeFn(arena: Allocator, args: []const result.Result) Error![]const u8 {
    if (args.len < 2) return "";
    const haystack = try result.toString(arena, args[0]);
    const needle = try result.toString(arena, args[1]);
    const at = std.mem.indexOf(u8, haystack, needle) orelse return "";
    return haystack[0..at];
}

/// `substring-after(a, b)`: the part of `a` after the first occurrence of
/// `b`, or "" when `b` does not occur or an argument is missing.
fn substringAfterFn(arena: Allocator, args: []const result.Result) Error![]const u8 {
    if (args.len < 2) return "";
    const haystack = try result.toString(arena, args[0]);
    const needle = try result.toString(arena, args[1]);
    const at = std.mem.indexOf(u8, haystack, needle) orelse return "";
    return haystack[at + needle.len ..];
}

/// `substring(s, start, len?)` with 1-based, rounded positions. Returns ""
/// when an argument is missing, when start or len is NaN, or when the
/// selected range is empty. Offsets are byte offsets into `s`.
fn substringFn(arena: Allocator, args: []const result.Result) Error![]const u8 {
    if (args.len < 2) return "";
    const text = try result.toString(arena, args[0]);

    const raw_start = try result.toNumber(arena, args[1]);
    if (std.math.isNan(raw_start)) return "";
    const first = roundHalfToPosInf(raw_start);

    const text_len: f64 = @floatFromInt(text.len);
    const from_f = @max(first - 1, 0);

    if (args.len < 3) {
        // Two-argument form: everything from the start position onward.
        if (from_f >= text_len) return "";
        const from: usize = @intFromFloat(from_f);
        return text[from..];
    }

    const raw_len = try result.toNumber(arena, args[2]);
    if (std.math.isNan(raw_len)) return "";
    const end_pos = first - 1 + roundHalfToPosInf(raw_len);
    // -inf + inf is NaN; @intFromFloat(NaN) is illegal behavior.
    if (std.math.isNan(end_pos)) return "";

    const to_f = @min(end_pos, text_len);
    if (from_f >= to_f) return "";
    const from: usize = @intFromFloat(from_f);
    const to: usize = @intFromFloat(to_f);
    return text[from..to];
}

/// `string-length(string?)`: length of the argument's string form, or of
/// the context node's string-value when called without arguments.
fn stringLengthFn(arena: Allocator, args: []const result.Result, ctx: *Node) Error!f64 {
    const text = if (args.len == 0)
        try result.stringValueOf(arena, ctx)
    else
        try result.toString(arena, args[0]);
    // The polyfill counts UTF-16 code units; this counts UTF-8 bytes. The
    // two agree on ASCII (the gem's 91-case battery is ASCII-only). See
    // .claude/skills/xpath-port/NOTES.md for the divergence rationale.
    return @floatFromInt(text.len);
}

/// `normalize-space(string?)`: leading and trailing whitespace removed and
/// every inner whitespace run replaced by a single space.
fn normalizeSpaceFn(arena: Allocator, args: []const result.Result, ctx: *Node) Error![]const u8 {
    const text = if (args.len == 0)
        try result.stringValueOf(arena, ctx)
    else
        try result.toString(arena, args[0]);

    // Splitting on whitespace drops the outer padding and collapses inner
    // runs at once; the words are then re-joined with one space each.
    var out = std.Io.Writer.Allocating.init(arena);
    var words = std.mem.tokenizeAny(u8, text, &std.ascii.whitespace);
    var is_first = true;
    while (words.next()) |word| {
        if (!is_first) try out.writer.writeByte(' ');
        try out.writer.writeAll(word);
        is_first = false;
    }
    return out.written();
}

/// `translate(s, from, to)`: replaces each byte of `s` found in `from` by
/// the byte at the same index in `to`. Returns "" with fewer than three
/// arguments.
fn translateFn(arena: Allocator, args: []const result.Result) Error![]const u8 {
    if (args.len < 3) return "";
    const text = try result.toString(arena, args[0]);
    const from = try result.toString(arena, args[1]);
    const to = try result.toString(arena, args[2]);

    var out = std.Io.Writer.Allocating.init(arena);
    for (text) |byte| {
        const at = std.mem.indexOfScalar(u8, from, byte) orelse {
            try out.writer.writeByte(byte);
            continue;
        };
        // A `from` byte with no counterpart in `to` is deleted.
        if (at < to.len) try out.writer.writeByte(to[at]);
    }
    return out.written();
}

// ----- number fns -----

/// `number(object?)`: numeric form of the argument, or of the context
/// node's string-value when called without arguments.
fn numberFn(arena: Allocator, args: []const result.Result, ctx: *Node) Error!f64 {
    if (args.len != 0) return try result.toNumber(arena, args[0]);
    return result.stringToNumber(try result.stringValueOf(arena, ctx));
}

/// `sum(node-set)`: sum of the numeric string-values of the nodes; NaN
/// when the argument is missing or is not a node-set.
fn sumFn(arena: Allocator, args: []const result.Result) Error!f64 {
    if (args.len == 0 or args[0] != .node_set) return std.math.nan(f64);
    var acc: f64 = 0;
    for (args[0].node_set) |node| {
        acc += result.stringToNumber(try result.stringValueOf(arena, node));
    }
    return acc;
}

/// Round half toward positive infinity.
/// Matches JS `Math.round` (the polyfill calls it for both `round()` and
/// `substring()`):
///   round(0.5) = 1    round(-0.5) = 0
///   round(1.5) = 2    round(-1.5) = -1
/// Diverges from Zig's `@round` (away from zero): `@round(-0.5) = -1`.
///
/// NaN and ±Infinity are returned unchanged. The decision is made on the
/// fractional part `n - floor(n)` instead of evaluating `floor(n + 0.5)`:
/// the addition in the latter is itself rounded, which turns
/// 0.49999999999999994 into 1 and moves odd integers of magnitude >= 2^52
/// to a neighbouring value. A negative-zero input yields negative zero,
/// as `Math.round` does.
fn roundHalfToPosInf(n: f64) f64 {
    // isFinite is false for NaN as well as for both infinities.
    if (!std.math.isFinite(n)) return n;
    const lower = @floor(n);
    return if (n - lower >= 0.5) lower + 1 else lower;
}

// ---------------------------------------------------------------------
// Tests — pure-logic only. Functions that need a real DOM (id, name,
// local-name, string with element ctx, sum, count of node-set, etc.)
// are exercised via Phase 9 HTML fixtures in tests/xpath/.
// ---------------------------------------------------------------------

const testing = std.testing;
const Tokenizer = @import("Tokenizer.zig");
const Parser = @import("Parser.zig");
const Evaluator = @import("Evaluator.zig");

/// Parses and evaluates `src` for tests whose expression never touches
/// the DOM.
fn evalScalar(a: Allocator, src: []const u8) !result.Result {
    const expr = try Parser.parse(a, src);
    // Synthetic Frame/Node pointers — the public `evaluate` entry only
    // touches the Frame for path/axis evaluation. Pure-scalar expressions
    // (arithmetic, function calls returning scalars) never deref it.
    return Evaluator.evaluate(a, expr, @ptrFromInt(0x2000), @ptrFromInt(0x1000));
}

test "Functions: count() of non-node-set returns 0" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const r = try evalScalar(arena.allocator(), "count('hello')");
    try testing.expect(r == .number);
    try testing.expectEqual(@as(f64, 0), r.number);
}

test "Functions: string() on scalar coerces" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const a = arena.allocator();

    inline for (.{
        .{ "string(42)", "42" },
        .{ "string(3.14)", "3.14" },
        .{ "string(true())", "true" },
        .{ "string(false())", "false" },
        .{ "string('hello')", "hello" },
        .{ "string(0)", "0" },
        .{ "string(-1)", "-1" },
    }) |case| {
        const r = try evalScalar(a, case[0]);
        try testing.expect(r == .string);
        try testing.expectEqualStrings(case[1], r.string);
    }
}

test "Functions: concat() variadic" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const a = arena.allocator();

    inline for (.{
        .{ "concat('a', 'b')", "ab" },
        .{ "concat('a', 'b', 'c')", "abc" },
        .{ "concat('foo', '-', 'bar', '-', 'baz')", "foo-bar-baz" },
        .{ "concat('x', 1, 'y')", "x1y" },
    }) |case| {
        const r = try evalScalar(a, case[0]);
        try testing.expect(r == .string);
        try testing.expectEqualStrings(case[1], r.string);
    }
}

test "Functions: starts-with / contains" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const a = arena.allocator();

    inline for (.{
        .{ "starts-with('hello', 'he')", true },
        .{ "starts-with('hello', 'el')", false },
        .{ "starts-with('hello', '')", true },
        .{ "contains('hello world', 'wor')", true },
        .{ "contains('hello', 'xyz')", false },
        .{ "contains('hello', '')", true },
    }) |case| {
        const r = try evalScalar(a, case[0]);
        try testing.expect(r == .boolean);
        try testing.expectEqual(case[1], r.boolean);
    }
}

test "Functions: substring-before / substring-after" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const a = arena.allocator();

    inline for (.{
        .{ "substring-before('1999/04/01', '/')", "1999" },
        .{ "substring-before('hello', 'xyz')", "" },
        .{ "substring-after('1999/04/01', '/')", "04/01" },
        .{ "substring-after('hello', 'xyz')", "" },
        .{ "substring-after('hello', '')", "hello" },
    }) |case| {
        const r = try evalScalar(a, case[0]);
        try testing.expect(r == .string);
        try testing.expectEqualStrings(case[1], r.string);
    }
}

test "Functions: substring() — XPath 1-based, rounding, NaN handling" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const a = arena.allocator();

    inline for (.{
        .{ "substring('12345', 2, 3)", "234" },
        .{ "substring('12345', 2)", "2345" },
        // XPath spec example: round(1.5) = 2 and round(2.6) = 3 → start
        // at position 2, length 3.
        .{ "substring('12345', 1.5, 2.6)", "234" },
        // start = 0: si = max(-1, 0) = 0, ei = min(0 - 1 + 3, len) = 2.
        .{ "substring('12345', 0, 3)", "12" },
        // Negative start clamps to 0.
        .{ "substring('12345', -3, 7)", "123" },
        // NaN start.
        .{ "substring('12345', 'foo')", "" },
        // NaN length.
        .{ "substring('12345', 1, 'foo')", "" },
    }) |case| {
        const r = try evalScalar(a, case[0]);
        try testing.expect(r == .string);
        try testing.expectEqualStrings(case[1], r.string);
    }
}

test "Functions: string-length on scalar arg" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const a = arena.allocator();

    inline for (.{
        .{ "string-length('hello')", 5 },
        .{ "string-length('')", 0 },
        .{ "string-length('a b c')", 5 },
    }) |case| {
        const r = try evalScalar(a, case[0]);
        try testing.expect(r == .number);
        try testing.expectEqual(@as(f64, case[1]), r.number);
    }
}

test "Functions: normalize-space" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const a = arena.allocator();

    inline for (.{
        .{ "normalize-space(' hello world ')", "hello world" },
        .{ "normalize-space('hello')", "hello" },
        .{ "normalize-space('')", "" },
        .{ "normalize-space(' ')", "" },
        .{ "normalize-space('a\tb\nc')", "a b c" },
    }) |case| {
        const r = try evalScalar(a, case[0]);
        try testing.expect(r == .string);
        try testing.expectEqualStrings(case[1], r.string);
    }
}

test "Functions: translate" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const a = arena.allocator();

    inline for (.{
        // Standard XPath spec example.
        .{ "translate('bar', 'abc', 'ABC')", "BAr" },
        // Char in `from` past `to.len` is deleted.
        .{ "translate('--aaa--', 'abc-', 'ABC')", "AAA" },
        .{ "translate('hello', '', '')", "hello" },
        // Identity.
        .{ "translate('abc', 'abc', 'abc')", "abc" },
    }) |case| {
        const r = try evalScalar(a, case[0]);
        try testing.expect(r == .string);
        try testing.expectEqualStrings(case[1], r.string);
    }
}

test "Functions: boolean / not / true / false / lang" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const a = arena.allocator();

    inline for (.{
        .{ "true()", true },
        .{ "false()", false },
        .{ "not(true())", false },
        .{ "not(false())", true },
        .{ "boolean(1)", true },
        .{ "boolean(0)", false },
        .{ "boolean('')", false },
        .{ "boolean('x')", true },
        // lang is a stub — always false.
        .{ "lang('en')", false },
    }) |case| {
        const r = try evalScalar(a, case[0]);
        try testing.expect(r == .boolean);
        try testing.expectEqual(case[1], r.boolean);
    }
}

test "Functions: number() on scalar arg" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const a = arena.allocator();

    {
        const r = try evalScalar(a, "number('42')");
        try testing.expectEqual(@as(f64, 42), r.number);
    }
    {
        const r = try evalScalar(a, "number(true())");
        try testing.expectEqual(@as(f64, 1), r.number);
    }
    {
        const r = try evalScalar(a, "number(false())");
        try testing.expectEqual(@as(f64, 0), r.number);
    }
    {
        const r = try evalScalar(a, "number('foo')");
        try testing.expect(std.math.isNan(r.number));
    }
}

test "Functions: floor / ceiling / round" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const a = arena.allocator();

    inline for (.{
        .{ "floor(1.5)", 1 },
        .{ "floor(-1.5)", -2 },
        .{ "floor(0)", 0 },
        .{ "ceiling(1.5)", 2 },
        .{ "ceiling(-1.5)", -1 },
        .{ "ceiling(0)", 0 },
        // Half-toward-positive-infinity (JS Math.round behavior).
        .{ "round(0.5)", 1 },
        .{ "round(-0.5)", 0 },
        .{ "round(1.5)", 2 },
        .{ "round(-1.5)", -1 },
        .{ "round(2.5)", 3 },
    }) |case| {
        const r = try evalScalar(a, case[0]);
        try testing.expect(r == .number);
        try testing.expectEqual(@as(f64, case[1]), r.number);
    }
}

test "Functions: round/floor/ceiling propagate NaN and Infinity" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const a = arena.allocator();

    {
        const r = try evalScalar(a, "round(1 div 0)"); // +Infinity
        try testing.expect(std.math.isPositiveInf(r.number));
    }
    {
        const r = try evalScalar(a, "round(0 div 0)"); // NaN
        try testing.expect(std.math.isNan(r.number));
    }
    {
        const r = try evalScalar(a, "floor(0 div 0)");
        try testing.expect(std.math.isNan(r.number));
    }
    {
        const r = try evalScalar(a, "ceiling(0 div 0)");
        try testing.expect(std.math.isNan(r.number));
    }
}

test "Functions: sum / count on non-node-set defaults" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const a = arena.allocator();

    {
        const r = try evalScalar(a, "sum('hello')");
        try testing.expect(std.math.isNan(r.number));
    }
    {
        const r = try evalScalar(a, "count('hello')");
        try testing.expectEqual(@as(f64, 0), r.number);
    }
}

test "Functions: roundHalfToPosInf" {
    try testing.expectEqual(@as(f64, 1), roundHalfToPosInf(0.5));
    try testing.expectEqual(@as(f64, 0), roundHalfToPosInf(-0.5));
    try testing.expectEqual(@as(f64, 2), roundHalfToPosInf(1.5));
    try testing.expectEqual(@as(f64, -1), roundHalfToPosInf(-1.5));
    try testing.expectEqual(@as(f64, 3), roundHalfToPosInf(2.5));
    try testing.expect(std.math.isNan(roundHalfToPosInf(std.math.nan(f64))));
    try testing.expect(std.math.isPositiveInf(roundHalfToPosInf(std.math.inf(f64))));
    try testing.expect(std.math.isNegativeInf(roundHalfToPosInf(-std.math.inf(f64))));
    // Inputs for which `floor(n + 0.5)` gives the wrong answer because the
    // addition itself is rounded: the largest double below 0.5, and odd
    // integers at 2^52 + 1 in magnitude.
    try testing.expectEqual(@as(f64, 0), roundHalfToPosInf(0.49999999999999994));
    try testing.expectEqual(@as(f64, 4503599627370497), roundHalfToPosInf(4503599627370497));
    try testing.expectEqual(@as(f64, -4503599627370497), roundHalfToPosInf(-4503599627370497));
}
// diff --git a/src/browser/xpath/result.zig b/src/browser/xpath/result.zig
new file mode 100644 index 00000000..0556f4ee --- /dev/null +++ b/src/browser/xpath/result.zig @@ -0,0 +1,199 @@ +// Copyright (C) 2023-2026 Lightpanda (Selecy SAS) +// +// Francis Bouvier +// Pierre Tachoire +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +//! XPath 1.0 runtime values. +//! +//! Tagged union over the four XPath value types: node-set, number, +//! string, boolean. Type coercion (`toString`, `toNumber`, `toBoolean`) +//! follows XPath 1.0 spec §3, with HTML-pragmatic shortcuts (decision +//! #2). + +const std = @import("std"); + +const Node = @import("../webapi/Node.zig"); + +const CData = Node.CData; +const Allocator = std.mem.Allocator; + +pub const Result = union(enum) { + /// Owned by the evaluator's arena. Order is significant only at the + /// public boundary, where the evaluator sorts to document order. + node_set: []const *Node, + number: f64, + string: []const u8, + boolean: bool, +}; + +/// XPath spec §5: string-value of a node. 
+///
+/// - Element / Document: concatenated text descendants (excluding
+///   comments and processing-instructions; matches `Node.getTextContent`)
+/// - Attribute: attribute value
+/// - Text / Comment / CDATA / PI: the node's data
+/// - DocumentType / DocumentFragment: empty (matches polyfill's
+///   `nodeValue || textContent || ''` fallthrough)
+///
+/// The returned slice is borrowed from the node for cdata/attribute
+/// (cheap, no allocation) and arena-allocated for element/document
+/// (concatenation buffer).
+///
+/// Returns `error.WriteFailed` when writing the text content into the
+/// arena-backed buffer fails.
+pub fn stringValueOf(arena: Allocator, node: *Node) error{WriteFailed}![]const u8 {
+    return switch (node._type) {
+        .attribute => |attr| attr._value.str(),
+        .cdata => |cd| cd._data.str(),
+        .element, .document => blk: {
+            // Collect the text content into a growable buffer owned by `arena`.
+            var buf = std.Io.Writer.Allocating.init(arena);
+            try node.getTextContent(&buf.writer);
+            break :blk buf.written();
+        },
+        .document_type, .document_fragment => "",
+    };
+}
+
+/// Boolean coercion: a number is true when it is neither zero nor NaN;
+/// a string or node-set is true when it is non-empty.
+pub fn toBoolean(val: Result) bool {
+    return switch (val) {
+        .boolean => |b| b,
+        .number => |n| n != 0 and !std.math.isNan(n),
+        .string => |s| s.len > 0,
+        .node_set => |ns| ns.len > 0,
+    };
+}
+
+/// Numeric coercion (XPath spec §4.4).
+///
+/// - number: returned unchanged
+/// - boolean: `true` → 1, `false` → 0
+/// - string: parsed by `stringToNumber`
+/// - node-set: string-value of the first node, parsed by
+///   `stringToNumber`; an empty node-set is NaN
+///
+/// Empty / whitespace-only strings produce NaN. JS `Number(' ')` would
+/// give 0, but the polyfill guards with
+/// `s.trim() === '' ? NaN : Number(s)`, so NaN is returned here as well.
+pub fn toNumber(arena: Allocator, val: Result) error{WriteFailed}!f64 {
+    return switch (val) {
+        .number => |n| n,
+        .boolean => |b| if (b) 1 else 0,
+        .string => |s| stringToNumber(s),
+        .node_set => |ns| blk: {
+            if (ns.len == 0) break :blk std.math.nan(f64);
+            const sv = try stringValueOf(arena, ns[0]);
+            break :blk stringToNumber(sv);
+        },
+    };
+}
+
+/// Parses `s` as a decimal number after trimming ASCII whitespace.
+///
+/// Returns NaN when the trimmed text is empty, contains any character
+/// other than digits, `.`, a sign or an exponent marker, or is otherwise
+/// not a well-formed number.
+///
+/// The character screen exists because `std.fmt.parseFloat` also accepts
+/// spellings that are not XPath numbers: `inf` / `infinity` / `nan` in
+/// any letter case, `_` digit separators (`1_000`) and hexadecimal
+/// floats (`0x1p3`). Without it `number('inf')` would be Infinity and
+/// `number('1_000')` would be 1000. Signs and exponents stay accepted
+/// as a leniency; everything else is NaN as XPath 1.0 §4.4 requires.
+pub fn stringToNumber(s: []const u8) f64 {
+    const trimmed = std.mem.trim(u8, s, &std.ascii.whitespace);
+    if (trimmed.len == 0) return std.math.nan(f64);
+    for (trimmed) |c| {
+        switch (c) {
+            '0'...'9', '.', '+', '-', 'e', 'E' => {},
+            else => return std.math.nan(f64),
+        }
+    }
+    // Malformed combinations of the allowed characters (`1e`, `--1`, `.`)
+    // are rejected by the parser itself and also become NaN.
+    return std.fmt.parseFloat(f64, trimmed) catch std.math.nan(f64);
+}
+
+/// String coercion.
Allocates only for `.number` (formatting) and for
+/// `.node_set` whose first node is an Element/Document (text content
+/// concatenation). Boolean → static string. String → borrowed.
+///
+/// Only the first element of a `.node_set` slice contributes (`ns[0]`);
+/// an empty node-set yields `""`.
+pub fn toString(arena: Allocator, val: Result) error{ OutOfMemory, WriteFailed }![]const u8 {
+    return switch (val) {
+        .string => |s| s,
+        .boolean => |b| if (b) "true" else "false",
+        .number => |n| try numberToString(arena, n),
+        .node_set => |ns| if (ns.len == 0) "" else try stringValueOf(arena, ns[0]),
+    };
+}
+
+/// XPath spec §4.2: NaN, ±0, and ±Infinity have specific spellings;
+/// integer-valued numbers print without trailing `.0`. Diverges from
+/// Zig's default `{d}` which prints `nan`/`inf` and may emit `-0`.
+///
+/// The special spellings are returned as static strings; every other
+/// value is formatted into memory allocated from `arena`.
+pub fn numberToString(arena: Allocator, n: f64) error{OutOfMemory}![]const u8 {
+    if (std.math.isNan(n)) return "NaN";
+    if (std.math.isPositiveInf(n)) return "Infinity";
+    if (std.math.isNegativeInf(n)) return "-Infinity";
+    if (n == 0) return "0"; // covers +0 and -0
+    // 9.007199254740992e15 is 2^53. An integer-valued f64 inside ±2^53
+    // fits in an i64, so the cast below is in range and the value prints
+    // with no fractional part.
+    if (@trunc(n) == n and n >= -9.007199254740992e15 and n <= 9.007199254740992e15) {
+        return std.fmt.allocPrint(arena, "{d}", .{@as(i64, @intFromFloat(n))});
+    }
+    // Fractions, and integer values beyond ±2^53, use float formatting.
+    return std.fmt.allocPrint(arena, "{d}", .{n});
+}
+
+const testing = std.testing;
+
+test "Result: toBoolean" {
+    try testing.expect(toBoolean(.{ .boolean = true }));
+    try testing.expect(!toBoolean(.{ .boolean = false }));
+    try testing.expect(toBoolean(.{ .number = 1 }));
+    try testing.expect(!toBoolean(.{ .number = 0 }));
+    try testing.expect(!toBoolean(.{ .number = std.math.nan(f64) }));
+    try testing.expect(toBoolean(.{ .string = "x" }));
+    try testing.expect(!toBoolean(.{ .string = "" }));
+    try testing.expect(!toBoolean(.{ .node_set = &.{} }));
+}
+
+test "Result: stringToNumber" {
+    try testing.expectEqual(@as(f64, 42), stringToNumber("42"));
+    try testing.expectEqual(@as(f64, 3.14), stringToNumber("3.14"));
+    try testing.expectEqual(@as(f64, -1), stringToNumber("-1"));
+    try testing.expectEqual(@as(f64, 5), stringToNumber(" 5
")); + try testing.expect(std.math.isNan(stringToNumber(""))); + try testing.expect(std.math.isNan(stringToNumber(" "))); + try testing.expect(std.math.isNan(stringToNumber("abc"))); +} + +test "Result: numberToString — integers print without decimal" { + var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + const a = arena.allocator(); + try testing.expectEqualStrings("5", try numberToString(a, 5)); + try testing.expectEqualStrings("0", try numberToString(a, 0)); + try testing.expectEqualStrings("0", try numberToString(a, -0.0)); + try testing.expectEqualStrings("-1", try numberToString(a, -1)); + try testing.expectEqualStrings("42", try numberToString(a, 42.0)); +} + +test "Result: numberToString — special values" { + var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + const a = arena.allocator(); + try testing.expectEqualStrings("NaN", try numberToString(a, std.math.nan(f64))); + try testing.expectEqualStrings("Infinity", try numberToString(a, std.math.inf(f64))); + try testing.expectEqualStrings("-Infinity", try numberToString(a, -std.math.inf(f64))); +} + +test "Result: numberToString — floats" { + var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + const a = arena.allocator(); + try testing.expectEqualStrings("3.14", try numberToString(a, 3.14)); + try testing.expectEqualStrings("0.5", try numberToString(a, 0.5)); +} + +test "Result: toString — boolean returns static string" { + var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + try testing.expectEqualStrings("true", try toString(arena.allocator(), .{ .boolean = true })); + try testing.expectEqualStrings("false", try toString(arena.allocator(), .{ .boolean = false })); +} + +test "Result: toString — node-set with empty arr is empty" { + var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + try testing.expectEqualStrings("", try 
toString(arena.allocator(), .{ .node_set = &.{} })); +} + +test "Result: toNumber — empty node-set is NaN" { + var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + try testing.expect(std.math.isNan(try toNumber(arena.allocator(), .{ .node_set = &.{} }))); +} + +test "Result: toNumber — boolean coerces to 0/1" { + var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + try testing.expectEqual(@as(f64, 1), try toNumber(arena.allocator(), .{ .boolean = true })); + try testing.expectEqual(@as(f64, 0), try toNumber(arena.allocator(), .{ .boolean = false })); +} diff --git a/src/cdp/domains/dom.zig b/src/cdp/domains/dom.zig index de7712e7..aa7cfb2a 100644 --- a/src/cdp/domains/dom.zig +++ b/src/cdp/domains/dom.zig @@ -27,6 +27,7 @@ const dump = @import("../../browser/dump.zig"); const js = @import("../../browser/js/js.zig"); const DOMNode = @import("../../browser/webapi/Node.zig"); const Selector = @import("../../browser/webapi/selector/Selector.zig"); +const xpath = @import("../../browser/xpath/Evaluator.zig"); const log = lp.log; const Allocator = std.mem.Allocator; @@ -91,6 +92,56 @@ fn getDocument(cmd: *CDP.Command) !void { return cmd.sendResult(.{ .root = bc.nodeWriter(node, .{ .depth = params.depth }) }, .{}); } +// Closed set of XPath 1.0 named axes. Matched literally before `::` so +// CSS pseudo-elements (`a::before`, `div::first-line`) don't get +// misrouted to the XPath evaluator just because they have an +// identifier-looking word before `::`. 
+// The map's value type is `void`: only key membership is looked up (`has`).
+const xpath_axis_names = std.StaticStringMap(void).initComptime(.{
+    .{ "child", {} },
+    .{ "descendant", {} },
+    .{ "descendant-or-self", {} },
+    .{ "self", {} },
+    .{ "parent", {} },
+    .{ "ancestor", {} },
+    .{ "ancestor-or-self", {} },
+    .{ "following-sibling", {} },
+    .{ "preceding-sibling", {} },
+    .{ "following", {} },
+    .{ "preceding", {} },
+    .{ "attribute", {} },
+    .{ "namespace", {} },
+});
+
+// Heuristic (decision #2/#9): treat the query as XPath when it begins
+// with a path operator or contains an axis specifier; otherwise fall
+// through to CSS.
+//
+// NOTE(review): the prefix checks look at `q[0]` directly, so a query
+// with leading whitespace (` //p`) is not recognized by them — confirm
+// callers pass trimmed queries.
+fn isXPathQuery(q: []const u8) bool {
+    if (q.len == 0) return false;
+    // Absolute path (`/html`) or descendant shorthand (`//p`).
+    if (q[0] == '/') return true;
+    // Path relative to the context node (`./x`, `.//x`). A lone leading
+    // dot (`.cls`) is left for CSS.
+    if (q[0] == '.' and q.len > 1 and q[1] == '/') return true;
+    // Parenthesized path such as `(//foo)[1]` or `(./bar)[2]`; `(p)` has
+    // no path operator after the paren and stays CSS.
+    if (q[0] == '(' and q.len > 1) {
+        if (q[1] == '/') return true;
+        if (q[1] == '.' and q.len > 2 and q[2] == '/') return true;
+    }
+    // For `::` to be an XPath axis separator, the identifier immediately
+    // before it must be one of the 13 named axes. Walk back the run of
+    // [a-zA-Z-] characters and look it up in the closed set.
+ var idx: usize = 0; + while (std.mem.indexOfPos(u8, q, idx, "::")) |hit| : (idx = hit + 1) { + if (hit == 0) continue; + var start = hit; + while (start > 0) { + const c = q[start - 1]; + const is_axis_char = (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z') or c == '-'; + if (!is_axis_char) break; + start -= 1; + } + if (start == hit) continue; + if (xpath_axis_names.has(q[start..hit])) return true; + } + return false; +} + // https://chromedevtools.github.io/devtools-protocol/tot/DOM/#method-performSearch fn performSearch(cmd: *CDP.Command) !void { const params = (try cmd.params(struct { @@ -100,15 +151,23 @@ fn performSearch(cmd: *CDP.Command) !void { const bc = cmd.browser_context orelse return error.BrowserContextNotLoaded; const frame = bc.session.currentFrame() orelse return error.FrameNotLoaded; - const list = try Selector.querySelectorAll(frame.window._document.asNode(), params.query, frame); + const root = frame.window._document.asNode(); + + if (isXPathQuery(params.query)) { + const arena = try frame.getArena(.medium, "DOM.performSearch"); + defer frame.releaseArena(arena); + const nodes = try xpath.searchAll(arena, root, params.query, frame); + return finishSearch(cmd, bc, nodes); + } + + const list = try Selector.querySelectorAll(root, params.query, frame); defer list.deinit(frame._page); + return finishSearch(cmd, bc, list._nodes); +} - const search = try bc.node_search_list.create(list._nodes); - - // dispatch setChildNodesEvents to inform the client of the subpart of node - // tree covering the results. 
- try dispatchSetChildNodes(cmd, list._nodes); - +fn finishSearch(cmd: *CDP.Command, bc: *CDP.BrowserContext, nodes: []const *DOMNode) !void { + const search = try bc.node_search_list.create(nodes); + try dispatchSetChildNodes(cmd, nodes); return cmd.sendResult(.{ .searchId = search.name, .resultCount = @as(u32, @intCast(search.node_ids.len)), @@ -616,6 +675,78 @@ test "cdp.dom: search flow" { try ctx.expectSentError(-31998, "SearchResultNotFound", .{ .id = 17 }); } +test "cdp.dom: performSearch with XPath" { + var ctx = try testing.context(); + defer ctx.deinit(); + + _ = try ctx.loadBrowserContext(.{ .id = "BID-A", .url = "cdp/perform_search_xpath.html" }); + + try ctx.processMessage(.{ + .id = 20, + .method = "DOM.performSearch", + .params = .{ .query = "//p" }, + }); + try ctx.expectSentResult(.{ .searchId = "0", .resultCount = 3 }, .{ .id = 20 }); + + try ctx.processMessage(.{ + .id = 21, + .method = "DOM.performSearch", + .params = .{ .query = "descendant::p" }, + }); + try ctx.expectSentResult(.{ .searchId = "1", .resultCount = 3 }, .{ .id = 21 }); + + try ctx.processMessage(.{ + .id = 22, + .method = "DOM.performSearch", + .params = .{ .query = "//*[@id='outer']" }, + }); + try ctx.expectSentResult(.{ .searchId = "2", .resultCount = 1 }, .{ .id = 22 }); + + try ctx.processMessage(.{ + .id = 23, + .method = "DOM.performSearch", + .params = .{ .query = "p" }, + }); + try ctx.expectSentResult(.{ .searchId = "3", .resultCount = 3 }, .{ .id = 23 }); + + try ctx.processMessage(.{ + .id = 24, + .method = "DOM.performSearch", + .params = .{ .query = "div p" }, + }); + try ctx.expectSentResult(.{ .searchId = "4", .resultCount = 2 }, .{ .id = 24 }); +} + +test "cdp.dom: isXPathQuery heuristic" { + // XPath-shaped queries — each line covers a distinct heuristic branch. 
+ try std.testing.expect(isXPathQuery("/html")); + try std.testing.expect(isXPathQuery("//p")); + try std.testing.expect(isXPathQuery(".//foo")); + try std.testing.expect(isXPathQuery("(//foo)[1]")); + try std.testing.expect(isXPathQuery("(./bar)[2]")); + try std.testing.expect(isXPathQuery("descendant::p")); + try std.testing.expect(isXPathQuery("ancestor-or-self::*")); + try std.testing.expect(isXPathQuery("//*[@id='x']")); + + // CSS-shaped queries — fall through to the existing path. + try std.testing.expect(!isXPathQuery("")); + try std.testing.expect(!isXPathQuery("p")); + try std.testing.expect(!isXPathQuery("div p")); + try std.testing.expect(!isXPathQuery("#main")); + try std.testing.expect(!isXPathQuery(".cls")); + try std.testing.expect(!isXPathQuery("[data-x]")); + try std.testing.expect(!isXPathQuery("(p)")); // parens without path → CSS + try std.testing.expect(!isXPathQuery(".x")); // leading dot without / + + // CSS pseudo-elements: identifier before `::` is not an XPath axis name. + try std.testing.expect(!isXPathQuery("a::before")); + try std.testing.expect(!isXPathQuery("div::after")); + try std.testing.expect(!isXPathQuery("p::first-line")); + try std.testing.expect(!isXPathQuery("input::placeholder")); + // Attribute selector with `::` inside a literal — nothing axis-like before it. 
+ try std.testing.expect(!isXPathQuery("[data-x=\"x::y\"]")); +} + test "cdp.dom: querySelector unknown search id" { var ctx = try testing.context(); defer ctx.deinit(); diff --git a/src/lightpanda.zig b/src/lightpanda.zig index b60292c9..fed1bf41 100644 --- a/src/lightpanda.zig +++ b/src/lightpanda.zig @@ -58,6 +58,7 @@ pub const FetchOpts = struct { wait_ms: u32 = 5000, wait_until: ?Config.WaitUntil = null, wait_script: ?[:0]const u8 = null, + inject_script: std.ArrayList([]const u8) = .{}, wait_selector: ?[:0]const u8 = null, dump: dump.Opts, dump_mode: ?Config.DumpFormat = null, @@ -79,6 +80,9 @@ pub fn fetch(app: *App, browser: *Browser, url: [:0]const u8, opts: FetchOpts) ! } } + // Stash scripts user want to inject. + session.inject_scripts = opts.inject_script.items; + const frame = try session.createPage(); // // Comment this out to get a profile of the JS code in v8/profile.json. diff --git a/src/main.zig b/src/main.zig index 499f2f2c..6d4bd249 100644 --- a/src/main.zig +++ b/src/main.zig @@ -128,6 +128,7 @@ fn run(allocator: Allocator, main_arena: Allocator) !void { .wait_ms = opts.wait_ms, .wait_until = opts.wait_until, .wait_script = opts.wait_script, + .inject_script = opts.inject_script, .wait_selector = opts.wait_selector, .dump_mode = opts.dump, .dump = .{ diff --git a/src/testing.zig b/src/testing.zig index 549d1349..9e59ebc0 100644 --- a/src/testing.zig +++ b/src/testing.zig @@ -338,12 +338,21 @@ pub var test_notification: *Notification = undefined; pub var test_session: *Session = undefined; const WEB_API_TEST_ROOT = "src/browser/tests/"; -const HtmlRunnerOpts = struct {}; +const HtmlRunnerOpts = struct { + timeout_ms: u32 = 2000, + inject_script: ?[]const u8 = null, +}; pub fn htmlRunner(comptime path: []const u8, opts: HtmlRunnerOpts) !void { - _ = opts; defer reset(); + var inject_scripts: [1][]const u8 = undefined; + if (opts.inject_script) |script| { + inject_scripts[0] = script; + test_session.inject_scripts = inject_scripts[0..1]; + } + 
defer test_session.inject_scripts = &.{}; + const root = try std.fs.path.joinZ(arena_allocator, &.{ WEB_API_TEST_ROOT, path }); const stat = std.fs.cwd().statFile(root) catch |err| { std.debug.print("Failed to stat file: '{s}'", .{root}); @@ -356,7 +365,7 @@ pub fn htmlRunner(comptime path: []const u8, opts: HtmlRunnerOpts) !void { return; } try @import("root").subtest(root); - try runWebApiTest(root); + try runWebApiTest(root, opts.timeout_ms); }, .directory => { var dir = try std.fs.cwd().openDir(root, .{ @@ -382,7 +391,7 @@ pub fn htmlRunner(comptime path: []const u8, opts: HtmlRunnerOpts) !void { const full_path = try std.fs.path.joinZ(arena_allocator, &.{ root, entry.name }); try @import("root").subtest(entry.name); - try runWebApiTest(full_path); + try runWebApiTest(full_path, opts.timeout_ms); } }, else => |kind| { @@ -392,7 +401,7 @@ pub fn htmlRunner(comptime path: []const u8, opts: HtmlRunnerOpts) !void { } } -fn runWebApiTest(test_file: [:0]const u8) !void { +fn runWebApiTest(test_file: [:0]const u8, timeout_ms: u32) !void { const frame = try test_session.createPage(); defer test_session.removePage(); @@ -418,7 +427,7 @@ fn runWebApiTest(test_file: [:0]const u8) !void { var runner = try test_session.runner(.{}); try runner.wait(.{ .ms = 2000, .until = .load }); - var wait_ms: u32 = 2000; + var wait_ms: u32 = timeout_ms; var timer = try std.time.Timer.start(); while (true) { var try_catch: js.TryCatch = undefined;