refactor: unify triple-quote logic and improve schema validation

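Introduces the `QuoteType` enum and the `splitHead` helper in `Command.zig` to centralize triple-quote detection and command-word splitting. Updates `extractSchema` to check for a leading `{` and call `std.json.validate` instead of parsing a full `std.json.Value` (for efficiency), and simplifies script construction by building the eval script with `std.mem.concatWithSentinel`. Note: the invalid-schema error message no longer includes the parser error name.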
2026-06-11 09:35:59 -04:00 · 2026-05-12 20:09:47 +02:00
parent 884b9cccfd
commit 40b63306e4
2 changed files with 72 additions and 68 deletions
--- a/src/browser/tools.zig
+++ b/src/browser/tools.zig
@@ -457,47 +457,40 @@ pub fn extractText(
    return runEval(arena, page, eval_script);
 }

-/// Schema-driven extraction. The schema is parsed in Zig so syntax errors
-/// point at the user's schema instead of dumping the walker JS into a V8
-/// SyntaxError; the parsed value is discarded and the raw text is spliced
-/// into the walker for a single atomic eval. See `schema_walker_prefix` for
-/// the supported schema shape.
+/// Schema-driven extraction. The schema is validated in Zig so a syntax error
+/// surfaces here instead of as a confusing V8 SyntaxError on the spliced
+/// walker. Each value in the schema object is one of:
+///   "sel"                → first match's textContent.trim() (string|null)
+///   ""                   → matched element's own textContent.trim()
+///   ["sel"]              → all matches' textContent (string[])
+///   {selector, attr}     → first match's attribute (string|null)
+///   [{selector, attr}]   → all matches' attributes (string[])
+///   [{selector, fields}] → all matches, with `fields` relative to each (object[])
 pub fn extractSchema(
    arena: std.mem.Allocator,
    session: *lp.Session,
    registry: *CDPNode.Registry,
    schema_json: []const u8,
 ) EvalResult {
-    const parsed = std.json.parseFromSliceLeaky(std.json.Value, arena, schema_json, .{}) catch |err| {
-        const msg = std.fmt.allocPrint(arena, "Error: invalid EXTRACT schema JSON: {s}", .{@errorName(err)}) catch
-            return .{ .text = "Error: invalid EXTRACT schema JSON", .is_error = true };
-        return .{ .text = msg, .is_error = true };
-    };
-    if (parsed != .object) {
+    const trimmed = std.mem.trim(u8, schema_json, &std.ascii.whitespace);
+    if (trimmed.len == 0 or trimmed[0] != '{') {
        return .{ .text = "Error: EXTRACT schema must be a JSON object", .is_error = true };
    }
-
-    const buf = arena.allocSentinel(u8, schema_walker_prefix.len + schema_json.len + schema_walker_suffix.len, 0) catch
+    const valid = std.json.validate(arena, schema_json) catch
        return .{ .text = "Error: out of memory", .is_error = true };
-    @memcpy(buf[0..schema_walker_prefix.len], schema_walker_prefix);
-    @memcpy(buf[schema_walker_prefix.len..][0..schema_json.len], schema_json);
-    @memcpy(buf[schema_walker_prefix.len + schema_json.len ..][0..schema_walker_suffix.len], schema_walker_suffix);
+    if (!valid) {
+        return .{ .text = "Error: invalid EXTRACT schema JSON", .is_error = true };
+    }

+    const script = std.mem.concatWithSentinel(arena, u8, &.{ schema_walker_prefix, schema_json, schema_walker_suffix }, 0) catch
+        return .{ .text = "Error: out of memory", .is_error = true };
    const page = ensurePage(session, registry, null, null, null) catch
        return .{ .text = "Error: page not loaded", .is_error = true };
-    return runEval(arena, page, buf);
+    return runEval(arena, page, script);
 }

-// Schema shape — each value in the user's JSON object is one of:
-//   "sel"                   → first match's textContent.trim() (string|null)
-//   ""                      → matched element's own textContent.trim()
-//   ["sel"]                 → all matches' textContent (string[])
-//   {selector, attr}        → first match's attribute (string|null)
-//   [{selector, attr}]      → all matches' attributes (string[])
-//   [{selector, fields}]    → all matches, with `fields` evaluated relative
-//                             to each match (object[])
-// The schema literal is spliced between prefix and suffix verbatim — using a
-// format string here would collide with the many `{`/`}` in the walker body.
+// The schema literal is spliced between prefix and suffix verbatim — a format
+// string here would collide with the `{`/`}` throughout the walker body.
 const schema_walker_prefix =
    \\JSON.stringify((function(schema){
    \\  function valueOf(m, inner){
--- a/src/script/Command.zig
+++ b/src/script/Command.zig
@@ -102,8 +102,6 @@ pub const Command = union(enum) {
    }
 };

-/// Emit `KEYWORD '<body>'` for single-line bodies, or the triple-quote block
-/// form for bodies that contain newlines. Used by EVAL and EXTRACT.
 fn writeBlockOrInline(writer: *std.Io.Writer, keyword: []const u8, body: []const u8) std.Io.Writer.Error!void {
    if (std.mem.indexOfScalar(u8, body, '\n') != null) {
        try writer.print("{s} '''\n{s}\n'''", .{ keyword, body });
@@ -112,6 +110,14 @@ fn writeBlockOrInline(writer: *std.Io.Writer, keyword: []const u8, body: []const
    }
 }

+fn splitHead(line: []const u8) struct { head: []const u8, rest: []const u8 } {
+    const end = std.mem.indexOfAny(u8, line, &std.ascii.whitespace) orelse line.len;
+    return .{
+        .head = line[0..end],
+        .rest = std.mem.trim(u8, line[end..], &std.ascii.whitespace),
+    };
+}
+
 /// Parse a line of REPL input into a PandaScript command.
 /// Unrecognized input is returned as `.natural_language`.
 /// For multi-line EVAL blocks in scripts, use `ScriptParser`.
@@ -121,9 +127,9 @@ pub fn parse(line: []const u8) Command {

    if (trimmed[0] == '#') return .{ .comment = {} };

-    const cmd_end = std.mem.indexOfAny(u8, trimmed, &std.ascii.whitespace) orelse trimmed.len;
-    const cmd_word = trimmed[0..cmd_end];
-    const rest = std.mem.trim(u8, trimmed[cmd_end..], &std.ascii.whitespace);
+    const split = splitHead(trimmed);
+    const cmd_word = split.head;
+    const rest = split.rest;

    if (std.mem.eql(u8, cmd_word, "GOTO")) {
        if (rest.len == 0) return .{ .natural_language = trimmed };
@@ -285,10 +291,11 @@ pub fn analyzePandaBody(body: []const u8) BodyCursor {
        if (i >= body.len) break;
        const ch = body[i];
        if (ch == '\'' or ch == '"') {
-            if (tripleQuotePrefix(body[i..])) |tq| {
-                const end_idx = std.mem.indexOfPos(u8, body, i + tq.len, tq) orelse
+            if (QuoteType.fromPrefix(body[i..])) |tq| {
+                const lit = tq.toLiteral();
+                const end_idx = std.mem.indexOfPos(u8, body, i + lit.len, lit) orelse
                    return .{ .complete_args = complete, .at_boundary = false };
-                i = end_idx + tq.len;
+                i = end_idx + lit.len;
            } else {
                const end_idx = std.mem.indexOfScalarPos(u8, body, i + 1, ch) orelse
                    return .{ .complete_args = complete, .at_boundary = false };
@@ -309,9 +316,7 @@ pub fn analyzePandaBody(body: []const u8) BodyCursor {
 /// rejects a line whose first word *looked* like a command — either an argful
 /// keyword missing its args, or an argless keyword followed by junk.
 pub fn keywordSyntax(line: []const u8) ?KeywordSyntax {
-    const trimmed = std.mem.trim(u8, line, &std.ascii.whitespace);
-    const end = std.mem.indexOfAny(u8, trimmed, &std.ascii.whitespace) orelse trimmed.len;
-    const word = trimmed[0..end];
+    const word = splitHead(std.mem.trim(u8, line, &std.ascii.whitespace)).head;
    for (keywords) |kc| {
        if (std.mem.eql(u8, word, kc.name)) return kc;
    }
@@ -355,12 +360,9 @@ pub const ScriptIterator = struct {
                const start_line = self.line_num;
                const body_or_null = self.collectMultiLineBlock(opener.quote_type);
                const span_end = self.lines.index orelse self.lines.buffer.len;
-                const cmd: Command = if (body_or_null) |body| switch (opener.kind) {
-                    .eval => .{ .eval_js = body },
-                    .extract => .{ .extract = body },
-                } else switch (opener.kind) {
-                    .eval => .{ .natural_language = "unterminated EVAL block" },
-                    .extract => .{ .natural_language = "unterminated EXTRACT block" },
+                const cmd: Command = switch (opener.kind) {
+                    .eval => if (body_or_null) |body| .{ .eval_js = body } else .{ .natural_language = "unterminated EVAL block" },
+                    .extract => if (body_or_null) |body| .{ .extract = body } else .{ .natural_language = "unterminated EXTRACT block" },
                };
                return .{
                    .line_num = start_line,
@@ -385,30 +387,24 @@ pub const ScriptIterator = struct {
    /// `EVAL '''a'''` fall through to single-line `parse()`.
    const BlockKeyword = struct {
        kind: enum { eval, extract },
-        quote_type: []const u8,
+        quote_type: QuoteType,

        fn fromOpener(line: []const u8) ?BlockKeyword {
-            const cmd_end = std.mem.indexOfAny(u8, line, &std.ascii.whitespace) orelse return null;
-            const cmd_word = line[0..cmd_end];
-            const rest = std.mem.trim(u8, line[cmd_end..], &std.ascii.whitespace);
-            const quote_type: []const u8 = if (std.mem.eql(u8, rest, "\"\"\""))
-                "\"\"\""
-            else if (std.mem.eql(u8, rest, "'''"))
-                "'''"
-            else
-                return null;
-            if (std.mem.eql(u8, cmd_word, "EVAL")) return .{ .kind = .eval, .quote_type = quote_type };
-            if (std.mem.eql(u8, cmd_word, "EXTRACT")) return .{ .kind = .extract, .quote_type = quote_type };
+            const split = splitHead(line);
+            const quote_type = QuoteType.fromLiteral(split.rest) orelse return null;
+            if (std.mem.eql(u8, split.head, "EVAL")) return .{ .kind = .eval, .quote_type = quote_type };
+            if (std.mem.eql(u8, split.head, "EXTRACT")) return .{ .kind = .extract, .quote_type = quote_type };
            return null;
        }
    };

-    fn collectMultiLineBlock(self: *ScriptIterator, quote_type: []const u8) ?[]const u8 {
+    fn collectMultiLineBlock(self: *ScriptIterator, quote_type: QuoteType) ?[]const u8 {
+        const closer = quote_type.toLiteral();
        var parts: std.ArrayList(u8) = .empty;
        while (self.lines.next()) |line| {
            self.line_num += 1;
            const trimmed = std.mem.trim(u8, line, &std.ascii.whitespace);
-            if (std.mem.eql(u8, trimmed, quote_type)) {
+            if (std.mem.eql(u8, trimmed, closer)) {
                return parts.toOwnedSlice(self.allocator) catch null;
            }
            if (parts.items.len > 0) {
@@ -426,18 +422,33 @@ const QuotedResult = struct {
    remainder: []const u8,
 };

-/// Returns the opening `'''` or `"""` delimiter if `s` starts with one, else null.
-fn tripleQuotePrefix(s: []const u8) ?[]const u8 {
-    if (std.mem.startsWith(u8, s, "'''")) return "'''";
-    if (std.mem.startsWith(u8, s, "\"\"\"")) return "\"\"\"";
-    return null;
-}
+const QuoteType = enum {
+    triple_double,
+    triple_single,
+
+    fn fromLiteral(s: []const u8) ?QuoteType {
+        return if (s.len == 3) fromPrefix(s) else null;
+    }
+
+    fn fromPrefix(s: []const u8) ?QuoteType {
+        if (std.mem.startsWith(u8, s, "\"\"\"")) return .triple_double;
+        if (std.mem.startsWith(u8, s, "'''")) return .triple_single;
+        return null;
+    }
+
+    fn toLiteral(self: QuoteType) []const u8 {
+        return switch (self) {
+            .triple_double => "\"\"\"",
+            .triple_single => "'''",
+        };
+    }
+};

 fn extractQuotedWithRemainder(s: []const u8) ?QuotedResult {
    if (s.len < 2) return null;

-    if (tripleQuotePrefix(s)) |tq| {
-        const end = std.mem.indexOf(u8, s[3..], tq) orelse return null;
+    if (QuoteType.fromPrefix(s)) |tq| {
+        const end = std.mem.indexOf(u8, s[3..], tq.toLiteral()) orelse return null;
        return .{
            .value = s[3 .. 3 + end],
            .remainder = s[3 + end + 3 ..],
@@ -461,8 +472,8 @@ fn extractQuotedWithRemainder(s: []const u8) ?QuotedResult {
 fn trimMatchingQuotes(s: []const u8) ?[]const u8 {
    if (s.len == 0) return null;

-    if (tripleQuotePrefix(s)) |tq| {
-        if (s.len < 6 or !std.mem.endsWith(u8, s, tq)) return null;
+    if (QuoteType.fromPrefix(s)) |tq| {
+        if (s.len < 6 or !std.mem.endsWith(u8, s, tq.toLiteral())) return null;
        const inner = s[3 .. s.len - 3];
        return if (inner.len == 0) null else inner;
    }