From 40b63306e485a58cf9e2dd4e6a91794fc1ac2d05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A0=20Arrufat?= Date: Tue, 12 May 2026 20:09:47 +0200 Subject: [PATCH] refactor: unify triple-quote logic and improve schema validation Introduces `QuoteType` enum and `splitHead` helper to centralize parsing logic. Updates `extractSchema` to use `std.json.validate` for efficiency and simplifies script construction. --- src/browser/tools.zig | 47 +++++++++------------ src/script/Command.zig | 93 +++++++++++++++++++++++------------------- 2 files changed, 72 insertions(+), 68 deletions(-) diff --git a/src/browser/tools.zig b/src/browser/tools.zig index 4f86781a..5b45ed7a 100644 --- a/src/browser/tools.zig +++ b/src/browser/tools.zig @@ -457,47 +457,40 @@ pub fn extractText( return runEval(arena, page, eval_script); } -/// Schema-driven extraction. The schema is parsed in Zig so syntax errors -/// point at the user's schema instead of dumping the walker JS into a V8 -/// SyntaxError; the parsed value is discarded and the raw text is spliced -/// into the walker for a single atomic eval. See `schema_walker_prefix` for -/// the supported schema shape. +/// Schema-driven extraction. The schema is validated in Zig so a syntax error +/// surfaces here instead of as a confusing V8 SyntaxError on the spliced +/// walker. 
Each value in the schema object is one of: +/// "sel" → first match's textContent.trim() (string|null) +/// "" → matched element's own textContent.trim() +/// ["sel"] → all matches' textContent (string[]) +/// {selector, attr} → first match's attribute (string|null) +/// [{selector, attr}] → all matches' attributes (string[]) +/// [{selector, fields}] → all matches, with `fields` relative to each (object[]) pub fn extractSchema( arena: std.mem.Allocator, session: *lp.Session, registry: *CDPNode.Registry, schema_json: []const u8, ) EvalResult { - const parsed = std.json.parseFromSliceLeaky(std.json.Value, arena, schema_json, .{}) catch |err| { - const msg = std.fmt.allocPrint(arena, "Error: invalid EXTRACT schema JSON: {s}", .{@errorName(err)}) catch - return .{ .text = "Error: invalid EXTRACT schema JSON", .is_error = true }; - return .{ .text = msg, .is_error = true }; - }; - if (parsed != .object) { + const trimmed = std.mem.trim(u8, schema_json, &std.ascii.whitespace); + if (trimmed.len == 0 or trimmed[0] != '{') { return .{ .text = "Error: EXTRACT schema must be a JSON object", .is_error = true }; } - - const buf = arena.allocSentinel(u8, schema_walker_prefix.len + schema_json.len + schema_walker_suffix.len, 0) catch + const valid = std.json.validate(arena, schema_json) catch return .{ .text = "Error: out of memory", .is_error = true }; - @memcpy(buf[0..schema_walker_prefix.len], schema_walker_prefix); - @memcpy(buf[schema_walker_prefix.len..][0..schema_json.len], schema_json); - @memcpy(buf[schema_walker_prefix.len + schema_json.len ..][0..schema_walker_suffix.len], schema_walker_suffix); + if (!valid) { + return .{ .text = "Error: invalid EXTRACT schema JSON", .is_error = true }; + } + const script = std.mem.concatWithSentinel(arena, u8, &.{ schema_walker_prefix, schema_json, schema_walker_suffix }, 0) catch + return .{ .text = "Error: out of memory", .is_error = true }; const page = ensurePage(session, registry, null, null, null) catch return .{ .text = 
"Error: page not loaded", .is_error = true }; - return runEval(arena, page, buf); + return runEval(arena, page, script); } -// Schema shape — each value in the user's JSON object is one of: -// "sel" → first match's textContent.trim() (string|null) -// "" → matched element's own textContent.trim() -// ["sel"] → all matches' textContent (string[]) -// {selector, attr} → first match's attribute (string|null) -// [{selector, attr}] → all matches' attributes (string[]) -// [{selector, fields}] → all matches, with `fields` evaluated relative -// to each match (object[]) -// The schema literal is spliced between prefix and suffix verbatim — using a -// format string here would collide with the many `{`/`}` in the walker body. +// The schema literal is spliced between prefix and suffix verbatim — a format +// string here would collide with the `{`/`}` throughout the walker body. const schema_walker_prefix = \\JSON.stringify((function(schema){ \\ function valueOf(m, inner){ diff --git a/src/script/Command.zig b/src/script/Command.zig index 57f9858b..69a6430e 100644 --- a/src/script/Command.zig +++ b/src/script/Command.zig @@ -102,8 +102,6 @@ pub const Command = union(enum) { } }; -/// Emit `KEYWORD ''` for single-line bodies, or the triple-quote block -/// form for bodies that contain newlines. Used by EVAL and EXTRACT. fn writeBlockOrInline(writer: *std.Io.Writer, keyword: []const u8, body: []const u8) std.Io.Writer.Error!void { if (std.mem.indexOfScalar(u8, body, '\n') != null) { try writer.print("{s} '''\n{s}\n'''", .{ keyword, body }); @@ -112,6 +110,14 @@ fn writeBlockOrInline(writer: *std.Io.Writer, keyword: []const u8, body: []const } } +fn splitHead(line: []const u8) struct { head: []const u8, rest: []const u8 } { + const end = std.mem.indexOfAny(u8, line, &std.ascii.whitespace) orelse line.len; + return .{ + .head = line[0..end], + .rest = std.mem.trim(u8, line[end..], &std.ascii.whitespace), + }; +} + /// Parse a line of REPL input into a PandaScript command. 
/// Unrecognized input is returned as `.natural_language`. /// For multi-line EVAL blocks in scripts, use `ScriptParser`. @@ -121,9 +127,9 @@ pub fn parse(line: []const u8) Command { if (trimmed[0] == '#') return .{ .comment = {} }; - const cmd_end = std.mem.indexOfAny(u8, trimmed, &std.ascii.whitespace) orelse trimmed.len; - const cmd_word = trimmed[0..cmd_end]; - const rest = std.mem.trim(u8, trimmed[cmd_end..], &std.ascii.whitespace); + const split = splitHead(trimmed); + const cmd_word = split.head; + const rest = split.rest; if (std.mem.eql(u8, cmd_word, "GOTO")) { if (rest.len == 0) return .{ .natural_language = trimmed }; @@ -285,10 +291,11 @@ pub fn analyzePandaBody(body: []const u8) BodyCursor { if (i >= body.len) break; const ch = body[i]; if (ch == '\'' or ch == '"') { - if (tripleQuotePrefix(body[i..])) |tq| { - const end_idx = std.mem.indexOfPos(u8, body, i + tq.len, tq) orelse + if (QuoteType.fromPrefix(body[i..])) |tq| { + const lit = tq.toLiteral(); + const end_idx = std.mem.indexOfPos(u8, body, i + lit.len, lit) orelse return .{ .complete_args = complete, .at_boundary = false }; - i = end_idx + tq.len; + i = end_idx + lit.len; } else { const end_idx = std.mem.indexOfScalarPos(u8, body, i + 1, ch) orelse return .{ .complete_args = complete, .at_boundary = false }; @@ -309,9 +316,7 @@ pub fn analyzePandaBody(body: []const u8) BodyCursor { /// rejects a line whose first word *looked* like a command — either an argful /// keyword missing its args, or an argless keyword followed by junk. 
pub fn keywordSyntax(line: []const u8) ?KeywordSyntax { - const trimmed = std.mem.trim(u8, line, &std.ascii.whitespace); - const end = std.mem.indexOfAny(u8, trimmed, &std.ascii.whitespace) orelse trimmed.len; - const word = trimmed[0..end]; + const word = splitHead(std.mem.trim(u8, line, &std.ascii.whitespace)).head; for (keywords) |kc| { if (std.mem.eql(u8, word, kc.name)) return kc; } @@ -355,12 +360,9 @@ pub const ScriptIterator = struct { const start_line = self.line_num; const body_or_null = self.collectMultiLineBlock(opener.quote_type); const span_end = self.lines.index orelse self.lines.buffer.len; - const cmd: Command = if (body_or_null) |body| switch (opener.kind) { - .eval => .{ .eval_js = body }, - .extract => .{ .extract = body }, - } else switch (opener.kind) { - .eval => .{ .natural_language = "unterminated EVAL block" }, - .extract => .{ .natural_language = "unterminated EXTRACT block" }, + const cmd: Command = switch (opener.kind) { + .eval => if (body_or_null) |body| .{ .eval_js = body } else .{ .natural_language = "unterminated EVAL block" }, + .extract => if (body_or_null) |body| .{ .extract = body } else .{ .natural_language = "unterminated EXTRACT block" }, }; return .{ .line_num = start_line, @@ -385,30 +387,24 @@ pub const ScriptIterator = struct { /// `EVAL '''a'''` fall through to single-line `parse()`. 
const BlockKeyword = struct { kind: enum { eval, extract }, - quote_type: []const u8, + quote_type: QuoteType, fn fromOpener(line: []const u8) ?BlockKeyword { - const cmd_end = std.mem.indexOfAny(u8, line, &std.ascii.whitespace) orelse return null; - const cmd_word = line[0..cmd_end]; - const rest = std.mem.trim(u8, line[cmd_end..], &std.ascii.whitespace); - const quote_type: []const u8 = if (std.mem.eql(u8, rest, "\"\"\"")) - "\"\"\"" - else if (std.mem.eql(u8, rest, "'''")) - "'''" - else - return null; - if (std.mem.eql(u8, cmd_word, "EVAL")) return .{ .kind = .eval, .quote_type = quote_type }; - if (std.mem.eql(u8, cmd_word, "EXTRACT")) return .{ .kind = .extract, .quote_type = quote_type }; + const split = splitHead(line); + const quote_type = QuoteType.fromLiteral(split.rest) orelse return null; + if (std.mem.eql(u8, split.head, "EVAL")) return .{ .kind = .eval, .quote_type = quote_type }; + if (std.mem.eql(u8, split.head, "EXTRACT")) return .{ .kind = .extract, .quote_type = quote_type }; return null; } }; - fn collectMultiLineBlock(self: *ScriptIterator, quote_type: []const u8) ?[]const u8 { + fn collectMultiLineBlock(self: *ScriptIterator, quote_type: QuoteType) ?[]const u8 { + const closer = quote_type.toLiteral(); var parts: std.ArrayList(u8) = .empty; while (self.lines.next()) |line| { self.line_num += 1; const trimmed = std.mem.trim(u8, line, &std.ascii.whitespace); - if (std.mem.eql(u8, trimmed, quote_type)) { + if (std.mem.eql(u8, trimmed, closer)) { return parts.toOwnedSlice(self.allocator) catch null; } if (parts.items.len > 0) { @@ -426,18 +422,33 @@ const QuotedResult = struct { remainder: []const u8, }; -/// Returns the opening `'''` or `"""` delimiter if `s` starts with one, else null. 
-fn tripleQuotePrefix(s: []const u8) ?[]const u8 { - if (std.mem.startsWith(u8, s, "'''")) return "'''"; - if (std.mem.startsWith(u8, s, "\"\"\"")) return "\"\"\""; - return null; -} +const QuoteType = enum { + triple_double, + triple_single, + + fn fromLiteral(s: []const u8) ?QuoteType { + return if (s.len == 3) fromPrefix(s) else null; + } + + fn fromPrefix(s: []const u8) ?QuoteType { + if (std.mem.startsWith(u8, s, "\"\"\"")) return .triple_double; + if (std.mem.startsWith(u8, s, "'''")) return .triple_single; + return null; + } + + fn toLiteral(self: QuoteType) []const u8 { + return switch (self) { + .triple_double => "\"\"\"", + .triple_single => "'''", + }; + } +}; fn extractQuotedWithRemainder(s: []const u8) ?QuotedResult { if (s.len < 2) return null; - if (tripleQuotePrefix(s)) |tq| { - const end = std.mem.indexOf(u8, s[3..], tq) orelse return null; + if (QuoteType.fromPrefix(s)) |tq| { + const end = std.mem.indexOf(u8, s[3..], tq.toLiteral()) orelse return null; return .{ .value = s[3 .. 3 + end], .remainder = s[3 + end + 3 ..], @@ -461,8 +472,8 @@ fn extractQuotedWithRemainder(s: []const u8) ?QuotedResult { fn trimMatchingQuotes(s: []const u8) ?[]const u8 { if (s.len == 0) return null; - if (tripleQuotePrefix(s)) |tq| { - if (s.len < 6 or !std.mem.endsWith(u8, s, tq)) return null; + if (QuoteType.fromPrefix(s)) |tq| { + if (s.len < 6 or !std.mem.endsWith(u8, s, tq.toLiteral())) return null; const inner = s[3 .. s.len - 3]; return if (inner.len == 0) null else inner; }