refactor: unify triple-quote logic and improve schema validation

Introduces `QuoteType` enum and `splitHead` helper to centralize parsing
logic. Updates `extractSchema` to use `std.json.validate` for efficiency
and simplifies script construction.
This commit is contained in:
Adrià Arrufat
2026-05-12 20:09:47 +02:00
parent 884b9cccfd
commit 40b63306e4
2 changed files with 72 additions and 68 deletions

View File

@@ -457,47 +457,40 @@ pub fn extractText(
return runEval(arena, page, eval_script);
}
/// Schema-driven extraction. The schema is parsed in Zig so syntax errors
/// point at the user's schema instead of dumping the walker JS into a V8
/// SyntaxError; the parsed value is discarded and the raw text is spliced
/// into the walker for a single atomic eval. See `schema_walker_prefix` for
/// the supported schema shape.
/// Schema-driven extraction. The schema is parsed in Zig so a syntax error
/// surfaces here instead of as a confusing V8 SyntaxError on the spliced
/// walker. Each value in the schema object is one of:
/// "sel" → first match's textContent.trim() (string|null)
/// "" → matched element's own textContent.trim()
/// ["sel"] → all matches' textContent (string[])
/// {selector, attr} → first match's attribute (string|null)
/// [{selector, attr}] → all matches' attributes (string[])
/// [{selector, fields}] → all matches, with `fields` relative to each (object[])
pub fn extractSchema(
arena: std.mem.Allocator,
session: *lp.Session,
registry: *CDPNode.Registry,
schema_json: []const u8,
) EvalResult {
const parsed = std.json.parseFromSliceLeaky(std.json.Value, arena, schema_json, .{}) catch |err| {
const msg = std.fmt.allocPrint(arena, "Error: invalid EXTRACT schema JSON: {s}", .{@errorName(err)}) catch
return .{ .text = "Error: invalid EXTRACT schema JSON", .is_error = true };
return .{ .text = msg, .is_error = true };
};
if (parsed != .object) {
const trimmed = std.mem.trim(u8, schema_json, &std.ascii.whitespace);
if (trimmed.len == 0 or trimmed[0] != '{') {
return .{ .text = "Error: EXTRACT schema must be a JSON object", .is_error = true };
}
const buf = arena.allocSentinel(u8, schema_walker_prefix.len + schema_json.len + schema_walker_suffix.len, 0) catch
const valid = std.json.validate(arena, schema_json) catch
return .{ .text = "Error: out of memory", .is_error = true };
@memcpy(buf[0..schema_walker_prefix.len], schema_walker_prefix);
@memcpy(buf[schema_walker_prefix.len..][0..schema_json.len], schema_json);
@memcpy(buf[schema_walker_prefix.len + schema_json.len ..][0..schema_walker_suffix.len], schema_walker_suffix);
if (!valid) {
return .{ .text = "Error: invalid EXTRACT schema JSON", .is_error = true };
}
const script = std.mem.concatWithSentinel(arena, u8, &.{ schema_walker_prefix, schema_json, schema_walker_suffix }, 0) catch
return .{ .text = "Error: out of memory", .is_error = true };
const page = ensurePage(session, registry, null, null, null) catch
return .{ .text = "Error: page not loaded", .is_error = true };
return runEval(arena, page, buf);
return runEval(arena, page, script);
}
// Schema shape — each value in the user's JSON object is one of:
// "sel" → first match's textContent.trim() (string|null)
// "" → matched element's own textContent.trim()
// ["sel"] → all matches' textContent (string[])
// {selector, attr} → first match's attribute (string|null)
// [{selector, attr}] → all matches' attributes (string[])
// [{selector, fields}] → all matches, with `fields` evaluated relative
// to each match (object[])
// The schema literal is spliced between prefix and suffix verbatim — using a
// format string here would collide with the many `{`/`}` in the walker body.
// The schema literal is spliced between prefix and suffix verbatim — a format
// string here would collide with the `{`/`}` throughout the walker body.
const schema_walker_prefix =
\\JSON.stringify((function(schema){
\\ function valueOf(m, inner){

View File

@@ -102,8 +102,6 @@ pub const Command = union(enum) {
}
};
/// Emit `KEYWORD '<body>'` for single-line bodies, or the triple-quote block
/// form for bodies that contain newlines. Used by EVAL and EXTRACT.
fn writeBlockOrInline(writer: *std.Io.Writer, keyword: []const u8, body: []const u8) std.Io.Writer.Error!void {
if (std.mem.indexOfScalar(u8, body, '\n') != null) {
try writer.print("{s} '''\n{s}\n'''", .{ keyword, body });
@@ -112,6 +110,14 @@ fn writeBlockOrInline(writer: *std.Io.Writer, keyword: []const u8, body: []const
}
}
/// Split `line` at its first ASCII-whitespace byte.
///
/// `head` is everything before that byte (the whole line when it contains no
/// whitespace); `rest` is what follows, with surrounding whitespace trimmed.
/// Both fields are sub-slices of `line`; nothing is allocated. The caller is
/// expected to pass an already left-trimmed line: leading whitespace yields an
/// empty `head`.
fn splitHead(line: []const u8) struct { head: []const u8, rest: []const u8 } {
    const ws = &std.ascii.whitespace;
    const cut = if (std.mem.indexOfAny(u8, line, ws)) |idx| idx else line.len;
    const tail = std.mem.trim(u8, line[cut..], ws);
    return .{ .head = line[0..cut], .rest = tail };
}
/// Parse a line of REPL input into a PandaScript command.
/// Unrecognized input is returned as `.natural_language`.
/// For multi-line EVAL blocks in scripts, use `ScriptParser`.
@@ -121,9 +127,9 @@ pub fn parse(line: []const u8) Command {
if (trimmed[0] == '#') return .{ .comment = {} };
const cmd_end = std.mem.indexOfAny(u8, trimmed, &std.ascii.whitespace) orelse trimmed.len;
const cmd_word = trimmed[0..cmd_end];
const rest = std.mem.trim(u8, trimmed[cmd_end..], &std.ascii.whitespace);
const split = splitHead(trimmed);
const cmd_word = split.head;
const rest = split.rest;
if (std.mem.eql(u8, cmd_word, "GOTO")) {
if (rest.len == 0) return .{ .natural_language = trimmed };
@@ -285,10 +291,11 @@ pub fn analyzePandaBody(body: []const u8) BodyCursor {
if (i >= body.len) break;
const ch = body[i];
if (ch == '\'' or ch == '"') {
if (tripleQuotePrefix(body[i..])) |tq| {
const end_idx = std.mem.indexOfPos(u8, body, i + tq.len, tq) orelse
if (QuoteType.fromPrefix(body[i..])) |tq| {
const lit = tq.toLiteral();
const end_idx = std.mem.indexOfPos(u8, body, i + lit.len, lit) orelse
return .{ .complete_args = complete, .at_boundary = false };
i = end_idx + tq.len;
i = end_idx + lit.len;
} else {
const end_idx = std.mem.indexOfScalarPos(u8, body, i + 1, ch) orelse
return .{ .complete_args = complete, .at_boundary = false };
@@ -309,9 +316,7 @@ pub fn analyzePandaBody(body: []const u8) BodyCursor {
/// rejects a line whose first word *looked* like a command — either an argful
/// keyword missing its args, or an argless keyword followed by junk.
pub fn keywordSyntax(line: []const u8) ?KeywordSyntax {
const trimmed = std.mem.trim(u8, line, &std.ascii.whitespace);
const end = std.mem.indexOfAny(u8, trimmed, &std.ascii.whitespace) orelse trimmed.len;
const word = trimmed[0..end];
const word = splitHead(std.mem.trim(u8, line, &std.ascii.whitespace)).head;
for (keywords) |kc| {
if (std.mem.eql(u8, word, kc.name)) return kc;
}
@@ -355,12 +360,9 @@ pub const ScriptIterator = struct {
const start_line = self.line_num;
const body_or_null = self.collectMultiLineBlock(opener.quote_type);
const span_end = self.lines.index orelse self.lines.buffer.len;
const cmd: Command = if (body_or_null) |body| switch (opener.kind) {
.eval => .{ .eval_js = body },
.extract => .{ .extract = body },
} else switch (opener.kind) {
.eval => .{ .natural_language = "unterminated EVAL block" },
.extract => .{ .natural_language = "unterminated EXTRACT block" },
const cmd: Command = switch (opener.kind) {
.eval => if (body_or_null) |body| .{ .eval_js = body } else .{ .natural_language = "unterminated EVAL block" },
.extract => if (body_or_null) |body| .{ .extract = body } else .{ .natural_language = "unterminated EXTRACT block" },
};
return .{
.line_num = start_line,
@@ -385,30 +387,24 @@ pub const ScriptIterator = struct {
/// `EVAL '''a'''` fall through to single-line `parse()`.
const BlockKeyword = struct {
kind: enum { eval, extract },
quote_type: []const u8,
quote_type: QuoteType,
fn fromOpener(line: []const u8) ?BlockKeyword {
const cmd_end = std.mem.indexOfAny(u8, line, &std.ascii.whitespace) orelse return null;
const cmd_word = line[0..cmd_end];
const rest = std.mem.trim(u8, line[cmd_end..], &std.ascii.whitespace);
const quote_type: []const u8 = if (std.mem.eql(u8, rest, "\"\"\""))
"\"\"\""
else if (std.mem.eql(u8, rest, "'''"))
"'''"
else
return null;
if (std.mem.eql(u8, cmd_word, "EVAL")) return .{ .kind = .eval, .quote_type = quote_type };
if (std.mem.eql(u8, cmd_word, "EXTRACT")) return .{ .kind = .extract, .quote_type = quote_type };
const split = splitHead(line);
const quote_type = QuoteType.fromLiteral(split.rest) orelse return null;
if (std.mem.eql(u8, split.head, "EVAL")) return .{ .kind = .eval, .quote_type = quote_type };
if (std.mem.eql(u8, split.head, "EXTRACT")) return .{ .kind = .extract, .quote_type = quote_type };
return null;
}
};
fn collectMultiLineBlock(self: *ScriptIterator, quote_type: []const u8) ?[]const u8 {
fn collectMultiLineBlock(self: *ScriptIterator, quote_type: QuoteType) ?[]const u8 {
const closer = quote_type.toLiteral();
var parts: std.ArrayList(u8) = .empty;
while (self.lines.next()) |line| {
self.line_num += 1;
const trimmed = std.mem.trim(u8, line, &std.ascii.whitespace);
if (std.mem.eql(u8, trimmed, quote_type)) {
if (std.mem.eql(u8, trimmed, closer)) {
return parts.toOwnedSlice(self.allocator) catch null;
}
if (parts.items.len > 0) {
@@ -426,18 +422,33 @@ const QuotedResult = struct {
remainder: []const u8,
};
/// Report which triple-quote delimiter (`'''` or `"""`) `s` begins with.
/// Returns the matching delimiter as a static string, or null when `s`
/// starts with neither.
fn tripleQuotePrefix(s: []const u8) ?[]const u8 {
    const delimiters = [_][]const u8{ "'''", "\"\"\"" };
    for (delimiters) |delim| {
        if (std.mem.startsWith(u8, s, delim)) return delim;
    }
    return null;
}
/// The two triple-quote delimiters recognised by the parser: `"""` and `'''`.
const QuoteType = enum {
    triple_double,
    triple_single,

    /// Match `s` only when it is exactly one delimiter (three bytes, nothing
    /// else). Returns null for any other input.
    fn fromLiteral(s: []const u8) ?QuoteType {
        if (s.len != 3) return null;
        return fromPrefix(s);
    }

    /// Match `s` when it begins with a delimiter; trailing bytes are ignored.
    /// Returns null when `s` starts with neither delimiter.
    fn fromPrefix(s: []const u8) ?QuoteType {
        const candidates = [_]QuoteType{ .triple_double, .triple_single };
        for (candidates) |candidate| {
            if (std.mem.startsWith(u8, s, candidate.toLiteral())) return candidate;
        }
        return null;
    }

    /// The three-byte delimiter text for this quote type.
    fn toLiteral(self: QuoteType) []const u8 {
        switch (self) {
            .triple_double => return "\"\"\"",
            .triple_single => return "'''",
        }
    }
};
fn extractQuotedWithRemainder(s: []const u8) ?QuotedResult {
if (s.len < 2) return null;
if (tripleQuotePrefix(s)) |tq| {
const end = std.mem.indexOf(u8, s[3..], tq) orelse return null;
if (QuoteType.fromPrefix(s)) |tq| {
const end = std.mem.indexOf(u8, s[3..], tq.toLiteral()) orelse return null;
return .{
.value = s[3 .. 3 + end],
.remainder = s[3 + end + 3 ..],
@@ -461,8 +472,8 @@ fn extractQuotedWithRemainder(s: []const u8) ?QuotedResult {
fn trimMatchingQuotes(s: []const u8) ?[]const u8 {
if (s.len == 0) return null;
if (tripleQuotePrefix(s)) |tq| {
if (s.len < 6 or !std.mem.endsWith(u8, s, tq)) return null;
if (QuoteType.fromPrefix(s)) |tq| {
if (s.len < 6 or !std.mem.endsWith(u8, s, tq.toLiteral())) return null;
const inner = s[3 .. s.len - 3];
return if (inner.len == 0) null else inner;
}