mirror of
https://github.com/lightpanda-io/browser.git
synced 2026-06-11 09:35:59 -04:00
refactor: unify triple-quote logic and improve schema validation
Introduces `QuoteType` enum and `splitHead` helper to centralize parsing logic. Updates `extractSchema` to use `std.json.validate` for efficiency and simplifies script construction.
This commit is contained in:
@@ -457,47 +457,40 @@ pub fn extractText(
|
||||
return runEval(arena, page, eval_script);
|
||||
}
|
||||
|
||||
/// Schema-driven extraction. The schema is parsed in Zig so syntax errors
|
||||
/// point at the user's schema instead of dumping the walker JS into a V8
|
||||
/// SyntaxError; the parsed value is discarded and the raw text is spliced
|
||||
/// into the walker for a single atomic eval. See `schema_walker_prefix` for
|
||||
/// the supported schema shape.
|
||||
/// Schema-driven extraction. The schema is parsed in Zig so a syntax error
|
||||
/// surfaces here instead of as a confusing V8 SyntaxError on the spliced
|
||||
/// walker. Each value in the schema object is one of:
|
||||
/// "sel" → first match's textContent.trim() (string|null)
|
||||
/// "" → matched element's own textContent.trim()
|
||||
/// ["sel"] → all matches' textContent (string[])
|
||||
/// {selector, attr} → first match's attribute (string|null)
|
||||
/// [{selector, attr}] → all matches' attributes (string[])
|
||||
/// [{selector, fields}] → all matches, with `fields` relative to each (object[])
|
||||
pub fn extractSchema(
|
||||
arena: std.mem.Allocator,
|
||||
session: *lp.Session,
|
||||
registry: *CDPNode.Registry,
|
||||
schema_json: []const u8,
|
||||
) EvalResult {
|
||||
const parsed = std.json.parseFromSliceLeaky(std.json.Value, arena, schema_json, .{}) catch |err| {
|
||||
const msg = std.fmt.allocPrint(arena, "Error: invalid EXTRACT schema JSON: {s}", .{@errorName(err)}) catch
|
||||
return .{ .text = "Error: invalid EXTRACT schema JSON", .is_error = true };
|
||||
return .{ .text = msg, .is_error = true };
|
||||
};
|
||||
if (parsed != .object) {
|
||||
const trimmed = std.mem.trim(u8, schema_json, &std.ascii.whitespace);
|
||||
if (trimmed.len == 0 or trimmed[0] != '{') {
|
||||
return .{ .text = "Error: EXTRACT schema must be a JSON object", .is_error = true };
|
||||
}
|
||||
|
||||
const buf = arena.allocSentinel(u8, schema_walker_prefix.len + schema_json.len + schema_walker_suffix.len, 0) catch
|
||||
const valid = std.json.validate(arena, schema_json) catch
|
||||
return .{ .text = "Error: out of memory", .is_error = true };
|
||||
@memcpy(buf[0..schema_walker_prefix.len], schema_walker_prefix);
|
||||
@memcpy(buf[schema_walker_prefix.len..][0..schema_json.len], schema_json);
|
||||
@memcpy(buf[schema_walker_prefix.len + schema_json.len ..][0..schema_walker_suffix.len], schema_walker_suffix);
|
||||
if (!valid) {
|
||||
return .{ .text = "Error: invalid EXTRACT schema JSON", .is_error = true };
|
||||
}
|
||||
|
||||
const script = std.mem.concatWithSentinel(arena, u8, &.{ schema_walker_prefix, schema_json, schema_walker_suffix }, 0) catch
|
||||
return .{ .text = "Error: out of memory", .is_error = true };
|
||||
const page = ensurePage(session, registry, null, null, null) catch
|
||||
return .{ .text = "Error: page not loaded", .is_error = true };
|
||||
return runEval(arena, page, buf);
|
||||
return runEval(arena, page, script);
|
||||
}
|
||||
|
||||
// Schema shape — each value in the user's JSON object is one of:
|
||||
// "sel" → first match's textContent.trim() (string|null)
|
||||
// "" → matched element's own textContent.trim()
|
||||
// ["sel"] → all matches' textContent (string[])
|
||||
// {selector, attr} → first match's attribute (string|null)
|
||||
// [{selector, attr}] → all matches' attributes (string[])
|
||||
// [{selector, fields}] → all matches, with `fields` evaluated relative
|
||||
// to each match (object[])
|
||||
// The schema literal is spliced between prefix and suffix verbatim — using a
|
||||
// format string here would collide with the many `{`/`}` in the walker body.
|
||||
// The schema literal is spliced between prefix and suffix verbatim — a format
|
||||
// string here would collide with the `{`/`}` throughout the walker body.
|
||||
const schema_walker_prefix =
|
||||
\\JSON.stringify((function(schema){
|
||||
\\ function valueOf(m, inner){
|
||||
|
||||
@@ -102,8 +102,6 @@ pub const Command = union(enum) {
|
||||
}
|
||||
};
|
||||
|
||||
/// Emit `KEYWORD '<body>'` for single-line bodies, or the triple-quote block
|
||||
/// form for bodies that contain newlines. Used by EVAL and EXTRACT.
|
||||
fn writeBlockOrInline(writer: *std.Io.Writer, keyword: []const u8, body: []const u8) std.Io.Writer.Error!void {
|
||||
if (std.mem.indexOfScalar(u8, body, '\n') != null) {
|
||||
try writer.print("{s} '''\n{s}\n'''", .{ keyword, body });
|
||||
@@ -112,6 +110,14 @@ fn writeBlockOrInline(writer: *std.Io.Writer, keyword: []const u8, body: []const
|
||||
}
|
||||
}
|
||||
|
||||
/// Split `line` at its first ASCII-whitespace byte.
///
/// `head` is everything before that byte, or the whole of `line` when it
/// contains no whitespace. `rest` is everything from that byte onward with
/// ASCII whitespace trimmed from both ends, so it is empty when nothing
/// follows `head`. Both fields are sub-slices of `line`.
///
/// Leading whitespace is not skipped: if `line` begins with whitespace,
/// `head` is empty.
fn splitHead(line: []const u8) struct { head: []const u8, rest: []const u8 } {
|
||||
// Index of the first whitespace byte, or the end of the line if there is none.
const end = std.mem.indexOfAny(u8, line, &std.ascii.whitespace) orelse line.len;
|
||||
return .{
|
||||
.head = line[0..end],
|
||||
.rest = std.mem.trim(u8, line[end..], &std.ascii.whitespace),
|
||||
};
|
||||
}
|
||||
|
||||
/// Parse a line of REPL input into a PandaScript command.
|
||||
/// Unrecognized input is returned as `.natural_language`.
|
||||
/// For multi-line EVAL blocks in scripts, use `ScriptParser`.
|
||||
@@ -121,9 +127,9 @@ pub fn parse(line: []const u8) Command {
|
||||
|
||||
if (trimmed[0] == '#') return .{ .comment = {} };
|
||||
|
||||
const cmd_end = std.mem.indexOfAny(u8, trimmed, &std.ascii.whitespace) orelse trimmed.len;
|
||||
const cmd_word = trimmed[0..cmd_end];
|
||||
const rest = std.mem.trim(u8, trimmed[cmd_end..], &std.ascii.whitespace);
|
||||
const split = splitHead(trimmed);
|
||||
const cmd_word = split.head;
|
||||
const rest = split.rest;
|
||||
|
||||
if (std.mem.eql(u8, cmd_word, "GOTO")) {
|
||||
if (rest.len == 0) return .{ .natural_language = trimmed };
|
||||
@@ -285,10 +291,11 @@ pub fn analyzePandaBody(body: []const u8) BodyCursor {
|
||||
if (i >= body.len) break;
|
||||
const ch = body[i];
|
||||
if (ch == '\'' or ch == '"') {
|
||||
if (tripleQuotePrefix(body[i..])) |tq| {
|
||||
const end_idx = std.mem.indexOfPos(u8, body, i + tq.len, tq) orelse
|
||||
if (QuoteType.fromPrefix(body[i..])) |tq| {
|
||||
const lit = tq.toLiteral();
|
||||
const end_idx = std.mem.indexOfPos(u8, body, i + lit.len, lit) orelse
|
||||
return .{ .complete_args = complete, .at_boundary = false };
|
||||
i = end_idx + tq.len;
|
||||
i = end_idx + lit.len;
|
||||
} else {
|
||||
const end_idx = std.mem.indexOfScalarPos(u8, body, i + 1, ch) orelse
|
||||
return .{ .complete_args = complete, .at_boundary = false };
|
||||
@@ -309,9 +316,7 @@ pub fn analyzePandaBody(body: []const u8) BodyCursor {
|
||||
/// rejects a line whose first word *looked* like a command — either an argful
|
||||
/// keyword missing its args, or an argless keyword followed by junk.
|
||||
pub fn keywordSyntax(line: []const u8) ?KeywordSyntax {
|
||||
const trimmed = std.mem.trim(u8, line, &std.ascii.whitespace);
|
||||
const end = std.mem.indexOfAny(u8, trimmed, &std.ascii.whitespace) orelse trimmed.len;
|
||||
const word = trimmed[0..end];
|
||||
const word = splitHead(std.mem.trim(u8, line, &std.ascii.whitespace)).head;
|
||||
for (keywords) |kc| {
|
||||
if (std.mem.eql(u8, word, kc.name)) return kc;
|
||||
}
|
||||
@@ -355,12 +360,9 @@ pub const ScriptIterator = struct {
|
||||
const start_line = self.line_num;
|
||||
const body_or_null = self.collectMultiLineBlock(opener.quote_type);
|
||||
const span_end = self.lines.index orelse self.lines.buffer.len;
|
||||
const cmd: Command = if (body_or_null) |body| switch (opener.kind) {
|
||||
.eval => .{ .eval_js = body },
|
||||
.extract => .{ .extract = body },
|
||||
} else switch (opener.kind) {
|
||||
.eval => .{ .natural_language = "unterminated EVAL block" },
|
||||
.extract => .{ .natural_language = "unterminated EXTRACT block" },
|
||||
const cmd: Command = switch (opener.kind) {
|
||||
.eval => if (body_or_null) |body| .{ .eval_js = body } else .{ .natural_language = "unterminated EVAL block" },
|
||||
.extract => if (body_or_null) |body| .{ .extract = body } else .{ .natural_language = "unterminated EXTRACT block" },
|
||||
};
|
||||
return .{
|
||||
.line_num = start_line,
|
||||
@@ -385,30 +387,24 @@ pub const ScriptIterator = struct {
|
||||
/// `EVAL '''a'''` fall through to single-line `parse()`.
|
||||
const BlockKeyword = struct {
|
||||
kind: enum { eval, extract },
|
||||
quote_type: []const u8,
|
||||
quote_type: QuoteType,
|
||||
|
||||
fn fromOpener(line: []const u8) ?BlockKeyword {
|
||||
const cmd_end = std.mem.indexOfAny(u8, line, &std.ascii.whitespace) orelse return null;
|
||||
const cmd_word = line[0..cmd_end];
|
||||
const rest = std.mem.trim(u8, line[cmd_end..], &std.ascii.whitespace);
|
||||
const quote_type: []const u8 = if (std.mem.eql(u8, rest, "\"\"\""))
|
||||
"\"\"\""
|
||||
else if (std.mem.eql(u8, rest, "'''"))
|
||||
"'''"
|
||||
else
|
||||
return null;
|
||||
if (std.mem.eql(u8, cmd_word, "EVAL")) return .{ .kind = .eval, .quote_type = quote_type };
|
||||
if (std.mem.eql(u8, cmd_word, "EXTRACT")) return .{ .kind = .extract, .quote_type = quote_type };
|
||||
const split = splitHead(line);
|
||||
const quote_type = QuoteType.fromLiteral(split.rest) orelse return null;
|
||||
if (std.mem.eql(u8, split.head, "EVAL")) return .{ .kind = .eval, .quote_type = quote_type };
|
||||
if (std.mem.eql(u8, split.head, "EXTRACT")) return .{ .kind = .extract, .quote_type = quote_type };
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
fn collectMultiLineBlock(self: *ScriptIterator, quote_type: []const u8) ?[]const u8 {
|
||||
fn collectMultiLineBlock(self: *ScriptIterator, quote_type: QuoteType) ?[]const u8 {
|
||||
const closer = quote_type.toLiteral();
|
||||
var parts: std.ArrayList(u8) = .empty;
|
||||
while (self.lines.next()) |line| {
|
||||
self.line_num += 1;
|
||||
const trimmed = std.mem.trim(u8, line, &std.ascii.whitespace);
|
||||
if (std.mem.eql(u8, trimmed, quote_type)) {
|
||||
if (std.mem.eql(u8, trimmed, closer)) {
|
||||
return parts.toOwnedSlice(self.allocator) catch null;
|
||||
}
|
||||
if (parts.items.len > 0) {
|
||||
@@ -426,18 +422,33 @@ const QuotedResult = struct {
|
||||
remainder: []const u8,
|
||||
};
|
||||
|
||||
/// Returns the opening `'''` or `"""` delimiter if `s` starts with one, else null.
|
||||
fn tripleQuotePrefix(s: []const u8) ?[]const u8 {
|
||||
if (std.mem.startsWith(u8, s, "'''")) return "'''";
|
||||
if (std.mem.startsWith(u8, s, "\"\"\"")) return "\"\"\"";
|
||||
return null;
|
||||
}
|
||||
/// Identifies a triple-quote delimiter: `"""` (`triple_double`) or `'''`
/// (`triple_single`).
const QuoteType = enum {
|
||||
triple_double,
|
||||
triple_single,
|
||||
|
||||
/// Returns the quote type when `s` is exactly one three-byte delimiter
/// (`"""` or `'''`), else null. Longer or shorter input is rejected.
fn fromLiteral(s: []const u8) ?QuoteType {
|
||||
return if (s.len == 3) fromPrefix(s) else null;
|
||||
}
|
||||
|
||||
/// Returns the quote type whose delimiter `s` starts with, else null.
/// Anything may follow the delimiter.
fn fromPrefix(s: []const u8) ?QuoteType {
|
||||
if (std.mem.startsWith(u8, s, "\"\"\"")) return .triple_double;
|
||||
if (std.mem.startsWith(u8, s, "'''")) return .triple_single;
|
||||
return null;
|
||||
}
|
||||
|
||||
/// Returns the three-character delimiter text for this quote type.
fn toLiteral(self: QuoteType) []const u8 {
|
||||
return switch (self) {
|
||||
.triple_double => "\"\"\"",
|
||||
.triple_single => "'''",
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
fn extractQuotedWithRemainder(s: []const u8) ?QuotedResult {
|
||||
if (s.len < 2) return null;
|
||||
|
||||
if (tripleQuotePrefix(s)) |tq| {
|
||||
const end = std.mem.indexOf(u8, s[3..], tq) orelse return null;
|
||||
if (QuoteType.fromPrefix(s)) |tq| {
|
||||
const end = std.mem.indexOf(u8, s[3..], tq.toLiteral()) orelse return null;
|
||||
return .{
|
||||
.value = s[3 .. 3 + end],
|
||||
.remainder = s[3 + end + 3 ..],
|
||||
@@ -461,8 +472,8 @@ fn extractQuotedWithRemainder(s: []const u8) ?QuotedResult {
|
||||
fn trimMatchingQuotes(s: []const u8) ?[]const u8 {
|
||||
if (s.len == 0) return null;
|
||||
|
||||
if (tripleQuotePrefix(s)) |tq| {
|
||||
if (s.len < 6 or !std.mem.endsWith(u8, s, tq)) return null;
|
||||
if (QuoteType.fromPrefix(s)) |tq| {
|
||||
if (s.len < 6 or !std.mem.endsWith(u8, s, tq.toLiteral())) return null;
|
||||
const inner = s[3 .. s.len - 3];
|
||||
return if (inner.len == 0) null else inner;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user