From b141da30ca5722dc55d0a0aeec3a4250fbe5f7bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A0=20Arrufat?= Date: Tue, 9 Jun 2026 16:49:50 +0200 Subject: [PATCH] agent: verify synthesized scripts during /save Introduces a multi-step synthesis process for `/save` that derives a logical JSON output schema and uses a dry-run runtime to verify candidate scripts. The LLM can now run and self-correct its scripts using a new `run_script` tool before finalizing the save. --- src/agent/Agent.zig | 368 ++++++++++++++++++++++++++++++++--------- src/agent/Spinner.zig | 37 ++++- src/agent/Terminal.zig | 13 +- src/browser/tools.zig | 95 +++++++---- src/script/Runtime.zig | 159 ++++++++++++++---- src/string.zig | 49 ++++++ 6 files changed, 574 insertions(+), 147 deletions(-) diff --git a/src/agent/Agent.zig b/src/agent/Agent.zig index 0f2ba086..4d138cc1 100644 --- a/src/agent/Agent.zig +++ b/src/agent/Agent.zig @@ -110,6 +110,14 @@ node_registry: CDPNode.Registry, terminal: Terminal, save_buffer: Recorder, save_path: ?[]u8, +/// Backs `last_extract_json`; reset alongside `save_buffer`. +last_extract_arena: std.heap.ArenaAllocator, +/// The JSON the most recent successful `extract` returned this session — the +/// real data `/save` grounds and verifies its synthesized script against. +last_extract_json: ?[]const u8 = null, +/// Set for the duration of an LLM `/save` so the `run_script` tool can reach +/// the dry-run runtime it executes candidates on. 
+active_verify: ?*Verify = null, script_runtime_mutex: std.Thread.Mutex = .{}, active_script_runtime: ?*ScriptRuntime = null, conversation: Conversation, @@ -254,6 +262,7 @@ pub fn init(allocator: std.mem.Allocator, app: *App, opts: Config.Agent) !*Agent .terminal = .init(allocator, history_paths, verbosity, will_repl), .save_buffer = .init(allocator), .save_path = null, + .last_extract_arena = .init(allocator), .conversation = .init(allocator, opts.system_prompt orelse default_system_prompt), .model = model, .effort = effort, @@ -294,6 +303,7 @@ pub fn init(allocator: std.mem.Allocator, app: *App, opts: Config.Agent) !*Agent pub fn deinit(self: *Agent) void { self.terminal.uninstallLogSink(); self.save_buffer.deinit(); + self.last_extract_arena.deinit(); if (self.save_path) |p| self.allocator.free(p); self.terminal.deinit(); self.conversation.deinit(); @@ -630,11 +640,19 @@ fn handleUsage(self: *Agent) void { /// node IDs. Shared by `/clear` and `/reset`. fn clearConversation(self: *Agent) void { self.conversation.rollback(0); - self.save_buffer.reset(); + self.resetSaveBuffers(); self.total_usage = .{}; self.node_registry.reset(); } +/// Drop everything `/save` accumulates: the recorded action buffer and the +/// captured extract data that grounds synthesis. +fn resetSaveBuffers(self: *Agent) void { + self.save_buffer.reset(); + _ = self.last_extract_arena.reset(.retain_capacity); + self.last_extract_json = null; +} + /// Forget the conversation while leaving the browser session live — loaded page /// stays put, cookies/logins preserved. fn handleClear(self: *Agent) void { @@ -862,7 +880,7 @@ fn handleSave(self: *Agent, arena: std.mem.Allocator, rest: []const u8) void { new_save_path = null; } const saved_lines = self.save_buffer.lines; - self.save_buffer.reset(); + self.resetSaveBuffers(); self.terminal.printInfo("Saved {d} line(s) to {s}", .{ saved_lines, self.save_path.? 
}); } @@ -958,10 +976,130 @@ fn abortSave(self: *Agent, baseline: usize, reason: []const u8) void { self.failSave(reason); } -/// LLM-synthesized `/save`: hand the model the builtin catalog, the full -/// conversation, and the deterministic record of what ran, then write the -/// idiomatic script it returns. +/// In-flight `/save` verification harness: the dry-run runtime the `run_script` +/// tool executes candidates on, plus the last source it ran (a fallback script +/// if the model finishes the loop without re-emitting it as text). +const Verify = struct { + runtime: *ScriptRuntime, + last_source: ?[]const u8 = null, +}; + +/// Agent-only addendum (kept out of the shared `save_synthesis_prompt`) telling +/// the model to derive every value at runtime and check the result with run_script. +const save_verify_addendum = + \\Read data with the recorded extract(...), not evaluate() — extract can read a + \\card's whole text via an empty selector (""). Reshape its result in plain JS so the + \\completion value matches the schema exactly (same keys, parsed numbers); don't + \\return the raw extract or hard-code values. + \\Before finalizing, test with run_script: it runs your FULL script for real from a + \\blank page, so it must goto(...) first (missing goto → "no page loaded", a wrong + \\selector → null). Confirm every field is populated, then reply with ONLY the final + \\JavaScript source. +; + +/// Cap on the captured extract sample shown in the synthesis prompt (the full +/// data still feeds the dry run); keeps a large result from dominating context. +const save_sample_cap = 8 * 1024; + +/// LLM-synthesized `/save`. Pin the output shape first — derive the session's +/// intent, then a typed output schema from it — so the script's result shape is +/// stable across runs, then synthesize the script honoring that schema. Each +/// step degrades gracefully: a null schema falls back to plain synthesis. 
fn synthesizeSave(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u8, prompt: ?[]const u8) void { + self.conversation.ensureSystemPrompt() catch return self.failSave("out of memory"); + const baseline = self.conversation.messages.items.len; + + const anchor = prompt orelse self.one_shot_task; + const schema = self.deriveOutputSchema(arena, baseline, anchor); + if (self.cancel_requested.load(.acquire)) { + self.resetAfterCancel(baseline); + return; + } + + self.synthesizeScript(arena, filename, prompt, schema); +} + +/// Steps 1–2 of `/save`: intent (over the session) → typed output schema. Both +/// turns leave the conversation as they found it; returns null if either turn +/// produced nothing usable (the caller then synthesizes without a schema). +fn deriveOutputSchema(self: *Agent, arena: std.mem.Allocator, baseline: usize, anchor: ?[]const u8) ?[]const u8 { + const intent = self.deriveIntent(arena, baseline, anchor) orelse return null; + if (self.cancel_requested.load(.acquire)) return null; + return self.deriveSchema(arena, intent); +} + +/// One-sentence intent from the session turns. Runs over the live conversation +/// (so the model sees the session) but rolls back to `baseline`, keeping the +/// turn out of history. An explicit anchor is folded in as authoritative. 
+fn deriveIntent(self: *Agent, arena: std.mem.Allocator, baseline: usize, anchor: ?[]const u8) ?[]const u8 { + const ma = self.conversation.arena.allocator(); + var out: std.Io.Writer.Allocating = .init(ma); + out.writer.writeAll(browser_tools.save_intent_prompt) catch return null; + if (anchor) |a| { + out.writer.print("\nThe user described the goal as: {s}\nTreat that as authoritative and reconcile it with the session.", .{a}) catch return null; + } + self.conversation.messages.append(self.allocator, .{ .role = .user, .content = out.written() }) catch return null; + defer self.conversation.rollback(baseline); + return self.runTextTurn(&self.conversation.messages, arena, self.allocator, ma, 512, "understanding the task"); +} + +/// Typed output schema from the intent. Runs over a throwaway message list — +/// not the conversation — so the schema is derived from the logical intent +/// alone, blind to the page structure and how the data was fetched. +fn deriveSchema(self: *Agent, arena: std.mem.Allocator, intent: []const u8) ?[]const u8 { + var msgs: std.ArrayList(zenai.provider.Message) = .empty; + const msg = std.fmt.allocPrint(arena, "{s} {s}", .{ browser_tools.save_schema_prompt, intent }) catch return null; + msgs.append(arena, .{ .role = .user, .content = msg }) catch return null; + const raw = self.runTextTurn(&msgs, arena, arena, arena, 1024, "designing the output schema") orelse return null; + return string.stripCodeFence(raw); +} + +/// Run a single no-tools text turn over `messages` and return the model's text +/// duped into `dest` (so it survives any rollback of `messages`), or null on +/// cancel, error, or empty output. Shared by the intent and schema steps. 
+fn runTextTurn( + self: *Agent, + messages: *std.ArrayList(zenai.provider.Message), + dest: std.mem.Allocator, + list_alloc: std.mem.Allocator, + data_alloc: std.mem.Allocator, + max_tokens: i32, + status: []const u8, +) ?[]const u8 { + self.terminal.spinner.start(); + self.terminal.spinner.setStatus(status); + var result = self.ai_client.?.runTools( + self.model, + messages, + list_alloc, + data_alloc, + .{ .context = @ptrCast(self), .callFn = handleToolCall }, + .{ + .tools = &.{}, + .max_turns = 1, + .max_tokens = max_tokens, + .tool_choice = .none, + .effort = .low, + .cancel = .{ .context = @ptrCast(self), .checkFn = checkCancel }, + }, + ) catch |err| { + self.terminal.spinner.cancel(); + if (!self.cancel_requested.load(.acquire)) log.err(.app, "AI save schema turn error", .{ .err = err }); + return null; + }; + self.terminal.spinner.stop(); + defer result.deinit(); + self.total_usage.add(result.usage); + if (result.cancelled) return null; + const text = std.mem.trim(u8, result.text orelse return null, &std.ascii.whitespace); + if (text.len == 0) return null; + return dest.dupe(u8, text) catch null; +} + +/// Step 3 of `/save`: hand the model the builtin catalog, the full conversation, +/// the deterministic record of what ran, and the required output schema, then +/// write the idiomatic script it returns. 
+fn synthesizeScript(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u8, prompt: ?[]const u8, schema: ?[]const u8) void { const provider_client = self.ai_client.?; const resolved = self.resolveSavePathAndMode(arena, filename) orelse return; @@ -972,10 +1110,41 @@ fn synthesizeSave(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u8, const ma = self.conversation.arena.allocator(); const baseline = self.conversation.messages.items.len; - const user_msg = self.buildSaveSynthesisMessage(ma, prompt) catch return self.failSave("out of memory"); + // When the session captured extract data, let the model test candidates on + // it via `run_script`; otherwise fall back to a single no-tools synthesis. + var verify: Verify = .{ .runtime = undefined }; + var run_tools: [1]ProviderTool = undefined; + const verifying = blk: { + // Gate on a captured extract: it means the session loaded the page and + // left it in a state worth verifying against (and gives a prompt sample). + if (self.last_extract_json == null) break :blk false; + run_tools[0] = browser_tools.runScriptToolDef(ma) catch break :blk false; + const runtime = ScriptRuntime.init(self.allocator, self.browser.app, self.session, &self.node_registry) catch break :blk false; + verify.runtime = runtime; + self.active_verify = &verify; + self.script_runtime_mutex.lock(); + self.active_script_runtime = runtime; + self.script_runtime_mutex.unlock(); + break :blk true; + }; + defer if (verifying) { + self.script_runtime_mutex.lock(); + self.active_script_runtime = null; + self.script_runtime_mutex.unlock(); + self.active_verify = null; + verify.runtime.cancelTerminate(); + verify.runtime.deinit(); + }; + + const sample: ?[]const u8 = if (verifying) blk: { + const d = self.last_extract_json.?; + break :blk d[0..@min(d.len, save_sample_cap)]; + } else null; + const user_msg = self.buildSaveSynthesisMessage(ma, prompt, schema, sample) catch return self.failSave("out of memory"); 
self.conversation.messages.append(self.allocator, .{ .role = .user, .content = user_msg }) catch return self.failSave("out of memory"); self.terminal.spinner.start(); + self.terminal.spinner.setStatus(if (verifying) "writing and testing the script" else "writing the script"); var result = provider_client.runTools( self.model, &self.conversation.messages, @@ -983,10 +1152,10 @@ fn synthesizeSave(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u8, ma, .{ .context = @ptrCast(self), .callFn = handleToolCall }, .{ - .tools = &.{}, - .max_turns = 1, + .tools = if (verifying) run_tools[0..1] else &.{}, + .max_turns = if (verifying) 6 else 1, .max_tokens = 8192, - .tool_choice = .none, + .tool_choice = if (verifying) .auto else .none, .effort = .medium, .cancel = .{ .context = @ptrCast(self), .checkFn = checkCancel }, }, @@ -1008,12 +1177,23 @@ fn synthesizeSave(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u8, return; } - const raw = result.text orelse return self.abortSave(baseline, "the model returned no script"); + // Prefer the last candidate that ran cleanly — it's verified, pure JS, with + // none of the commentary the model sometimes wraps its final message in. Fall + // back to the final text only when nothing ran (no extract data, or it never + // called run_script). + const raw: []const u8 = blk: { + if (verifying) { + if (verify.last_source) |s| break :blk s; + } + if (result.text) |t| { + if (std.mem.trim(u8, t, &std.ascii.whitespace).len > 0) break :blk t; + } + return self.abortSave(baseline, "the model returned no script"); + }; - // `result.text` lives in the conversation arena, freed by the rollback - // below; copy into the command arena first (scrubbing may return its input - // as-is). 
- const owned = arena.dupe(u8, stripCodeFence(raw)) catch return self.abortSave(baseline, "out of memory"); + // `raw` lives in the conversation arena, freed by the rollback below; copy + // into the command arena first (scrubbing may return its input as-is). + const owned = arena.dupe(u8, string.stripCodeFence(raw)) catch return self.abortSave(baseline, "out of memory"); const script = browser_tools.reverseSubstituteEnvVars(arena, owned) catch return self.abortSave(baseline, "out of memory"); // The save turn is a meta-action; keep it out of the ongoing conversation. @@ -1025,10 +1205,53 @@ fn synthesizeSave(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u8, }; self.rememberSavePath(path); - self.save_buffer.reset(); + self.resetSaveBuffers(); self.terminal.printInfo("Saved synthesized script to {s}", .{path}); } +/// `run_script` tool handler: execute `source` on the dry-run runtime and hand +/// the model back the completion value (or the error), so it can judge and fix +/// its own script against real data. +fn runScriptTool(self: *Agent, allocator: std.mem.Allocator, arguments: ?std.json.Value) zenai.provider.Client.ToolHandler.Result { + const verify = self.active_verify.?; + const args = browser_tools.parseArgsOrDefault(struct { source: []const u8 = "" }, allocator, arguments) catch + return .{ .content = "invalid run_script arguments", .is_error = true }; + const source = args.source; + if (source.len == 0) return .{ .content = "run_script requires a non-empty \"source\" string", .is_error = true }; + + // Start each candidate from a blank page, exactly like a standalone replay — + // so a script that forgets to goto(...) fails here instead of silently relying + // on the page the session left loaded. 
+ if (self.session.hasPage()) self.session.removePage(); + + const outcome = verify.runtime.runSourceCapture(source, "candidate.js") catch + return .{ .content = "out of memory running candidate", .is_error = true }; + if (outcome.err) |e| { + self.terminal.agentVerifyRun(oneLinePreview(allocator, e, 120), false); + return .{ .content = std.fmt.allocPrint(allocator, "Script threw: {s}", .{e}) catch "Script threw an error", .is_error = true }; + } + + // Keep the last source that ran cleanly — it's the verified, prose-free + // artifact `synthesizeScript` saves, instead of the model's final message + // (which may wrap the script in commentary). + verify.last_source = self.conversation.arena.allocator().dupe(u8, source) catch source; + + const body = if (outcome.output.len == 0) "(completion value is empty/undefined)" else outcome.output; + self.terminal.agentVerifyRun(oneLinePreview(allocator, body, 120), true); + const content = std.fmt.allocPrint(allocator, "Completion value:\n{s}", .{body}) catch body; + return .{ .content = string.truncateWithMarker(allocator, content, tool_output_max_bytes), .is_error = false }; +} + +/// Collapse `text` to a single trimmed line capped at `max` cells (with an +/// ellipsis when cut) — a compact preview for the verify-run trace bullet. +fn oneLinePreview(arena: std.mem.Allocator, text: []const u8, max: usize) []const u8 { + const trimmed = std.mem.trim(u8, text, &std.ascii.whitespace); + const first = trimmed[0 .. std.mem.indexOfScalar(u8, trimmed, '\n') orelse trimmed.len]; + if (first.len <= max) return first; + const cut = string.truncateUtf8(first, max); + return std.fmt.allocPrint(arena, "{s}…", .{cut}) catch cut; +} + /// Persist `path` as the destination reused by a subsequent bare `/save`. 
fn rememberSavePath(self: *Agent, path: []const u8) void { if (self.save_path) |old| { @@ -1039,17 +1262,27 @@ fn rememberSavePath(self: *Agent, path: []const u8) void { self.save_path = dup; } -fn buildSaveSynthesisMessage(self: *Agent, arena: std.mem.Allocator, prompt: ?[]const u8) ![]const u8 { +fn buildSaveSynthesisMessage(self: *Agent, arena: std.mem.Allocator, prompt: ?[]const u8, schema: ?[]const u8, sample: ?[]const u8) ![]const u8 { var out: std.Io.Writer.Allocating = .init(arena); const w = &out.writer; try w.writeAll(browser_tools.save_synthesis_prompt); - try w.writeAll("\n\nBuiltin functions to prefer (call them as JS functions):\n"); + try w.writeAll("\n\nBuiltin functions (call them as JS functions). extract is the main way to read data — use it for every value you need; the rest navigate or act on the page:\n"); try renderBuiltinCatalog(w); const recorded = self.save_buffer.bytes(); if (recorded.len > 0) { try w.writeAll("\nCommands and JS that actually ran this session:\n"); try w.writeAll(recorded); } + if (schema) |s| { + try w.writeAll("\nThe completion value must match this output schema (types are examples):\n"); + try w.writeAll(s); + } + if (sample) |data| { + try w.writeAll("\nWhat a recorded extract returned this session, for reference:\n"); + try w.writeAll(data); + try w.writeAll("\n\n"); + try w.writeAll(save_verify_addendum); + } if (prompt) |p| { try w.writeAll("\nThe user's instruction for this script:\n"); try w.writeAll(p); @@ -1062,26 +1295,26 @@ fn buildSaveSynthesisMessage(self: *Agent, arena: std.mem.Allocator, prompt: ?[] /// dialect (e.g. `extract`'s schema format) without the tool schemas a no-tools /// synthesis turn omits. fn renderBuiltinCatalog(w: *std.Io.Writer) !void { + // The primary builtins first; `evaluate` is held back and framed as a last + // resort below, so it isn't presented as a peer way to read data. 
for (Schema.all()) |s| { - if (!s.tool.isRecorded()) continue; - try w.print("\n{s}(", .{s.tool_name}); - for (s.required, 0..) |req, i| { - if (i != 0) try w.writeAll(", "); - try w.writeAll(req); - } - try w.print("):\n{s}\n", .{s.description}); + if (!s.tool.isRecorded() or s.tool == .evaluate) continue; + try renderBuiltinEntry(w, s); + } + for (Schema.all()) |s| { + if (s.tool != .evaluate) continue; + try w.writeAll("\nEscape hatch for advanced page interaction or page-side logic no builtin above can express — not for reading data extract can read:\n"); + try renderBuiltinEntry(w, s); } } -/// Strip a surrounding ```` ```lang … ``` ```` markdown fence if the model -/// wrapped its output in one despite being told not to. -fn stripCodeFence(text: []const u8) []const u8 { - const t = std.mem.trim(u8, text, &std.ascii.whitespace); - if (!std.mem.startsWith(u8, t, "```")) return t; - const first_nl = std.mem.indexOfScalar(u8, t, '\n') orelse return t; - const body = t[first_nl + 1 ..]; - const close = std.mem.lastIndexOf(u8, body, "```") orelse return std.mem.trim(u8, body, &std.ascii.whitespace); - return std.mem.trim(u8, body[0..close], &std.ascii.whitespace); +fn renderBuiltinEntry(w: *std.Io.Writer, s: Schema) !void { + try w.print("\n{s}(", .{s.tool_name}); + for (s.required, 0..) |req, i| { + if (i != 0) try w.writeAll(", "); + try w.writeAll(req); + } + try w.print("):\n{s}\n", .{s.description}); } fn logSaveBufferError(self: *Agent, err: anyerror) void { @@ -1309,9 +1542,9 @@ fn recordSlashToolCall( .arguments = if (args) |v| try zenai.json.dupeValue(ma, v) else null, }; - // capToolOutput returns its input unchanged under the cap; dupe so content - // doesn't alias the caller's per-iteration arena. - const capped = capToolOutput(ma, result.text); + // truncateWithMarker returns its input unchanged under the cap; dupe so + // content doesn't alias the caller's per-iteration arena. 
+ const capped = string.truncateWithMarker(ma, result.text, tool_output_max_bytes); const content = if (capped.ptr == result.text.ptr) try ma.dupe(u8, capped) else capped; const tool_results = try ma.alloc(zenai.provider.ToolResult, 1); @@ -1415,6 +1648,13 @@ fn processUserMessage(self: *Agent, input: TurnInput) !?[]const u8 { if (!tc.is_error and t == .extract) last_extract_idx = i; } + // Keep the latest extract's real result so `/save` can ground and + // verify its synthesized post-processing against actual data. + if (last_extract_idx) |idx| { + _ = self.last_extract_arena.reset(.retain_capacity); + self.last_extract_json = self.last_extract_arena.allocator().dupe(u8, result.tool_calls_made[idx].result) catch null; + } + var recorded_any = false; for (result.tool_calls_made, 0..) |tc, i| { if (tc.is_error) continue; @@ -1546,19 +1786,14 @@ fn buildUserMessageParts( // the next request body) without bound. const tool_output_max_bytes: usize = 1 * 1024 * 1024; -fn capToolOutput(allocator: std.mem.Allocator, output: []const u8) []const u8 { - if (output.len <= tool_output_max_bytes) return output; - const prefix = string.truncateUtf8(output, tool_output_max_bytes); - var suffix_buf: [64]u8 = undefined; - const suffix = std.fmt.bufPrint(&suffix_buf, "\n...[truncated, original {d} bytes]", .{output.len}) catch return prefix; - return std.mem.concat(allocator, u8, &.{ prefix, suffix }) catch prefix; -} - fn handleToolCall(ctx: *anyopaque, allocator: std.mem.Allocator, tool_name: []const u8, arguments: ?std.json.Value) zenai.provider.Client.ToolHandler.Result { const self: *Agent = @ptrCast(@alignCast(ctx)); + // `run_script`'s only arg is the whole candidate script — too long and noisy + // to render, so suppress it and let the label/phase carry the context. 
+ const is_run_script = self.active_verify != null and std.mem.eql(u8, tool_name, browser_tools.run_script_tool_name); // The spinner doesn't render args, and `agentToolDone` skips the body line // at low verbosity — don't pay for the stringify when nobody reads it. - const needs_args = self.terminal.spinner.isEnabled() or self.terminal.verbosity != .low; + const needs_args = !is_run_script and (self.terminal.spinner.isEnabled() or self.terminal.verbosity != .low); // Stringify the pre-substitution args so $LP_* placeholders the model // emitted stay redacted in the UI. const args_str: []const u8 = if (needs_args) (if (arguments) |v| @@ -1568,12 +1803,15 @@ fn handleToolCall(ctx: *anyopaque, allocator: std.mem.Allocator, tool_name: []co self.terminal.spinner.setTool(tool_name, args_str); defer self.terminal.spinner.setThinking(); - const outcome: zenai.provider.Client.ToolHandler.Result = if (browser_tools.call(allocator, self.session, &self.node_registry, tool_name, arguments)) |result| - .{ .content = capToolOutput(allocator, result.text), .is_error = result.is_error } + const outcome: zenai.provider.Client.ToolHandler.Result = if (is_run_script) + self.runScriptTool(allocator, arguments) + else if (browser_tools.call(allocator, self.session, &self.node_registry, tool_name, arguments)) |result| + .{ .content = string.truncateWithMarker(allocator, result.text, tool_output_max_bytes), .is_error = result.is_error } else |err| .{ .content = std.fmt.allocPrint(allocator, "Error: {s}", .{@errorName(err)}) catch "Error: tool execution failed", .is_error = true }; - self.terminal.agentToolDone(tool_name, args_str, !outcome.is_error); + // run_script emits its own always-visible trace inside `runScriptTool`. 
+ if (!is_run_script) self.terminal.agentToolDone(tool_name, args_str, !outcome.is_error); if (self.terminal.verbosity == .high) self.terminal.printToolOutcome(tool_name, outcome.content, outcome.is_error); return outcome; } @@ -1644,35 +1882,6 @@ fn completionModels(context: *anyopaque, _: std.mem.Allocator) []const []const u return ids; } -test "capToolOutput: passes through when under cap" { - const ta = std.testing.allocator; - const out = capToolOutput(ta, "short"); - try std.testing.expectEqualStrings("short", out); -} - -// Boundary correctness lives in string.zig's `truncateUtf8` tests; here we only -// assert the agent-specific policy: an over-cap body keeps valid UTF-8 and gains -// the truncation marker. -test "capToolOutput: appends a marker when truncating" { - const ta = std.testing.allocator; - - // 3-byte Hangul codepoint (U+D55C '한' = 0xED 0x95 0x9C) straddling the cap. - const cap = tool_output_max_bytes; - const buf = try ta.alloc(u8, cap + 8); - defer ta.free(buf); - @memset(buf[0 .. 
cap - 1], 'a'); - buf[cap - 1] = 0xED; - buf[cap + 0] = 0x95; - buf[cap + 1] = 0x9C; - @memset(buf[cap + 2 ..], 'b'); - - const out = capToolOutput(ta, buf); - defer if (out.ptr != buf.ptr) ta.free(out); - - try std.testing.expect(std.unicode.utf8ValidateSlice(out)); - try std.testing.expect(std.mem.indexOf(u8, out, "truncated") != null); -} - test "parseSaveCommand: filename only" { const r = try parseSaveCommand("out.js"); try std.testing.expectEqualStrings("out.js", r.filename.?); @@ -1721,8 +1930,3 @@ test "renderBuiltinCatalog: lists recorded tools, omits read-only ones" { try std.testing.expect(std.mem.indexOf(u8, text, "tree(") == null); try std.testing.expect(std.mem.indexOf(u8, text, "markdown(") == null); } - -test "stripCodeFence: unwraps a fenced block and passes plain text through" { - try std.testing.expectEqualStrings("goto(\"x\");", stripCodeFence("```js\ngoto(\"x\");\n```")); - try std.testing.expectEqualStrings("goto(\"x\");", stripCodeFence("goto(\"x\");")); -} diff --git a/src/agent/Spinner.zig b/src/agent/Spinner.zig index 36d91381..6e119b15 100644 --- a/src/agent/Spinner.zig +++ b/src/agent/Spinner.zig @@ -71,6 +71,12 @@ cv: std.Thread.Condition = .{}, state: State = .idle, frame: u8 = 0, +/// Custom label for the thinking state — a phase like "writing the script". +/// Empty falls back to "thinking". Set via `setStatus`, cleared when the turn +/// ends so the next turn starts plain. 
+status_buf: [48]u8 = undefined, +status_len: usize = 0, + tool_calls: u32 = 0, turn_started_ns: i128 = 0, @@ -143,6 +149,7 @@ pub fn stop(self: *Spinner) void { _ = std.posix.write(std.posix.STDERR_FILENO, summary) catch {}; self.state = .idle; + self.status_len = 0; self.last_render_len = 0; } @@ -155,6 +162,7 @@ pub fn cancel(self: *Spinner) void { if (self.state == .idle) return; _ = std.posix.write(std.posix.STDERR_FILENO, "\r" ++ clear_eol) catch {}; self.state = .idle; + self.status_len = 0; self.last_render_len = 0; } @@ -188,6 +196,22 @@ pub fn setTool(self: *Spinner, name: []const u8, args: []const u8) void { self.cv.signal(); } +/// Label the thinking indicator with the current phase (e.g. "writing the +/// script"). Stored even while a tool label is up or before `start()`, so it +/// shows the moment the indicator next renders thinking. Cleared at turn end. +pub fn setStatus(self: *Spinner, text: []const u8) void { + if (!self.isEnabled()) return; + self.mu.lock(); + defer self.mu.unlock(); + const t = truncateUtf8(text, self.status_buf.len); + @memcpy(self.status_buf[0..t.len], t); + self.status_len = t.len; + if (self.state == .thinking) { + self.renderLocked(); + self.cv.signal(); + } +} + /// Request a transition back to the cycling "thinking" state. The worker /// honors `min_tool_display_ns`: if the current tool label has not been up /// long enough, the flip is deferred until it has. 
@@ -253,11 +277,14 @@ fn renderLocked(self: *Spinner) void { const glyph = braille[self.frame % braille.len]; const written = switch (self.state) { .idle => return, - .thinking => std.fmt.bufPrint( - &buf, - "\r" ++ ansi.yellow ++ "{s}" ++ ansi.reset ++ " " ++ ansi.dim ++ "[agent: thinking]" ++ ansi.reset ++ clear_eol, - .{glyph}, - ) catch return, + .thinking => blk: { + const label = if (self.status_len > 0) self.status_buf[0..self.status_len] else "thinking"; + break :blk std.fmt.bufPrint( + &buf, + "\r" ++ ansi.yellow ++ "{s}" ++ ansi.reset ++ " " ++ ansi.dim ++ "[agent: {s}]" ++ ansi.reset ++ clear_eol, + .{ glyph, label }, + ) catch return; + }, .tool => |tool| blk: { const prefix: []const u8 = if (tool.manual) "" else "agent: "; const name = tool.name_buf[0..tool.name_len]; diff --git a/src/agent/Terminal.zig b/src/agent/Terminal.zig index c7341412..40445bb6 100644 --- a/src/agent/Terminal.zig +++ b/src/agent/Terminal.zig @@ -212,9 +212,18 @@ pub fn endTool(self: *Terminal) void { /// text via the bullet character. pub fn agentToolDone(self: *Terminal, name: []const u8, args: []const u8, ok: bool) void { if (!self.verbosity.atLeast(.medium)) return; - const spinner_on = self.spinner.isEnabled(); + self.emitToolBullet(name, args, ok); +} - if (spinner_on) { +/// Trace one `/save` candidate run. Unlike `agentToolDone` this is shown even at +/// the REPL's default `.low` verbosity: the verify loop is an infrequent, +/// user-initiated step the user needs to watch happen. 
+pub fn agentVerifyRun(self: *Terminal, summary: []const u8, ok: bool) void { + self.emitToolBullet("run_script", summary, ok); +} + +fn emitToolBullet(self: *Terminal, name: []const u8, args: []const u8, ok: bool) void { + if (self.spinner.isEnabled()) { const a = if (self.repl_arena) |*ra| ra else return; defer _ = a.reset(.retain_capacity); const bytes = formatBulletLine(a.allocator(), name, args, ok) catch return; diff --git a/src/browser/tools.zig b/src/browser/tools.zig index 11e358d2..b5420206 100644 --- a/src/browser/tools.zig +++ b/src/browser/tools.zig @@ -144,37 +144,74 @@ pub const driver_guidance = /// Shared: the agent's `/save` feeds it to its own LLM; the MCP `save` tool /// hands it to the driving client as the tool description. pub const save_synthesis_prompt = - \\Write a single Lightpanda agent script (.js) that reproduces what the user - \\set out to do this session. Infer the goal from the whole conversation and - \\keep only the steps a clean, repeatable script needs — drop failed attempts, - \\retries, exploratory reads (tree/markdown/extract probes), and corrections. - \\Pick the right layer for each step: - \\- builtins (goto, click, fill, extract, …) for actions and for reading data; - \\ extract is how you pull structured data out of a page. - \\- plain top-level JavaScript for logic — loops, cross-page aggregation, - \\ filtering, string building. It runs in the script, not the page. - \\- evaluate(...) only for page-side JavaScript no builtin can express. It is - \\ an escape hatch, not a default, and cannot see the script's variables — - \\ interpolate any value into its string. - \\Stay faithful to the recorded calls: same options each one actually used. - \\Do NOT add a `timeout` to goto (or any tool) unless the session - \\did. Never round-trip a result through `lp.*`, and never append no-op - \\extract(...) probes or `evaluate("return lp....")` tails to surface output. 
- \\The completion value — the last top-level expression — prints automatically - \\(objects and arrays as JSON), so end with the bare result expression: a final - \\`extract({...});`, or `results;` after an aggregation loop. No console.log, - \\JSON.stringify, or `return` (illegal at top level) needed. - \\Write modern, readable JavaScript: `for (const x of xs)`, `const`/`let` over - \\`var`, template literals, destructuring, 2-space indent (including multi-line - \\extract({...}) schemas). - \\The script runs as a classic script, so top-level `await` is a syntax error. - \\The builtins are synchronous — each returns its result directly, so never - \\wrap them in async/await, .then, or Promises (`const data = extract(...)`, - \\not `await extract(...)`). evaluate(...) may run async JS in the page, but - \\the call itself returns synchronously. - \\Output ONLY JavaScript source — no markdown fences, no commentary. + \\Write a single Lightpanda agent script (.js) that reproduces what the user set + \\out to do this session. Keep only the steps a clean, repeatable script needs — + \\drop failed attempts, retries, and exploratory probes. + \\Use the builtins for actions and data — extract is the main way to read data — + \\and plain top-level JavaScript for logic (loops, aggregation, filtering, string + \\work). Reserve evaluate(...) for advanced page interaction or page-side logic no + \\builtin can express; it can't see the script's variables, so interpolate them into + \\its string. Stay faithful to the recorded calls and their options (e.g. don't add + \\a timeout the session didn't use). + \\The last top-level expression prints automatically (objects/arrays as JSON), so + \\end with the bare result — a final extract({...}); or results; after a loop. No + \\console.log, JSON.stringify, or return. + \\Top-level await is a syntax error and the builtins are synchronous, so never await + \\them (const data = extract(...), not await extract(...)). 
+ \\If an output schema is given below, the completion value MUST match it — parse or + \\split the extracted text as needed. + \\Write modern, readable JavaScript (const/let, template literals, destructuring, + \\2-space indent). Output ONLY JavaScript — no markdown fences, no commentary. ; +/// Agent `/save`, step 1: distill the session into a one-sentence intent that +/// feeds `save_schema_prompt`. Appended to the full conversation, so the model +/// reads the turns; an explicit anchor (a `--task` or `/save` prompt) is added +/// by the caller when present. +pub const save_intent_prompt = + \\In one sentence, state what the user set out to accomplish this session: + \\the goal and the data or outcome they wanted — not the individual steps, + \\tools, selectors, or page structure. Phrase it as a task description, e.g. + \\"Go to HackerNews and retrieve the top 10 stories with their last 3 + \\comments (author and text)". Output ONLY that one sentence. +; + +/// Agent `/save`, step 2: turn the step-1 intent into a logical output schema. +/// Given ONLY the intent — deliberately blind to the page and how data was +/// fetched — so the resulting shape is stable across runs of the same session. +pub const save_schema_prompt = + \\Generate the JSON output schema describing the following intent. Do not + \\focus on the intent context (the actual webpage structure or how to + \\retrieve the data), just the logical JSON schema example. Do not provide + \\actual data, just data types ("string", "number", "boolean"). + \\Example: the intent "Go to HackerNews and retrieve the top 10 stories with + \\their last 3 comments with author and text" produces: + \\{"results": [{"title": "string", "url": "string", "comments": [{"author": "string", "text": "string"}]}]} + \\Output ONLY the JSON schema, no markdown fences, no commentary. + \\ + \\Intent: +; + +/// Name of the agent `/save` verification tool — the model calls it to run a +/// candidate script before finalizing. 
The agent dispatches it by this name. +pub const run_script_tool_name = "run_script"; + +const run_script_tool_desc = + "Run your full candidate script for real, from a blank page, and return its completion value " ++ + "(or error) — exactly as it will run when saved and replayed. It must navigate itself with " ++ + "goto(...). Use it to verify navigation, extraction, and your transform before finalizing."; + +const run_script_params_json = + \\{"type":"object","properties":{"source":{"type":"string","description":"Full JavaScript source of the candidate script to execute."}},"required":["source"]} +; + +/// The `run_script` tool definition for an LLM `/save` synthesis turn. `arena` +/// backs the parsed parameter schema, so it must outlive the `runTools` call. +pub fn runScriptToolDef(arena: std.mem.Allocator) !zenai.provider.Tool { + const params = try std.json.parseFromSliceLeaky(std.json.Value, arena, run_script_params_json, .{}); + return .{ .name = run_script_tool_name, .description = run_script_tool_desc, .parameters = params }; +} + /// Reject paths that an untrusted MCP client could use to escape the /// working directory: empty paths, absolute paths, and any path with a /// `..` segment. Operator-controlled symlinks already inside CWD are out diff --git a/src/script/Runtime.zig b/src/script/Runtime.zig index 3d8841fc..74600430 100644 --- a/src/script/Runtime.zig +++ b/src/script/Runtime.zig @@ -95,6 +95,13 @@ pub const RunError = error{ OutOfMemory, }; +/// A captured script run: `err` is the formatted failure (null on success); +/// `output` is the completion value's display string (empty when void). +pub const RunOutcome = struct { + err: ?[]const u8 = null, + output: []const u8 = "", +}; + pub fn init( allocator: std.mem.Allocator, app: *lp.App, @@ -238,6 +245,23 @@ fn setObjectProperty( /// compile/runtime exception returns a formatted error allocated in this /// runtime's call arena and valid until deinit or the next run. 
pub fn runSource(self: *Runtime, source: []const u8, name: []const u8) RunError!?[]const u8 { + return (try self.runInner(source, name, false)).err; +} + +/// Like `runSource`, but capture the completion value's display string instead +/// of printing it (used by `/save` verification to feed output back to the LLM). +/// Both fields live in the call arena — valid until the next run or deinit. +/// +/// Each call gets a fresh global context so a prior candidate's top-level +/// `const`/`let` doesn't collide ("Identifier 'x' has already been declared") +/// — verification candidates are independent runs, unlike a `/load` script. +pub fn runSourceCapture(self: *Runtime, source: []const u8, name: []const u8) RunError!RunOutcome { + self.resetContext(); + self.createContext() catch return .{ .err = try self.dupeError("script context reset failed") }; + return self.runInner(source, name, true); +} + +fn runInner(self: *Runtime, source: []const u8, name: []const u8, capture: bool) RunError!RunOutcome { _ = self.call_arena.reset(.retain_capacity); var hs: lp.js.HandleScope = undefined; @@ -245,7 +269,7 @@ pub fn runSource(self: *Runtime, source: []const u8, name: []const u8) RunError! defer hs.deinit(); const context: *const v8.Context = @ptrCast(v8.v8__Global__Get(&self.context, self.env.isolate.handle) orelse - return try self.dupeError("agent script context is not available")); + return .{ .err = try self.dupeError("agent script context is not available") }); v8.v8__Context__Enter(context); defer v8.v8__Context__Exit(context); @@ -268,19 +292,27 @@ pub fn runSource(self: *Runtime, source: []const u8, name: []const u8) RunError! 
&compiler_source, v8.kNoCompileOptions, v8.kNoCacheNoReason, - ) orelse return try self.formatCaught(context, &try_catch, "compile failed"); + ) orelse return .{ .err = try self.formatCaught(context, &try_catch, "compile failed") }; const completion = v8.v8__Script__Run(script, context) orelse - return try self.formatCaught(context, &try_catch, "script failed"); + return .{ .err = try self.formatCaught(context, &try_catch, "script failed") }; // Explicit microtask policy: promise continuations only run once drained. self.env.performIsolateMicrotasks(); if (v8.v8__TryCatch__HasCaught(&try_catch)) { - return try self.formatCaught(context, &try_catch, "script failed"); + return .{ .err = try self.formatCaught(context, &try_catch, "script failed") }; } + if (capture) { + if (v8.v8__Value__IsUndefined(completion)) return .{}; + const output = self.displayString(self.call_arena.allocator(), context, completion) catch |err| switch (err) { + error.OutOfMemory => return error.OutOfMemory, + error.JsException => return .{ .output = "" }, + }; + return .{ .output = output }; + } self.printCompletion(context, completion); - return null; + return .{}; } /// Echo a script's completion value (its last-evaluated expression) so a script @@ -677,10 +709,10 @@ test "agent script runtime: goto and evaluate dispatch through browser tools" { defer testing.reset(); defer if (testing.test_session.hasPage()) testing.test_session.removePage(); - var registry = CDPNode.Registry.init(testing.allocator); + var registry: CDPNode.Registry = .init(testing.allocator); defer registry.deinit(); - const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry); + const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry); defer runtime.deinit(); try runTestScript(runtime, @@ -698,10 +730,10 @@ test "agent script runtime: extract returns a JavaScript object" { defer testing.reset(); defer if
(testing.test_session.hasPage()) testing.test_session.removePage(); - var registry = CDPNode.Registry.init(testing.allocator); + var registry: CDPNode.Registry = .init(testing.allocator); defer registry.deinit(); - const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry); + const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry); defer runtime.deinit(); try runTestScript(runtime, @@ -752,10 +784,10 @@ test "agent script runtime: extract tolerates list selectors that match nothing" defer testing.reset(); defer if (testing.test_session.hasPage()) testing.test_session.removePage(); - var registry = CDPNode.Registry.init(testing.allocator); + var registry: CDPNode.Registry = .init(testing.allocator); defer registry.deinit(); - const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry); + const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry); defer runtime.deinit(); try runTestScript(runtime, @@ -780,10 +812,10 @@ test "agent script runtime: strict-mode scripts can call primitives" { defer testing.reset(); defer if (testing.test_session.hasPage()) testing.test_session.removePage(); - var registry = CDPNode.Registry.init(testing.allocator); + var registry: CDPNode.Registry = .init(testing.allocator); defer registry.deinit(); - const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry); + const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry); defer runtime.deinit(); try runTestScript(runtime, @@ -798,10 +830,10 @@ test "agent script runtime: strict-mode scripts can call primitives" { test "agent script runtime: promise microtasks run to completion" { defer testing.reset(); - var registry = CDPNode.Registry.init(testing.allocator); + var registry: CDPNode.Registry = .init(testing.allocator); defer
registry.deinit(); - const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry); + const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry); defer runtime.deinit(); try runTestScript(runtime, @@ -819,10 +851,10 @@ test "agent script runtime: primitives re-entered from argument callbacks stay i defer testing.reset(); defer if (testing.test_session.hasPage()) testing.test_session.removePage(); - var registry = CDPNode.Registry.init(testing.allocator); + var registry: CDPNode.Registry = .init(testing.allocator); defer registry.deinit(); - const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry); + const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry); defer runtime.deinit(); try runTestScript(runtime, @@ -841,10 +873,10 @@ test "agent script runtime: terminate interrupts local JavaScript" { defer testing.reset(); defer if (testing.test_session.hasPage()) testing.test_session.removePage(); - var registry = CDPNode.Registry.init(testing.allocator); + var registry: CDPNode.Registry = .init(testing.allocator); defer registry.deinit(); - const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry); + const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry); defer runtime.deinit(); const thread = try std.Thread.spawn(.{}, terminateRuntimeSoon, .{runtime}); @@ -859,10 +891,10 @@ test "agent script runtime: agent variables persist and page globals are isolate defer testing.reset(); defer if (testing.test_session.hasPage()) testing.test_session.removePage(); - var registry = CDPNode.Registry.init(testing.allocator); + var registry: CDPNode.Registry = .init(testing.allocator); defer registry.deinit(); - const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry); + const runtime:
*Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry); defer runtime.deinit(); try runTestScript(runtime, @@ -884,10 +916,10 @@ test "agent script runtime: page evaluate cannot see agent primitives or binding defer testing.reset(); defer if (testing.test_session.hasPage()) testing.test_session.removePage(); - var registry = CDPNode.Registry.init(testing.allocator); + var registry: CDPNode.Registry = .init(testing.allocator); defer registry.deinit(); - const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry); + const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry); defer runtime.deinit(); try runTestScript(runtime, @@ -902,10 +934,10 @@ test "agent script runtime: page evaluate cannot see agent primitives or binding test "agent script runtime: console is available in agent context" { defer testing.reset(); - var registry = CDPNode.Registry.init(testing.allocator); + var registry: CDPNode.Registry = .init(testing.allocator); defer registry.deinit(); - const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry); + const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry); defer runtime.deinit(); try runTestScript(runtime, @@ -919,10 +951,10 @@ test "agent script runtime: tool errors throw and stop execution" { defer testing.reset(); defer if (testing.test_session.hasPage()) testing.test_session.removePage(); - var registry = CDPNode.Registry.init(testing.allocator); + var registry: CDPNode.Registry = .init(testing.allocator); defer registry.deinit(); - const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry); + const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry); defer runtime.deinit(); const message = (try runtime.runSource( @@ -945,10 +977,10 @@ test "agent script
runtime: builtin argument marshalling (positional + options)" defer testing.reset(); defer if (testing.test_session.hasPage()) testing.test_session.removePage(); - var registry = CDPNode.Registry.init(testing.allocator); + var registry: CDPNode.Registry = .init(testing.allocator); defer registry.deinit(); - const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry); + const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry); defer runtime.deinit(); try runTestScript(runtime, @@ -994,3 +1026,72 @@ test "agent script runtime: builtin argument marshalling (positional + options)" try testing.expect(std.mem.indexOf(u8, message, "invalid arguments") != null); } } + +test "agent script runtime: runSourceCapture runs the full script live and captures completion" { + defer testing.reset(); + defer if (testing.test_session.hasPage()) testing.test_session.removePage(); + + var registry: CDPNode.Registry = .init(testing.allocator); + defer registry.deinit(); + + const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry); + defer runtime.deinit(); + + // Verification runs the candidate exactly as a standalone replay does — it + // must navigate itself; extract then runs against the page it loaded.
+ const outcome = try runtime.runSourceCapture( + \\goto("http://localhost:9582/src/browser/tests/mcp_actions.html"); + \\click("#btn"); + \\const data = extract({ label: "#btn" }); + \\data.label; + , "candidate.js"); + try testing.expect(outcome.err == null); + try std.testing.expectEqualStrings("Click Me", outcome.output); +} + +test "agent script runtime: runSourceCapture surfaces a candidate's error" { + defer testing.reset(); + defer if (testing.test_session.hasPage()) testing.test_session.removePage(); + + var registry: CDPNode.Registry = .init(testing.allocator); + defer registry.deinit(); + + const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry); + defer runtime.deinit(); + + // extract returns an object; treating it as an array throws — the real + // `raw.map is not a function` failure the model must see and fix. + const bad = try runtime.runSourceCapture( + \\goto("http://localhost:9582/src/browser/tests/mcp_actions.html"); + \\const raw = extract({ items: [{ selector: "li" }] }); + \\raw.map(x => x); + , "candidate.js"); + try testing.expect(bad.err != null); +} + +test "agent script runtime: each capture run gets a fresh global scope" { + defer testing.reset(); + defer if (testing.test_session.hasPage()) testing.test_session.removePage(); + + var registry: CDPNode.Registry = .init(testing.allocator); + defer registry.deinit(); + + const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry); + defer runtime.deinit(); + + // The same top-level `const` in two consecutive candidates must not collide — + // each run starts from a clean context, not the previous run's globals.
+ const src = + \\goto("http://localhost:9582/src/browser/tests/mcp_actions.html"); + \\const data = extract({ label: "#btn" }); + \\const out = data.label; + \\out; + ; + const first = try runtime.runSourceCapture(src, "candidate.js"); + try testing.expect(first.err == null); + try std.testing.expectEqualStrings("Click Me", first.output); + + const second = try runtime.runSourceCapture(src, "candidate.js"); + try testing.expect(second.err == null); + try std.testing.expectEqualStrings("Click Me", second.output); +} diff --git a/src/string.zig b/src/string.zig index 26961eb0..b420793d 100644 --- a/src/string.zig +++ b/src/string.zig @@ -332,6 +332,29 @@ pub fn truncateUtf8(bytes: []const u8, max_bytes: usize) []const u8 { return bytes[0..i]; } +/// Truncate `text` to at most `max_bytes` on a UTF-8 boundary and, when it +/// overflows, append a marker noting the original length. Returns `text` +/// unchanged when it fits; otherwise allocates the marked result in `allocator` +/// (falling back to the bare prefix if that allocation fails). +pub fn truncateWithMarker(allocator: std.mem.Allocator, text: []const u8, max_bytes: usize) []const u8 { + if (text.len <= max_bytes) return text; + const prefix = truncateUtf8(text, max_bytes); + var suffix_buf: [64]u8 = undefined; + const suffix = std.fmt.bufPrint(&suffix_buf, "\n...[truncated, original {d} bytes]", .{text.len}) catch return prefix; + return std.mem.concat(allocator, u8, &.{ prefix, suffix }) catch prefix; +} + +/// Strip a surrounding ```lang … ``` markdown fence if the text is wrapped in +/// one, returning the inner block; passes already-bare text through unchanged. 
+pub fn stripCodeFence(text: []const u8) []const u8 { + const t = std.mem.trim(u8, text, &std.ascii.whitespace); + if (!std.mem.startsWith(u8, t, "```")) return t; + const first_nl = std.mem.indexOfScalar(u8, t, '\n') orelse return t; + const body = t[first_nl + 1 ..]; + const close = std.mem.lastIndexOf(u8, body, "```") orelse return std.mem.trim(u8, body, &std.ascii.whitespace); + return std.mem.trim(u8, body[0..close], &std.ascii.whitespace); +} + // Discriminatory type that signals the bridge to use arena instead of call_arena // Use this for strings that need to persist beyond the current call // The caller can unwrap and store just the underlying .str field @@ -378,6 +401,32 @@ test "truncateUtf8" { try testing.expectEqual("\xFFx", truncateUtf8("\xFFx", 2)); } +test "truncateWithMarker" { + const ta = std.testing.allocator; + try std.testing.expectEqualStrings("short", truncateWithMarker(ta, "short", 1024)); + + // Over-cap: a 3-byte Hangul codepoint (U+D55C, 0xED 0x95 0x9C) straddling the + // cap must stay valid UTF-8, and the marker must be appended. + const cap: usize = 1024; + const buf = try ta.alloc(u8, cap + 8); + defer ta.free(buf); + @memset(buf[0 .. cap - 1], 'a'); + buf[cap - 1] = 0xED; + buf[cap + 0] = 0x95; + buf[cap + 1] = 0x9C; + @memset(buf[cap + 2 ..], 'b'); + + const out = truncateWithMarker(ta, buf, cap); + defer if (out.ptr != buf.ptr) ta.free(out); + try std.testing.expect(std.unicode.utf8ValidateSlice(out)); + try std.testing.expect(std.mem.indexOf(u8, out, "truncated") != null); +} + +test "stripCodeFence" { + try std.testing.expectEqualStrings("goto(\"x\");", stripCodeFence("```js\ngoto(\"x\");\n```")); + try std.testing.expectEqualStrings("goto(\"x\");", stripCodeFence("goto(\"x\");")); +} + test "String" { const other_short = try String.init(undefined, "other_short", .{}); const other_long = try String.init(testing.allocator, "other_long" ** 100, .{});