agent: verify synthesized scripts during /save

Introduces a multi-step synthesis process for `/save` that derives a logical JSON output schema and uses a dry-run runtime to verify candidate scripts. The LLM can now run and self-correct its scripts using a new `run_script` tool before finalizing the save.
2026-08-01 10:16:25 -04:00 · 2026-06-09 16:49:50 +02:00
parent b23c6c27bc
commit b141da30ca
6 changed files with 574 additions and 147 deletions
--- a/src/agent/Agent.zig
+++ b/src/agent/Agent.zig
@@ -110,6 +110,14 @@ node_registry: CDPNode.Registry,
 terminal: Terminal,
 save_buffer: Recorder,
 save_path: ?[]u8,
+/// Backs `last_extract_json`; reset alongside `save_buffer`.
+last_extract_arena: std.heap.ArenaAllocator,
+/// The JSON the most recent successful `extract` returned this session — the
+/// real data `/save` grounds and verifies its synthesized script against.
+last_extract_json: ?[]const u8 = null,
+/// Set for the duration of an LLM `/save` so the `run_script` tool can reach
+/// the dry-run runtime it executes candidates on.
+active_verify: ?*Verify = null,
 script_runtime_mutex: std.Thread.Mutex = .{},
 active_script_runtime: ?*ScriptRuntime = null,
 conversation: Conversation,
@@ -254,6 +262,7 @@ pub fn init(allocator: std.mem.Allocator, app: *App, opts: Config.Agent) !*Agent
        .terminal = .init(allocator, history_paths, verbosity, will_repl),
        .save_buffer = .init(allocator),
        .save_path = null,
+        .last_extract_arena = .init(allocator),
        .conversation = .init(allocator, opts.system_prompt orelse default_system_prompt),
        .model = model,
        .effort = effort,
@@ -294,6 +303,7 @@ pub fn init(allocator: std.mem.Allocator, app: *App, opts: Config.Agent) !*Agent
 pub fn deinit(self: *Agent) void {
    self.terminal.uninstallLogSink();
    self.save_buffer.deinit();
+    self.last_extract_arena.deinit();
    if (self.save_path) |p| self.allocator.free(p);
    self.terminal.deinit();
    self.conversation.deinit();
@@ -630,11 +640,19 @@ fn handleUsage(self: *Agent) void {
 /// node IDs. Shared by `/clear` and `/reset`.
 fn clearConversation(self: *Agent) void {
    self.conversation.rollback(0);
-    self.save_buffer.reset();
+    self.resetSaveBuffers();
    self.total_usage = .{};
    self.node_registry.reset();
 }

+/// Drop everything `/save` accumulates: the recorded action buffer and the
+/// captured extract data that grounds synthesis.
+fn resetSaveBuffers(self: *Agent) void {
+    self.save_buffer.reset();
+    _ = self.last_extract_arena.reset(.retain_capacity);
+    self.last_extract_json = null;
+}
+
 /// Forget the conversation while leaving the browser session live — loaded page
 /// stays put, cookies/logins preserved.
 fn handleClear(self: *Agent) void {
@@ -862,7 +880,7 @@ fn handleSave(self: *Agent, arena: std.mem.Allocator, rest: []const u8) void {
        new_save_path = null;
    }
    const saved_lines = self.save_buffer.lines;
-    self.save_buffer.reset();
+    self.resetSaveBuffers();
    self.terminal.printInfo("Saved {d} line(s) to {s}", .{ saved_lines, self.save_path.? });
 }

@@ -958,10 +976,130 @@ fn abortSave(self: *Agent, baseline: usize, reason: []const u8) void {
    self.failSave(reason);
 }

-/// LLM-synthesized `/save`: hand the model the builtin catalog, the full
-/// conversation, and the deterministic record of what ran, then write the
-/// idiomatic script it returns.
+/// In-flight `/save` verification harness: the dry-run runtime the `run_script`
+/// tool executes candidates on, plus the last source it ran (a fallback script
+/// if the model finishes the loop without re-emitting it as text).
+const Verify = struct {
+    runtime: *ScriptRuntime,
+    last_source: ?[]const u8 = null,
+};
+
+/// Agent-only addendum (kept out of the shared `save_synthesis_prompt`) telling
+/// the model to derive every value at runtime and check the result with run_script.
+const save_verify_addendum =
+    \\Read data with the recorded extract(...), not evaluate() — extract can read a
+    \\card's whole text via an empty selector (""). Reshape its result in plain JS so the
+    \\completion value matches the schema exactly (same keys, parsed numbers); don't
+    \\return the raw extract or hard-code values.
+    \\Before finalizing, test with run_script: it runs your FULL script for real from a
+    \\blank page, so it must goto(...) first (missing goto → "no page loaded", a wrong
+    \\selector → null). Confirm every field is populated, then reply with ONLY the final
+    \\JavaScript source.
+;
+
+/// Byte cap on the captured extract sample shown in the synthesis prompt; keeps a
+/// large result from dominating context (candidates are run from a blank page).
+const save_sample_cap = 8 * 1024;
+
+/// LLM-synthesized `/save`. Pin the output shape first — derive the session's
+/// intent, then a typed output schema from it — so the script's result shape is
+/// stable across runs, then synthesize the script honoring that schema. Each
+/// step degrades gracefully: a null schema falls back to plain synthesis.
 fn synthesizeSave(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u8, prompt: ?[]const u8) void {
+    self.conversation.ensureSystemPrompt() catch return self.failSave("out of memory");
+    const baseline = self.conversation.messages.items.len;
+
+    const anchor = prompt orelse self.one_shot_task;
+    const schema = self.deriveOutputSchema(arena, baseline, anchor);
+    if (self.cancel_requested.load(.acquire)) {
+        self.resetAfterCancel(baseline);
+        return;
+    }
+
+    self.synthesizeScript(arena, filename, prompt, schema);
+}
+
+/// Steps 1–2 of `/save`: intent (over the session) → typed output schema. Both
+/// turns leave the conversation as they found it; returns null if either turn
+/// produced nothing usable (the caller then synthesizes without a schema).
+fn deriveOutputSchema(self: *Agent, arena: std.mem.Allocator, baseline: usize, anchor: ?[]const u8) ?[]const u8 {
+    const intent = self.deriveIntent(arena, baseline, anchor) orelse return null;
+    if (self.cancel_requested.load(.acquire)) return null;
+    return self.deriveSchema(arena, intent);
+}
+
+/// One-sentence intent from the session turns. Runs over the live conversation
+/// (so the model sees the session) but rolls back to `baseline`, keeping the
+/// turn out of history. An explicit anchor is folded in as authoritative.
+fn deriveIntent(self: *Agent, arena: std.mem.Allocator, baseline: usize, anchor: ?[]const u8) ?[]const u8 {
+    const ma = self.conversation.arena.allocator();
+    var out: std.Io.Writer.Allocating = .init(ma);
+    out.writer.writeAll(browser_tools.save_intent_prompt) catch return null;
+    if (anchor) |a| {
+        out.writer.print("\nThe user described the goal as: {s}\nTreat that as authoritative and reconcile it with the session.", .{a}) catch return null;
+    }
+    self.conversation.messages.append(self.allocator, .{ .role = .user, .content = out.written() }) catch return null;
+    defer self.conversation.rollback(baseline);
+    return self.runTextTurn(&self.conversation.messages, arena, self.allocator, ma, 512, "understanding the task");
+}
+
+/// Typed output schema from the intent. Runs over a throwaway message list —
+/// not the conversation — so the schema is derived from the logical intent
+/// alone, blind to the page structure and how the data was fetched.
+fn deriveSchema(self: *Agent, arena: std.mem.Allocator, intent: []const u8) ?[]const u8 {
+    var msgs: std.ArrayList(zenai.provider.Message) = .empty;
+    const msg = std.fmt.allocPrint(arena, "{s} {s}", .{ browser_tools.save_schema_prompt, intent }) catch return null;
+    msgs.append(arena, .{ .role = .user, .content = msg }) catch return null;
+    const raw = self.runTextTurn(&msgs, arena, arena, arena, 1024, "designing the output schema") orelse return null;
+    return string.stripCodeFence(raw);
+}
+
+/// Run a single no-tools text turn over `messages` and return the model's trimmed
+/// text duped into `dest` (so it survives any rollback of `messages`), or null on
+/// cancel, error, or empty output. Shared by the intent and schema steps; `status` labels the spinner and the error log.
+fn runTextTurn(
+    self: *Agent,
+    messages: *std.ArrayList(zenai.provider.Message),
+    dest: std.mem.Allocator,
+    list_alloc: std.mem.Allocator,
+    data_alloc: std.mem.Allocator,
+    max_tokens: i32,
+    status: []const u8,
+) ?[]const u8 {
+    self.terminal.spinner.start();
+    self.terminal.spinner.setStatus(status);
+    var result = self.ai_client.?.runTools(
+        self.model,
+        messages,
+        list_alloc,
+        data_alloc,
+        .{ .context = @ptrCast(self), .callFn = handleToolCall },
+        .{
+            .tools = &.{},
+            .max_turns = 1,
+            .max_tokens = max_tokens,
+            .tool_choice = .none,
+            .effort = .low,
+            .cancel = .{ .context = @ptrCast(self), .checkFn = checkCancel },
+        },
+    ) catch |err| {
+        self.terminal.spinner.cancel();
+        if (!self.cancel_requested.load(.acquire)) log.err(.app, "AI save text turn error", .{ .err = err, .step = status });
+        return null;
+    };
+    self.terminal.spinner.stop();
+    defer result.deinit();
+    self.total_usage.add(result.usage);
+    if (result.cancelled) return null;
+    const text = std.mem.trim(u8, result.text orelse return null, &std.ascii.whitespace);
+    if (text.len == 0) return null;
+    return dest.dupe(u8, text) catch null;
+}
+
+/// Step 3 of `/save`: hand the model the builtin catalog, the full conversation,
+/// the deterministic record of what ran, and the required output schema, then
+/// write the idiomatic script it returns.
+fn synthesizeScript(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u8, prompt: ?[]const u8, schema: ?[]const u8) void {
    const provider_client = self.ai_client.?;

    const resolved = self.resolveSavePathAndMode(arena, filename) orelse return;
@@ -972,10 +1110,41 @@ fn synthesizeSave(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u8,
    const ma = self.conversation.arena.allocator();
    const baseline = self.conversation.messages.items.len;

-    const user_msg = self.buildSaveSynthesisMessage(ma, prompt) catch return self.failSave("out of memory");
+    // When the session captured extract data, let the model test candidates on
+    // it via `run_script`; otherwise fall back to a single no-tools synthesis.
+    var verify: Verify = .{ .runtime = undefined };
+    var run_tools: [1]ProviderTool = undefined;
+    const verifying = blk: {
+        // Gate on a captured extract: it means the session loaded the page and
+        // left it in a state worth verifying against (and gives a prompt sample).
+        if (self.last_extract_json == null) break :blk false;
+        run_tools[0] = browser_tools.runScriptToolDef(ma) catch break :blk false;
+        const runtime = ScriptRuntime.init(self.allocator, self.browser.app, self.session, &self.node_registry) catch break :blk false;
+        verify.runtime = runtime;
+        self.active_verify = &verify;
+        self.script_runtime_mutex.lock();
+        self.active_script_runtime = runtime;
+        self.script_runtime_mutex.unlock();
+        break :blk true;
+    };
+    defer if (verifying) {
+        self.script_runtime_mutex.lock();
+        self.active_script_runtime = null;
+        self.script_runtime_mutex.unlock();
+        self.active_verify = null;
+        verify.runtime.cancelTerminate();
+        verify.runtime.deinit();
+    };
+
+    const sample: ?[]const u8 = if (verifying) blk: {
+        const d = self.last_extract_json.?;
+        break :blk string.truncateUtf8(d, save_sample_cap); // never cut mid-codepoint
+    } else null;
+    const user_msg = self.buildSaveSynthesisMessage(ma, prompt, schema, sample) catch return self.failSave("out of memory");
    self.conversation.messages.append(self.allocator, .{ .role = .user, .content = user_msg }) catch return self.failSave("out of memory");

    self.terminal.spinner.start();
+    self.terminal.spinner.setStatus(if (verifying) "writing and testing the script" else "writing the script");
    var result = provider_client.runTools(
        self.model,
        &self.conversation.messages,
@@ -983,10 +1152,10 @@ fn synthesizeSave(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u8,
        ma,
        .{ .context = @ptrCast(self), .callFn = handleToolCall },
        .{
-            .tools = &.{},
-            .max_turns = 1,
+            .tools = if (verifying) run_tools[0..1] else &.{},
+            .max_turns = if (verifying) 6 else 1,
            .max_tokens = 8192,
-            .tool_choice = .none,
+            .tool_choice = if (verifying) .auto else .none,
            .effort = .medium,
            .cancel = .{ .context = @ptrCast(self), .checkFn = checkCancel },
        },
@@ -1008,12 +1177,23 @@ fn synthesizeSave(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u8,
        return;
    }

-    const raw = result.text orelse return self.abortSave(baseline, "the model returned no script");
+    // Prefer the last candidate that ran cleanly — it's verified, pure JS, with
+    // none of the commentary the model sometimes wraps its final message in. Fall
+    // back to the final text only when nothing ran (no extract data, or it never
+    // called run_script).
+    const raw: []const u8 = blk: {
+        if (verifying) {
+            if (verify.last_source) |s| break :blk s;
+        }
+        if (result.text) |t| {
+            if (std.mem.trim(u8, t, &std.ascii.whitespace).len > 0) break :blk t;
+        }
+        return self.abortSave(baseline, "the model returned no script");
+    };

-    // `result.text` lives in the conversation arena, freed by the rollback
-    // below; copy into the command arena first (scrubbing may return its input
-    // as-is).
-    const owned = arena.dupe(u8, stripCodeFence(raw)) catch return self.abortSave(baseline, "out of memory");
+    // `raw` lives in the conversation arena, freed by the rollback below; copy
+    // into the command arena first (scrubbing may return its input as-is).
+    const owned = arena.dupe(u8, string.stripCodeFence(raw)) catch return self.abortSave(baseline, "out of memory");
    const script = browser_tools.reverseSubstituteEnvVars(arena, owned) catch return self.abortSave(baseline, "out of memory");

    // The save turn is a meta-action; keep it out of the ongoing conversation.
@@ -1025,10 +1205,53 @@ fn synthesizeSave(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u8,
    };

    self.rememberSavePath(path);
-    self.save_buffer.reset();
+    self.resetSaveBuffers();
    self.terminal.printInfo("Saved synthesized script to {s}", .{path});
 }

+/// `run_script` tool handler: execute `source` on the dry-run runtime and hand
+/// the model back the completion value (or the error), so it can judge and fix
+/// its own script against real data.
+fn runScriptTool(self: *Agent, allocator: std.mem.Allocator, arguments: ?std.json.Value) zenai.provider.Client.ToolHandler.Result {
+    const verify = self.active_verify.?;
+    const args = browser_tools.parseArgsOrDefault(struct { source: []const u8 = "" }, allocator, arguments) catch
+        return .{ .content = "invalid run_script arguments", .is_error = true };
+    const source = args.source;
+    if (source.len == 0) return .{ .content = "run_script requires a non-empty \"source\" string", .is_error = true };

+    // Start each candidate from a blank page, exactly like a standalone replay —
+    // so a script that forgets to goto(...) fails here instead of silently relying
+    // on the page the session left loaded.
+    if (self.session.hasPage()) self.session.removePage();

+    const outcome = verify.runtime.runSourceCapture(source, "candidate.js") catch
+        return .{ .content = "out of memory running candidate", .is_error = true };
+    if (outcome.err) |e| {
+        self.terminal.agentVerifyRun(oneLinePreview(allocator, e, 120), false);
+        return .{ .content = std.fmt.allocPrint(allocator, "Script threw: {s}", .{e}) catch "Script threw an error", .is_error = true };
+    }

+    // Keep the last source that ran cleanly — the verified, prose-free artifact
+    // `synthesizeScript` saves. On OOM drop it rather than alias `source`, whose
+    // lifetime past this call isn't guaranteed; the save then uses the final text.
+    verify.last_source = self.conversation.arena.allocator().dupe(u8, source) catch null;

+    const body = if (outcome.output.len == 0) "(completion value is empty/undefined)" else outcome.output;
+    self.terminal.agentVerifyRun(oneLinePreview(allocator, body, 120), true);
+    const content = std.fmt.allocPrint(allocator, "Completion value:\n{s}", .{body}) catch body;
+    return .{ .content = string.truncateWithMarker(allocator, content, tool_output_max_bytes), .is_error = false };
+}
+
+/// Collapse `text` to the first line of its trimmed form, capped at `max` bytes
+/// (an ellipsis is appended when cut) — a compact preview for the verify-run bullet.
+fn oneLinePreview(arena: std.mem.Allocator, text: []const u8, max: usize) []const u8 {
+    const trimmed = std.mem.trim(u8, text, &std.ascii.whitespace);
+    const first = trimmed[0 .. std.mem.indexOfScalar(u8, trimmed, '\n') orelse trimmed.len];
+    if (first.len <= max) return first;
+    const cut = string.truncateUtf8(first, max);
+    return std.fmt.allocPrint(arena, "{s}…", .{cut}) catch cut;
+}
+
 /// Persist `path` as the destination reused by a subsequent bare `/save`.
 fn rememberSavePath(self: *Agent, path: []const u8) void {
    if (self.save_path) |old| {
@@ -1039,17 +1262,27 @@ fn rememberSavePath(self: *Agent, path: []const u8) void {
    self.save_path = dup;
 }

-fn buildSaveSynthesisMessage(self: *Agent, arena: std.mem.Allocator, prompt: ?[]const u8) ![]const u8 {
+fn buildSaveSynthesisMessage(self: *Agent, arena: std.mem.Allocator, prompt: ?[]const u8, schema: ?[]const u8, sample: ?[]const u8) ![]const u8 {
    var out: std.Io.Writer.Allocating = .init(arena);
    const w = &out.writer;
    try w.writeAll(browser_tools.save_synthesis_prompt);
-    try w.writeAll("\n\nBuiltin functions to prefer (call them as JS functions):\n");
+    try w.writeAll("\n\nBuiltin functions (call them as JS functions). extract is the main way to read data — use it for every value you need; the rest navigate or act on the page:\n");
    try renderBuiltinCatalog(w);
    const recorded = self.save_buffer.bytes();
    if (recorded.len > 0) {
        try w.writeAll("\nCommands and JS that actually ran this session:\n");
        try w.writeAll(recorded);
    }
+    if (schema) |s| {
+        try w.writeAll("\nThe completion value must match this output schema (types are examples):\n");
+        try w.writeAll(s);
+    }
+    if (sample) |data| {
+        try w.writeAll("\nWhat a recorded extract returned this session, for reference:\n");
+        try w.writeAll(data);
+        try w.writeAll("\n\n");
+        try w.writeAll(save_verify_addendum);
+    }
    if (prompt) |p| {
        try w.writeAll("\nThe user's instruction for this script:\n");
        try w.writeAll(p);
@@ -1062,26 +1295,26 @@ fn buildSaveSynthesisMessage(self: *Agent, arena: std.mem.Allocator, prompt: ?[]
 /// dialect (e.g. `extract`'s schema format) without the tool schemas a no-tools
 /// synthesis turn omits.
 fn renderBuiltinCatalog(w: *std.Io.Writer) !void {
+    // The primary builtins first; `evaluate` is held back and framed as a last
+    // resort below, so it isn't presented as a peer way to read data.
    for (Schema.all()) |s| {
-        if (!s.tool.isRecorded()) continue;
-        try w.print("\n{s}(", .{s.tool_name});
-        for (s.required, 0..) |req, i| {
-            if (i != 0) try w.writeAll(", ");
-            try w.writeAll(req);
-        }
-        try w.print("):\n{s}\n", .{s.description});
+        if (!s.tool.isRecorded() or s.tool == .evaluate) continue;
+        try renderBuiltinEntry(w, s);
+    }
+    for (Schema.all()) |s| {
+        if (s.tool != .evaluate) continue;
+        try w.writeAll("\nEscape hatch for advanced page interaction or page-side logic no builtin above can express — not for reading data extract can read:\n");
+        try renderBuiltinEntry(w, s);
    }
 }

-/// Strip a surrounding ```` ```lang … ``` ```` markdown fence if the model
-/// wrapped its output in one despite being told not to.
-fn stripCodeFence(text: []const u8) []const u8 {
-    const t = std.mem.trim(u8, text, &std.ascii.whitespace);
-    if (!std.mem.startsWith(u8, t, "```")) return t;
-    const first_nl = std.mem.indexOfScalar(u8, t, '\n') orelse return t;
-    const body = t[first_nl + 1 ..];
-    const close = std.mem.lastIndexOf(u8, body, "```") orelse return std.mem.trim(u8, body, &std.ascii.whitespace);
-    return std.mem.trim(u8, body[0..close], &std.ascii.whitespace);
+fn renderBuiltinEntry(w: *std.Io.Writer, s: Schema) !void {
+    try w.print("\n{s}(", .{s.tool_name});
+    for (s.required, 0..) |req, i| {
+        if (i != 0) try w.writeAll(", ");
+        try w.writeAll(req);
+    }
+    try w.print("):\n{s}\n", .{s.description});
 }

 fn logSaveBufferError(self: *Agent, err: anyerror) void {
@@ -1309,9 +1542,9 @@ fn recordSlashToolCall(
        .arguments = if (args) |v| try zenai.json.dupeValue(ma, v) else null,
    };

-    // capToolOutput returns its input unchanged under the cap; dupe so content
-    // doesn't alias the caller's per-iteration arena.
-    const capped = capToolOutput(ma, result.text);
+    // truncateWithMarker returns its input unchanged under the cap; dupe so
+    // content doesn't alias the caller's per-iteration arena.
+    const capped = string.truncateWithMarker(ma, result.text, tool_output_max_bytes);
    const content = if (capped.ptr == result.text.ptr) try ma.dupe(u8, capped) else capped;

    const tool_results = try ma.alloc(zenai.provider.ToolResult, 1);
@@ -1415,6 +1648,13 @@ fn processUserMessage(self: *Agent, input: TurnInput) !?[]const u8 {
            if (!tc.is_error and t == .extract) last_extract_idx = i;
        }

+        // Keep the latest extract's real result so `/save` can ground and
+        // verify its synthesized post-processing against actual data.
+        if (last_extract_idx) |idx| {
+            _ = self.last_extract_arena.reset(.retain_capacity);
+            self.last_extract_json = self.last_extract_arena.allocator().dupe(u8, result.tool_calls_made[idx].result) catch null;
+        }
+
        var recorded_any = false;
        for (result.tool_calls_made, 0..) |tc, i| {
            if (tc.is_error) continue;
@@ -1546,19 +1786,14 @@ fn buildUserMessageParts(
 // the next request body) without bound.
 const tool_output_max_bytes: usize = 1 * 1024 * 1024;

-fn capToolOutput(allocator: std.mem.Allocator, output: []const u8) []const u8 {
-    if (output.len <= tool_output_max_bytes) return output;
-    const prefix = string.truncateUtf8(output, tool_output_max_bytes);
-    var suffix_buf: [64]u8 = undefined;
-    const suffix = std.fmt.bufPrint(&suffix_buf, "\n...[truncated, original {d} bytes]", .{output.len}) catch return prefix;
-    return std.mem.concat(allocator, u8, &.{ prefix, suffix }) catch prefix;
-}
-
 fn handleToolCall(ctx: *anyopaque, allocator: std.mem.Allocator, tool_name: []const u8, arguments: ?std.json.Value) zenai.provider.Client.ToolHandler.Result {
    const self: *Agent = @ptrCast(@alignCast(ctx));
+    // `run_script`'s only arg is the whole candidate script — too long and noisy
+    // to render, so suppress it and let the label/phase carry the context.
+    const is_run_script = self.active_verify != null and std.mem.eql(u8, tool_name, browser_tools.run_script_tool_name);
    // The spinner doesn't render args, and `agentToolDone` skips the body line
    // at low verbosity — don't pay for the stringify when nobody reads it.
-    const needs_args = self.terminal.spinner.isEnabled() or self.terminal.verbosity != .low;
+    const needs_args = !is_run_script and (self.terminal.spinner.isEnabled() or self.terminal.verbosity != .low);
    // Stringify the pre-substitution args so $LP_* placeholders the model
    // emitted stay redacted in the UI.
    const args_str: []const u8 = if (needs_args) (if (arguments) |v|
@@ -1568,12 +1803,15 @@ fn handleToolCall(ctx: *anyopaque, allocator: std.mem.Allocator, tool_name: []co
    self.terminal.spinner.setTool(tool_name, args_str);
    defer self.terminal.spinner.setThinking();

-    const outcome: zenai.provider.Client.ToolHandler.Result = if (browser_tools.call(allocator, self.session, &self.node_registry, tool_name, arguments)) |result|
-        .{ .content = capToolOutput(allocator, result.text), .is_error = result.is_error }
+    const outcome: zenai.provider.Client.ToolHandler.Result = if (is_run_script)
+        self.runScriptTool(allocator, arguments)
+    else if (browser_tools.call(allocator, self.session, &self.node_registry, tool_name, arguments)) |result|
+        .{ .content = string.truncateWithMarker(allocator, result.text, tool_output_max_bytes), .is_error = result.is_error }
    else |err|
        .{ .content = std.fmt.allocPrint(allocator, "Error: {s}", .{@errorName(err)}) catch "Error: tool execution failed", .is_error = true };

-    self.terminal.agentToolDone(tool_name, args_str, !outcome.is_error);
+    // run_script emits its own always-visible trace inside `runScriptTool`.
+    if (!is_run_script) self.terminal.agentToolDone(tool_name, args_str, !outcome.is_error);
    if (self.terminal.verbosity == .high) self.terminal.printToolOutcome(tool_name, outcome.content, outcome.is_error);
    return outcome;
 }
@@ -1644,35 +1882,6 @@ fn completionModels(context: *anyopaque, _: std.mem.Allocator) []const []const u
    return ids;
 }

-test "capToolOutput: passes through when under cap" {
-    const ta = std.testing.allocator;
-    const out = capToolOutput(ta, "short");
-    try std.testing.expectEqualStrings("short", out);
-}
-
-// Boundary correctness lives in string.zig's `truncateUtf8` tests; here we only
-// assert the agent-specific policy: an over-cap body keeps valid UTF-8 and gains
-// the truncation marker.
-test "capToolOutput: appends a marker when truncating" {
-    const ta = std.testing.allocator;
-
-    // 3-byte Hangul codepoint (U+D55C '한' = 0xED 0x95 0x9C) straddling the cap.
-    const cap = tool_output_max_bytes;
-    const buf = try ta.alloc(u8, cap + 8);
-    defer ta.free(buf);
-    @memset(buf[0 .. cap - 1], 'a');
-    buf[cap - 1] = 0xED;
-    buf[cap + 0] = 0x95;
-    buf[cap + 1] = 0x9C;
-    @memset(buf[cap + 2 ..], 'b');
-
-    const out = capToolOutput(ta, buf);
-    defer if (out.ptr != buf.ptr) ta.free(out);
-
-    try std.testing.expect(std.unicode.utf8ValidateSlice(out));
-    try std.testing.expect(std.mem.indexOf(u8, out, "truncated") != null);
-}
-
 test "parseSaveCommand: filename only" {
    const r = try parseSaveCommand("out.js");
    try std.testing.expectEqualStrings("out.js", r.filename.?);
@@ -1721,8 +1930,3 @@ test "renderBuiltinCatalog: lists recorded tools, omits read-only ones" {
    try std.testing.expect(std.mem.indexOf(u8, text, "tree(") == null);
    try std.testing.expect(std.mem.indexOf(u8, text, "markdown(") == null);
 }
-
-test "stripCodeFence: unwraps a fenced block and passes plain text through" {
-    try std.testing.expectEqualStrings("goto(\"x\");", stripCodeFence("```js\ngoto(\"x\");\n```"));
-    try std.testing.expectEqualStrings("goto(\"x\");", stripCodeFence("goto(\"x\");"));
-}
--- a/src/agent/Spinner.zig
+++ b/src/agent/Spinner.zig
@@ -71,6 +71,12 @@ cv: std.Thread.Condition = .{},
 state: State = .idle,
 frame: u8 = 0,

+/// Custom label for the thinking state — a phase like "writing the script".
+/// Empty falls back to "thinking". Set via `setStatus`, cleared when the turn
+/// ends so the next turn starts plain.
+status_buf: [48]u8 = undefined,
+status_len: usize = 0,
+
 tool_calls: u32 = 0,
 turn_started_ns: i128 = 0,

@@ -143,6 +149,7 @@ pub fn stop(self: *Spinner) void {
    _ = std.posix.write(std.posix.STDERR_FILENO, summary) catch {};

    self.state = .idle;
+    self.status_len = 0;
    self.last_render_len = 0;
 }

@@ -155,6 +162,7 @@ pub fn cancel(self: *Spinner) void {
    if (self.state == .idle) return;
    _ = std.posix.write(std.posix.STDERR_FILENO, "\r" ++ clear_eol) catch {};
    self.state = .idle;
+    self.status_len = 0;
    self.last_render_len = 0;
 }

@@ -188,6 +196,22 @@ pub fn setTool(self: *Spinner, name: []const u8, args: []const u8) void {
    self.cv.signal();
 }

+/// Label the thinking indicator with the current phase (e.g. "writing the
+/// script"). Stored even while a tool label is up or before `start()`, so it
+/// shows the moment the indicator next renders thinking. Cleared at turn end.
+pub fn setStatus(self: *Spinner, text: []const u8) void {
+    if (!self.isEnabled()) return;
+    self.mu.lock();
+    defer self.mu.unlock();
+    const t = truncateUtf8(text, self.status_buf.len);
+    @memcpy(self.status_buf[0..t.len], t);
+    self.status_len = t.len;
+    if (self.state == .thinking) {
+        self.renderLocked();
+        self.cv.signal();
+    }
+}
+
 /// Request a transition back to the cycling "thinking" state. The worker
 /// honors `min_tool_display_ns`: if the current tool label has not been up
 /// long enough, the flip is deferred until it has.
@@ -253,11 +277,14 @@ fn renderLocked(self: *Spinner) void {
    const glyph = braille[self.frame % braille.len];
    const written = switch (self.state) {
        .idle => return,
-        .thinking => std.fmt.bufPrint(
-            &buf,
-            "\r" ++ ansi.yellow ++ "{s}" ++ ansi.reset ++ " " ++ ansi.dim ++ "[agent: thinking]" ++ ansi.reset ++ clear_eol,
-            .{glyph},
-        ) catch return,
+        .thinking => blk: {
+            const label = if (self.status_len > 0) self.status_buf[0..self.status_len] else "thinking";
+            break :blk std.fmt.bufPrint(
+                &buf,
+                "\r" ++ ansi.yellow ++ "{s}" ++ ansi.reset ++ " " ++ ansi.dim ++ "[agent: {s}]" ++ ansi.reset ++ clear_eol,
+                .{ glyph, label },
+            ) catch return;
+        },
        .tool => |tool| blk: {
            const prefix: []const u8 = if (tool.manual) "" else "agent: ";
            const name = tool.name_buf[0..tool.name_len];
--- a/src/agent/Terminal.zig
+++ b/src/agent/Terminal.zig
@@ -212,9 +212,18 @@ pub fn endTool(self: *Terminal) void {
 /// text via the bullet character.
 pub fn agentToolDone(self: *Terminal, name: []const u8, args: []const u8, ok: bool) void {
    if (!self.verbosity.atLeast(.medium)) return;
-    const spinner_on = self.spinner.isEnabled();
+    self.emitToolBullet(name, args, ok);
+}

-    if (spinner_on) {
+/// Trace one `/save` candidate run. Unlike `agentToolDone` this is shown even at
+/// the REPL's default `.low` verbosity: the verify loop is an infrequent,
+/// user-initiated step the user needs to watch happen.
+pub fn agentVerifyRun(self: *Terminal, summary: []const u8, ok: bool) void {
+    self.emitToolBullet("run_script", summary, ok);
+}
+
+fn emitToolBullet(self: *Terminal, name: []const u8, args: []const u8, ok: bool) void {
+    if (self.spinner.isEnabled()) {
        const a = if (self.repl_arena) |*ra| ra else return;
        defer _ = a.reset(.retain_capacity);
        const bytes = formatBulletLine(a.allocator(), name, args, ok) catch return;
--- a/src/browser/tools.zig
+++ b/src/browser/tools.zig
@@ -144,37 +144,74 @@ pub const driver_guidance =
 /// Shared: the agent's `/save` feeds it to its own LLM; the MCP `save` tool
 /// hands it to the driving client as the tool description.
 pub const save_synthesis_prompt =
-    \\Write a single Lightpanda agent script (.js) that reproduces what the user
-    \\set out to do this session. Infer the goal from the whole conversation and
-    \\keep only the steps a clean, repeatable script needs — drop failed attempts,
-    \\retries, exploratory reads (tree/markdown/extract probes), and corrections.
-    \\Pick the right layer for each step:
-    \\- builtins (goto, click, fill, extract, …) for actions and for reading data;
-    \\  extract is how you pull structured data out of a page.
-    \\- plain top-level JavaScript for logic — loops, cross-page aggregation,
-    \\  filtering, string building. It runs in the script, not the page.
-    \\- evaluate(...) only for page-side JavaScript no builtin can express. It is
-    \\  an escape hatch, not a default, and cannot see the script's variables —
-    \\  interpolate any value into its string.
-    \\Stay faithful to the recorded calls: same options each one actually used.
-    \\Do NOT add a `timeout` to goto (or any tool) unless the session
-    \\did. Never round-trip a result through `lp.*`, and never append no-op
-    \\extract(...) probes or `evaluate("return lp....")` tails to surface output.
-    \\The completion value — the last top-level expression — prints automatically
-    \\(objects and arrays as JSON), so end with the bare result expression: a final
-    \\`extract({...});`, or `results;` after an aggregation loop. No console.log,
-    \\JSON.stringify, or `return` (illegal at top level) needed.
-    \\Write modern, readable JavaScript: `for (const x of xs)`, `const`/`let` over
-    \\`var`, template literals, destructuring, 2-space indent (including multi-line
-    \\extract({...}) schemas).
-    \\The script runs as a classic script, so top-level `await` is a syntax error.
-    \\The builtins are synchronous — each returns its result directly, so never
-    \\wrap them in async/await, .then, or Promises (`const data = extract(...)`,
-    \\not `await extract(...)`). evaluate(...) may run async JS in the page, but
-    \\the call itself returns synchronously.
-    \\Output ONLY JavaScript source — no markdown fences, no commentary.
+    \\Write a single Lightpanda agent script (.js) that reproduces what the user set
+    \\out to do this session. Keep only the steps a clean, repeatable script needs —
+    \\drop failed attempts, retries, and exploratory probes.
+    \\Use the builtins for actions and data — extract is the main way to read data —
+    \\and plain top-level JavaScript for logic (loops, aggregation, filtering, string
+    \\work). Reserve evaluate(...) for advanced page interaction or page-side logic no
+    \\builtin can express; it can't see the script's variables, so interpolate them into
+    \\its string. Stay faithful to the recorded calls and their options (e.g. don't add
+    \\a timeout the session didn't use).
+    \\The last top-level expression prints automatically (objects/arrays as JSON), so
+    \\end with the bare result — a final extract({...}); or results; after a loop. No
+    \\console.log, JSON.stringify, or return.
+    \\Top-level await is a syntax error and the builtins are synchronous, so never await
+    \\them (const data = extract(...), not await extract(...)).
+    \\If an output schema is given below, the completion value MUST match it — parse or
+    \\split the extracted text as needed.
+    \\Write modern, readable JavaScript (const/let, template literals, destructuring,
+    \\2-space indent). Output ONLY JavaScript — no markdown fences, no commentary.
 ;

+/// Agent `/save`, step 1: distill the session into a one-sentence intent that
+/// feeds `save_schema_prompt`. Appended to the full conversation, so the model
+/// reads the turns; an explicit anchor (a `--task` or `/save` prompt) is added
+/// by the caller when present.
+pub const save_intent_prompt =
+    \\In one sentence, state what the user set out to accomplish this session:
+    \\the goal and the data or outcome they wanted — not the individual steps,
+    \\tools, selectors, or page structure. Phrase it as a task description, e.g.
+    \\"Go to HackerNews and retrieve the top 10 stories with their last 3
+    \\comments (author and text)". Output ONLY that one sentence.
+;
+
+/// Agent `/save`, step 2: turn the step-1 intent into a logical output schema. Given ONLY
+/// the intent — deliberately blind to the page and how data was fetched — so the shape stays
+/// stable across runs. The text ends at `Intent:`; presumably the caller appends the sentence.
+pub const save_schema_prompt =
+    \\Generate the JSON output schema describing the following intent. Do not
+    \\focus on the intent context (the actual webpage structure or how to
+    \\retrieve the data), just the logical JSON schema example. Do not provide
+    \\actual data, just data types ("string", "number", "boolean").
+    \\Example: the intent "Go to HackerNews and retrieve the top 10 stories with
+    \\their last 3 comments with author and text" produces:
+    \\{"results": [{"title": "string", "url": "string", "comments": [{"author": "string", "text": "string"}]}]}
+    \\Output ONLY the JSON schema, no markdown fences, no commentary.
+    \\
+    \\Intent:
+;
+
+/// Name of the agent `/save` verification tool — the model calls it to run a
+/// candidate script before finalizing. The agent dispatches it by this name.
+pub const run_script_tool_name = "run_script";
+
+const run_script_tool_desc =
+    "Run your full candidate script for real, from a blank page, and return its completion value " ++
+    "(or error) — exactly as it will run when saved and replayed. It must navigate itself with " ++
+    "goto(...). Use it to verify navigation, extraction, and your transform before finalizing.";
+
+const run_script_params_json =
+    \\{"type":"object","properties":{"source":{"type":"string","description":"Full JavaScript source of the candidate script to execute."}},"required":["source"]}
+;
+
+/// Build the `run_script` tool for an LLM `/save` synthesis turn; parse errors propagate.
+/// `arena` backs the leakily parsed parameter schema, so it must outlive the `runTools` call.
+pub fn runScriptToolDef(arena: std.mem.Allocator) !zenai.provider.Tool {
+    const params = try std.json.parseFromSliceLeaky(std.json.Value, arena, run_script_params_json, .{});
+    return .{ .name = run_script_tool_name, .description = run_script_tool_desc, .parameters = params };
+}
+
 /// Reject paths that an untrusted MCP client could use to escape the
 /// working directory: empty paths, absolute paths, and any path with a
 /// `..` segment. Operator-controlled symlinks already inside CWD are out
--- a/src/script/Runtime.zig
+++ b/src/script/Runtime.zig
@@ -95,6 +95,13 @@ pub const RunError = error{
    OutOfMemory,
 };

+/// A captured script run: `err` is the formatted failure (null on success); `output` is the
+/// completion value's display string (empty if undefined, a placeholder if unserializable).
+pub const RunOutcome = struct {
+    err: ?[]const u8 = null, // null means the run succeeded
+    output: []const u8 = "", // stays empty for runs that do not capture
+};
+
 pub fn init(
    allocator: std.mem.Allocator,
    app: *lp.App,
@@ -238,6 +245,23 @@ fn setObjectProperty(
 /// compile/runtime exception returns a formatted error allocated in this
 /// runtime's call arena and valid until deinit or the next run.
 pub fn runSource(self: *Runtime, source: []const u8, name: []const u8) RunError!?[]const u8 {
+    return (try self.runInner(source, name, false)).err;
+}
+
+/// Like `runSource`, but capture the completion value's display string instead
+/// of printing it (used by `/save` verification to feed output back to the LLM).
+/// Both fields live in the call arena — valid until the next run or deinit.
+/// A failed context re-creation is reported through `.err`, not as a Zig error.
+/// Each call gets a fresh global context so a prior candidate's top-level
+/// `const`/`let` doesn't collide ("Identifier 'x' has already been declared")
+/// — verification candidates are independent runs, unlike a `/load` script.
+pub fn runSourceCapture(self: *Runtime, source: []const u8, name: []const u8) RunError!RunOutcome {
+    self.resetContext(); // start from a clean context rather than the previous run's
+    self.createContext() catch return .{ .err = try self.dupeError("script context reset failed") };
+    return self.runInner(source, name, true); // capture = true: return the value instead of printing it
+}
+
+fn runInner(self: *Runtime, source: []const u8, name: []const u8, capture: bool) RunError!RunOutcome {
    _ = self.call_arena.reset(.retain_capacity);

    var hs: lp.js.HandleScope = undefined;
@@ -245,7 +269,7 @@ pub fn runSource(self: *Runtime, source: []const u8, name: []const u8) RunError!
    defer hs.deinit();

    const context: *const v8.Context = @ptrCast(v8.v8__Global__Get(&self.context, self.env.isolate.handle) orelse
-        return try self.dupeError("agent script context is not available"));
+        return .{ .err = try self.dupeError("agent script context is not available") });
    v8.v8__Context__Enter(context);
    defer v8.v8__Context__Exit(context);

@@ -268,19 +292,27 @@ pub fn runSource(self: *Runtime, source: []const u8, name: []const u8) RunError!
        &compiler_source,
        v8.kNoCompileOptions,
        v8.kNoCacheNoReason,
-    ) orelse return try self.formatCaught(context, &try_catch, "compile failed");
+    ) orelse return .{ .err = try self.formatCaught(context, &try_catch, "compile failed") };

    const completion = v8.v8__Script__Run(script, context) orelse
-        return try self.formatCaught(context, &try_catch, "script failed");
+        return .{ .err = try self.formatCaught(context, &try_catch, "script failed") };

    // Explicit microtask policy: promise continuations only run once drained.
    self.env.performIsolateMicrotasks();
    if (v8.v8__TryCatch__HasCaught(&try_catch)) {
-        return try self.formatCaught(context, &try_catch, "script failed");
+        return .{ .err = try self.formatCaught(context, &try_catch, "script failed") };
    }

+    if (capture) {
+        if (v8.v8__Value__IsUndefined(completion)) return .{};
+        const output = self.displayString(self.call_arena.allocator(), context, completion) catch |err| switch (err) {
+            error.OutOfMemory => return error.OutOfMemory,
+            error.JsException => return .{ .output = "<completion value could not be serialized>" },
+        };
+        return .{ .output = output };
+    }
    self.printCompletion(context, completion);
-    return null;
+    return .{};
 }

 /// Echo a script's completion value (its last-evaluated expression) so a script
@@ -677,10 +709,10 @@ test "agent script runtime: goto and evaluate dispatch through browser tools" {
    defer testing.reset();
    defer if (testing.test_session.hasPage()) testing.test_session.removePage();

-    var registry = CDPNode.Registry.init(testing.allocator);
+    var registry: CDPNode.Registry = .init(testing.allocator);
    defer registry.deinit();

-    const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry);
+    const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry);
    defer runtime.deinit();

    try runTestScript(runtime,
@@ -698,10 +730,10 @@ test "agent script runtime: extract returns a JavaScript object" {
    defer testing.reset();
    defer if (testing.test_session.hasPage()) testing.test_session.removePage();

-    var registry = CDPNode.Registry.init(testing.allocator);
+    var registry: CDPNode.Registry = .init(testing.allocator);
    defer registry.deinit();

-    const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry);
+    const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry);
    defer runtime.deinit();

    try runTestScript(runtime,
@@ -752,10 +784,10 @@ test "agent script runtime: extract tolerates list selectors that match nothing"
    defer testing.reset();
    defer if (testing.test_session.hasPage()) testing.test_session.removePage();

-    var registry = CDPNode.Registry.init(testing.allocator);
+    var registry: CDPNode.Registry = .init(testing.allocator);
    defer registry.deinit();

-    const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry);
+    const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry);
    defer runtime.deinit();

    try runTestScript(runtime,
@@ -780,10 +812,10 @@ test "agent script runtime: strict-mode scripts can call primitives" {
    defer testing.reset();
    defer if (testing.test_session.hasPage()) testing.test_session.removePage();

-    var registry = CDPNode.Registry.init(testing.allocator);
+    var registry: CDPNode.Registry = .init(testing.allocator);
    defer registry.deinit();

-    const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry);
+    const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry);
    defer runtime.deinit();

    try runTestScript(runtime,
@@ -798,10 +830,10 @@ test "agent script runtime: strict-mode scripts can call primitives" {
 test "agent script runtime: promise microtasks run to completion" {
    defer testing.reset();

-    var registry = CDPNode.Registry.init(testing.allocator);
+    var registry: CDPNode.Registry = .init(testing.allocator);
    defer registry.deinit();

-    const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry);
+    const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry);
    defer runtime.deinit();

    try runTestScript(runtime,
@@ -819,10 +851,10 @@ test "agent script runtime: primitives re-entered from argument callbacks stay i
    defer testing.reset();
    defer if (testing.test_session.hasPage()) testing.test_session.removePage();

-    var registry = CDPNode.Registry.init(testing.allocator);
+    var registry: CDPNode.Registry = .init(testing.allocator);
    defer registry.deinit();

-    const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry);
+    const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry);
    defer runtime.deinit();

    try runTestScript(runtime,
@@ -841,10 +873,10 @@ test "agent script runtime: terminate interrupts local JavaScript" {
    defer testing.reset();
    defer if (testing.test_session.hasPage()) testing.test_session.removePage();

-    var registry = CDPNode.Registry.init(testing.allocator);
+    var registry: CDPNode.Registry = .init(testing.allocator);
    defer registry.deinit();

-    const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry);
+    const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry);
    defer runtime.deinit();

    const thread = try std.Thread.spawn(.{}, terminateRuntimeSoon, .{runtime});
@@ -859,10 +891,10 @@ test "agent script runtime: agent variables persist and page globals are isolate
    defer testing.reset();
    defer if (testing.test_session.hasPage()) testing.test_session.removePage();

-    var registry = CDPNode.Registry.init(testing.allocator);
+    var registry: CDPNode.Registry = .init(testing.allocator);
    defer registry.deinit();

-    const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry);
+    const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry);
    defer runtime.deinit();

    try runTestScript(runtime,
@@ -884,10 +916,10 @@ test "agent script runtime: page evaluate cannot see agent primitives or binding
    defer testing.reset();
    defer if (testing.test_session.hasPage()) testing.test_session.removePage();

-    var registry = CDPNode.Registry.init(testing.allocator);
+    var registry: CDPNode.Registry = .init(testing.allocator);
    defer registry.deinit();

-    const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry);
+    const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry);
    defer runtime.deinit();

    try runTestScript(runtime,
@@ -902,10 +934,10 @@ test "agent script runtime: page evaluate cannot see agent primitives or binding
 test "agent script runtime: console is available in agent context" {
    defer testing.reset();

-    var registry = CDPNode.Registry.init(testing.allocator);
+    var registry: CDPNode.Registry = .init(testing.allocator);
    defer registry.deinit();

-    const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry);
+    const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry);
    defer runtime.deinit();

    try runTestScript(runtime,
@@ -919,10 +951,10 @@ test "agent script runtime: tool errors throw and stop execution" {
    defer testing.reset();
    defer if (testing.test_session.hasPage()) testing.test_session.removePage();

-    var registry = CDPNode.Registry.init(testing.allocator);
+    var registry: CDPNode.Registry = .init(testing.allocator);
    defer registry.deinit();

-    const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry);
+    const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry);
    defer runtime.deinit();

    const message = (try runtime.runSource(
@@ -945,10 +977,10 @@ test "agent script runtime: builtin argument marshalling (positional + options)"
    defer testing.reset();
    defer if (testing.test_session.hasPage()) testing.test_session.removePage();

-    var registry = CDPNode.Registry.init(testing.allocator);
+    var registry: CDPNode.Registry = .init(testing.allocator);
    defer registry.deinit();

-    const runtime = try Runtime.init(testing.allocator, testing.test_app, testing.test_session, &registry);
+    const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry);
    defer runtime.deinit();

    try runTestScript(runtime,
@@ -994,3 +1026,72 @@ test "agent script runtime: builtin argument marshalling (positional + options)"
        try testing.expect(std.mem.indexOf(u8, message, "invalid arguments") != null);
    }
 }
+
+test "agent script runtime: runSourceCapture runs the full script live and captures completion" {
+    defer testing.reset();
+    defer if (testing.test_session.hasPage()) testing.test_session.removePage();
+
+    var registry: CDPNode.Registry = .init(testing.allocator);
+    defer registry.deinit();
+
+    const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry);
+    defer runtime.deinit();
+
+    // Verification runs the candidate exactly as a standalone replay does — it
+    // must navigate itself; extract then runs against the page it loaded.
+    const outcome = try runtime.runSourceCapture(
+        \\goto("http://localhost:9582/src/browser/tests/mcp_actions.html");
+        \\click("#btn");
+        \\const data = extract({ label: "#btn" });
+        \\data.label;
+    , "candidate.js");
+    try testing.expect(outcome.err == null); // no compile or runtime failure was reported
+    try std.testing.expectEqualStrings("Click Me", outcome.output); // the last expression's value, captured
+}
+
+test "agent script runtime: runSourceCapture surfaces a candidate's error" {
+    defer testing.reset();
+    defer if (testing.test_session.hasPage()) testing.test_session.removePage();
+
+    var registry: CDPNode.Registry = .init(testing.allocator);
+    defer registry.deinit();
+
+    const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry);
+    defer runtime.deinit();
+
+    // extract returns an object; treating it as an array throws — the real
+    // `raw.map is not a function` failure the model must see and fix.
+    const bad = try runtime.runSourceCapture(
+        \\goto("http://localhost:9582/src/browser/tests/mcp_actions.html");
+        \\const raw = extract({ items: [{ selector: "li" }] });
+        \\raw.map(x => x);
+    , "candidate.js");
+    try testing.expect(bad.err != null); // NOTE(review): asserts only that some error surfaced, not which one
+}
+
+test "agent script runtime: each capture run gets a fresh global scope" {
+    defer testing.reset();
+    defer if (testing.test_session.hasPage()) testing.test_session.removePage();
+
+    var registry: CDPNode.Registry = .init(testing.allocator);
+    defer registry.deinit();
+
+    const runtime: *Runtime = try .init(testing.allocator, testing.test_app, testing.test_session, &registry);
+    defer runtime.deinit();
+
+    // The same top-level `const` in two consecutive candidates must not collide —
+    // each run starts from a clean context, not the previous run's globals.
+    const src =
+        \\goto("http://localhost:9582/src/browser/tests/mcp_actions.html");
+        \\const data = extract({ label: "#btn" });
+        \\const out = data.label;
+        \\out;
+    ;
+    const first = try runtime.runSourceCapture(src, "candidate.js");
+    try testing.expect(first.err == null);
+    try std.testing.expectEqualStrings("Click Me", first.output);
+
+    const second = try runtime.runSourceCapture(src, "candidate.js"); // identical source on the same runtime
+    try testing.expect(second.err == null); // a redeclaration error would land here
+    try std.testing.expectEqualStrings("Click Me", second.output);
+}
--- a/src/string.zig
+++ b/src/string.zig
@@ -332,6 +332,29 @@ pub fn truncateUtf8(bytes: []const u8, max_bytes: usize) []const u8 {
    return bytes[0..i];
 }

+/// Cut `text` to a UTF-8-safe prefix of at most `max_bytes` and append a marker noting the
+/// original length; the result may therefore exceed `max_bytes` by the marker (at most 64 bytes).
+/// Returns `text` unchanged when it fits. Otherwise the marked copy is allocated in `allocator`;
+/// if that fails the bare prefix is returned, so only a result whose `ptr` differs from `text.ptr` is owned.
+pub fn truncateWithMarker(allocator: std.mem.Allocator, text: []const u8, max_bytes: usize) []const u8 {
+    if (text.len <= max_bytes) return text;
+    const prefix = truncateUtf8(text, max_bytes);
+    var suffix_buf: [64]u8 = undefined;
+    const suffix = std.fmt.bufPrint(&suffix_buf, "\n...[truncated, original {d} bytes]", .{text.len}) catch return prefix;
+    return std.mem.concat(allocator, u8, &.{ prefix, suffix }) catch prefix;
+}
+
+/// Strip a surrounding ```lang … ``` markdown fence and return the trimmed inner block as a
+/// slice of `text`; unfenced text is only trimmed, and a missing closing fence is tolerated.
+pub fn stripCodeFence(text: []const u8) []const u8 {
+    const t = std.mem.trim(u8, text, &std.ascii.whitespace);
+    if (!std.mem.startsWith(u8, t, "```")) return t;
+    const first_nl = std.mem.indexOfScalar(u8, t, '\n') orelse return t; // single-line input is returned as is
+    const body = t[first_nl + 1 ..]; // drops the whole opening line, language tag included
+    const close = std.mem.lastIndexOf(u8, body, "```") orelse return std.mem.trim(u8, body, &std.ascii.whitespace);
+    return std.mem.trim(u8, body[0..close], &std.ascii.whitespace);
+}
+
 // Discriminatory type that signals the bridge to use arena instead of call_arena
 // Use this for strings that need to persist beyond the current call
 // The caller can unwrap and store just the underlying .str field
@@ -378,6 +401,32 @@ test "truncateUtf8" {
    try testing.expectEqual("\xFFx", truncateUtf8("\xFFx", 2));
 }

+test "truncateWithMarker" {
+    const ta = std.testing.allocator;
+    try std.testing.expectEqualStrings("short", truncateWithMarker(ta, "short", 1024)); // fits: input returned, nothing to free
+
+    // Over-cap: a 3-byte Hangul codepoint (U+D55C, 0xED 0x95 0x9C) straddling the
+    // cap must stay valid UTF-8, and the marker must be appended.
+    const cap: usize = 1024;
+    const buf = try ta.alloc(u8, cap + 8);
+    defer ta.free(buf);
+    @memset(buf[0 .. cap - 1], 'a');
+    buf[cap - 1] = 0xED;
+    buf[cap + 0] = 0x95;
+    buf[cap + 1] = 0x9C;
+    @memset(buf[cap + 2 ..], 'b');
+
+    const out = truncateWithMarker(ta, buf, cap);
+    defer if (out.ptr != buf.ptr) ta.free(out); // the bare-prefix fallback aliases buf; only the marked copy is owned
+    try std.testing.expect(std.unicode.utf8ValidateSlice(out));
+    try std.testing.expect(std.mem.indexOf(u8, out, "truncated") != null);
+}
+
+test "stripCodeFence" {
+    try std.testing.expectEqualStrings("goto(\"x\");", stripCodeFence("```js\ngoto(\"x\");\n```")); // fenced with a language tag
+    try std.testing.expectEqualStrings("goto(\"x\");", stripCodeFence("goto(\"x\");")); // already bare: passes through
+}
+
 test "String" {
    const other_short = try String.init(undefined, "other_short", .{});
    const other_long = try String.init(testing.allocator, "other_long" ** 100, .{});