agent: enhance /save progress and verification feedback

- Add step-by-step progress reporting during script synthesis.
- Introduce yellow warning bullets for superseded verification runs.
- Refactor bullet emission in Terminal to support multiple statuses.
- Fix spinner formatting when tool arguments are empty.
2026-08-02 18:59:36 -04:00 · 2026-06-09 17:35:31 +02:00
parent b141da30ca
commit 54f467d1fa
3 changed files with 110 additions and 87 deletions
--- a/src/agent/Agent.zig
+++ b/src/agent/Agent.zig
@@ -110,13 +110,10 @@ node_registry: CDPNode.Registry,
 terminal: Terminal,
 save_buffer: Recorder,
 save_path: ?[]u8,
-/// Backs `last_extract_json`; reset alongside `save_buffer`.
 last_extract_arena: std.heap.ArenaAllocator,
-/// The JSON the most recent successful `extract` returned this session — the
-/// real data `/save` grounds and verifies its synthesized script against.
+/// JSON the latest `extract` returned this session; grounds `/save` synthesis.
 last_extract_json: ?[]const u8 = null,
-/// Set for the duration of an LLM `/save` so the `run_script` tool can reach
-/// the dry-run runtime it executes candidates on.
+/// Set during an LLM `/save` so `handleToolCall` can route `run_script`.
 active_verify: ?*Verify = null,
 script_runtime_mutex: std.Thread.Mutex = .{},
 active_script_runtime: ?*ScriptRuntime = null,
@@ -976,12 +973,15 @@ fn abortSave(self: *Agent, baseline: usize, reason: []const u8) void {
    self.failSave(reason);
 }

-/// In-flight `/save` verification harness: the dry-run runtime the `run_script`
-/// tool executes candidates on, plus the last source it ran (a fallback script
-/// if the model finishes the loop without re-emitting it as text).
+/// `/save` verification state: the runtime `run_script` executes candidates on,
+/// and the last source that ran cleanly (the saved script if the model's final
+/// message omits it).
 const Verify = struct {
    runtime: *ScriptRuntime,
    last_source: ?[]const u8 = null,
+    /// True while a clean run's bullet is held back pending its verdict: yellow
+    /// if a re-run supersedes it, green if it's the one we keep.
+    pending_ok: bool = false,
 };

 /// Agent-only addendum (kept out of the shared `save_synthesis_prompt`) telling
@@ -997,8 +997,8 @@ const save_verify_addendum =
    \\JavaScript source.
 ;

-/// Cap on the captured extract sample shown in the synthesis prompt (the full
-/// data still feeds the dry run); keeps a large result from dominating context.
+/// Cap on the extract sample shown in the synthesis prompt, so a large result
+/// doesn't dominate context.
 const save_sample_cap = 8 * 1024;

 /// LLM-synthesized `/save`. Pin the output shape first — derive the session's
@@ -1009,6 +1009,11 @@ fn synthesizeSave(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u8,
    self.conversation.ensureSystemPrompt() catch return self.failSave("out of memory");
    const baseline = self.conversation.messages.items.len;

+    // One spinner session for the save; cancel (not stop) leaves the phase steps
+    // without a per-turn "worked for" summary.
+    self.terminal.spinner.start();
+    defer self.terminal.spinner.cancel();
+
    const anchor = prompt orelse self.one_shot_task;
    const schema = self.deriveOutputSchema(arena, baseline, anchor);
    if (self.cancel_requested.load(.acquire)) {
@@ -1019,18 +1024,20 @@ fn synthesizeSave(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u8,
    self.synthesizeScript(arena, filename, prompt, schema);
 }

-/// Steps 1–2 of `/save`: intent (over the session) → typed output schema. Both
-/// turns leave the conversation as they found it; returns null if either turn
-/// produced nothing usable (the caller then synthesizes without a schema).
+/// Steps 1–2 of `/save`: session intent → typed output schema. Null if either
+/// turn produced nothing usable (the caller then synthesizes without a schema).
 fn deriveOutputSchema(self: *Agent, arena: std.mem.Allocator, baseline: usize, anchor: ?[]const u8) ?[]const u8 {
    const intent = self.deriveIntent(arena, baseline, anchor) orelse return null;
+    self.terminal.agentStep("captured the intent");
    if (self.cancel_requested.load(.acquire)) return null;
-    return self.deriveSchema(arena, intent);
+    const schema = self.deriveSchema(arena, intent) orelse return null;
+    self.terminal.agentStep("generated output schema");
+    return schema;
 }

-/// One-sentence intent from the session turns. Runs over the live conversation
-/// (so the model sees the session) but rolls back to `baseline`, keeping the
-/// turn out of history. An explicit anchor is folded in as authoritative.
+/// One-sentence intent from the session. Runs over the live conversation then
+/// rolls back to `baseline`, keeping the turn out of history; an explicit anchor
+/// is authoritative.
 fn deriveIntent(self: *Agent, arena: std.mem.Allocator, baseline: usize, anchor: ?[]const u8) ?[]const u8 {
    const ma = self.conversation.arena.allocator();
    var out: std.Io.Writer.Allocating = .init(ma);
@@ -1040,12 +1047,11 @@ fn deriveIntent(self: *Agent, arena: std.mem.Allocator, baseline: usize, anchor:
    }
    self.conversation.messages.append(self.allocator, .{ .role = .user, .content = out.written() }) catch return null;
    defer self.conversation.rollback(baseline);
-    return self.runTextTurn(&self.conversation.messages, arena, self.allocator, ma, 512, "understanding the task");
+    return self.runTextTurn(&self.conversation.messages, arena, self.allocator, ma, 512, "capturing the intent");
 }

-/// Typed output schema from the intent. Runs over a throwaway message list —
-/// not the conversation — so the schema is derived from the logical intent
-/// alone, blind to the page structure and how the data was fetched.
+/// Typed output schema from the intent. Runs over a throwaway message list (not
+/// the conversation) so it's derived from the intent alone, blind to the page.
 fn deriveSchema(self: *Agent, arena: std.mem.Allocator, intent: []const u8) ?[]const u8 {
    var msgs: std.ArrayList(zenai.provider.Message) = .empty;
    const msg = std.fmt.allocPrint(arena, "{s} {s}", .{ browser_tools.save_schema_prompt, intent }) catch return null;
@@ -1054,9 +1060,8 @@ fn deriveSchema(self: *Agent, arena: std.mem.Allocator, intent: []const u8) ?[]c
    return string.stripCodeFence(raw);
 }

-/// Run a single no-tools text turn over `messages` and return the model's text
-/// duped into `dest` (so it survives any rollback of `messages`), or null on
-/// cancel, error, or empty output. Shared by the intent and schema steps.
+/// Single no-tools text turn; returns the model's text duped into `dest` (so it
+/// survives a rollback of `messages`), or null on cancel/error/empty output.
 fn runTextTurn(
    self: *Agent,
    messages: *std.ArrayList(zenai.provider.Message),
@@ -1066,7 +1071,6 @@ fn runTextTurn(
    max_tokens: i32,
    status: []const u8,
 ) ?[]const u8 {
-    self.terminal.spinner.start();
    self.terminal.spinner.setStatus(status);
    var result = self.ai_client.?.runTools(
        self.model,
@@ -1083,11 +1087,9 @@ fn runTextTurn(
            .cancel = .{ .context = @ptrCast(self), .checkFn = checkCancel },
        },
    ) catch |err| {
-        self.terminal.spinner.cancel();
        if (!self.cancel_requested.load(.acquire)) log.err(.app, "AI save schema turn error", .{ .err = err });
        return null;
    };
-    self.terminal.spinner.stop();
    defer result.deinit();
    self.total_usage.add(result.usage);
    if (result.cancelled) return null;
@@ -1096,9 +1098,8 @@ fn runTextTurn(
    return dest.dupe(u8, text) catch null;
 }

-/// Step 3 of `/save`: hand the model the builtin catalog, the full conversation,
-/// the deterministic record of what ran, and the required output schema, then
-/// write the idiomatic script it returns.
+/// Step 3 of `/save`: synthesize the script from the catalog, conversation,
+/// recorded calls, and output schema, then write it.
 fn synthesizeScript(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u8, prompt: ?[]const u8, schema: ?[]const u8) void {
    const provider_client = self.ai_client.?;

@@ -1110,13 +1111,12 @@ fn synthesizeScript(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u
    const ma = self.conversation.arena.allocator();
    const baseline = self.conversation.messages.items.len;

-    // When the session captured extract data, let the model test candidates on
-    // it via `run_script`; otherwise fall back to a single no-tools synthesis.
+    // With captured extract data, give the model `run_script` to test candidates;
+    // otherwise a single no-tools synthesis.
    var verify: Verify = .{ .runtime = undefined };
    var run_tools: [1]ProviderTool = undefined;
    const verifying = blk: {
-        // Gate on a captured extract: it means the session loaded the page and
-        // left it in a state worth verifying against (and gives a prompt sample).
+        // A captured extract means there's a loaded page worth verifying against.
        if (self.last_extract_json == null) break :blk false;
        run_tools[0] = browser_tools.runScriptToolDef(ma) catch break :blk false;
        const runtime = ScriptRuntime.init(self.allocator, self.browser.app, self.session, &self.node_registry) catch break :blk false;
@@ -1143,7 +1143,6 @@ fn synthesizeScript(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u
    const user_msg = self.buildSaveSynthesisMessage(ma, prompt, schema, sample) catch return self.failSave("out of memory");
    self.conversation.messages.append(self.allocator, .{ .role = .user, .content = user_msg }) catch return self.failSave("out of memory");

-    self.terminal.spinner.start();
    self.terminal.spinner.setStatus(if (verifying) "writing and testing the script" else "writing the script");
    var result = provider_client.runTools(
        self.model,
@@ -1160,7 +1159,6 @@ fn synthesizeScript(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u
            .cancel = .{ .context = @ptrCast(self), .checkFn = checkCancel },
        },
    ) catch |err| {
-        self.terminal.spinner.cancel();
        if (self.cancel_requested.load(.acquire)) {
            self.resetAfterCancel(baseline);
            return;
@@ -1168,7 +1166,6 @@ fn synthesizeScript(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u
        log.err(.app, "AI save synthesis error", .{ .err = err });
        return self.abortSave(baseline, @errorName(err));
    };
-    self.terminal.spinner.stop();
    defer result.deinit();
    self.total_usage.add(result.usage);

@@ -1177,10 +1174,8 @@ fn synthesizeScript(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u
        return;
    }

-    // Prefer the last candidate that ran cleanly — it's verified, pure JS, with
-    // none of the commentary the model sometimes wraps its final message in. Fall
-    // back to the final text only when nothing ran (no extract data, or it never
-    // called run_script).
+    // Prefer the last cleanly-run candidate: verified, pure JS without the model's
+    // surrounding commentary. Fall back to the final text only when nothing ran.
    const raw: []const u8 = blk: {
        if (verifying) {
            if (verify.last_source) |s| break :blk s;
@@ -1191,8 +1186,7 @@ fn synthesizeScript(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u
        return self.abortSave(baseline, "the model returned no script");
    };

-    // `raw` lives in the conversation arena, freed by the rollback below; copy
-    // into the command arena first (scrubbing may return its input as-is).
+    // `raw` is freed by the rollback below; copy into the command arena first.
    const owned = arena.dupe(u8, string.stripCodeFence(raw)) catch return self.abortSave(baseline, "out of memory");
    const script = browser_tools.reverseSubstituteEnvVars(arena, owned) catch return self.abortSave(baseline, "out of memory");

@@ -1206,38 +1200,50 @@ fn synthesizeScript(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u

    self.rememberSavePath(path);
    self.resetSaveBuffers();
-    self.terminal.printInfo("Saved synthesized script to {s}", .{path});
+    self.flushPendingRun(.ok);
+    self.terminal.agentStep(std.fmt.allocPrint(arena, "saved synthesized script to {s}", .{path}) catch "saved synthesized script");
 }

-/// `run_script` tool handler: execute `source` on the dry-run runtime and hand
-/// the model back the completion value (or the error), so it can judge and fix
-/// its own script against real data.
+/// Emit the clean run held back by `runScriptTool`, colored by `status` (warn if
+/// a re-run superseded it, ok if it's the one we kept), at most once.
+fn flushPendingRun(self: *Agent, status: Terminal.BulletStatus) void {
+    const verify = self.active_verify orelse return;
+    if (!verify.pending_ok) return;
+    verify.pending_ok = false;
+    self.terminal.agentVerifyRun("", status);
+}
+
+/// `run_script` handler: run the candidate live and return its completion value
+/// (or error) to the model so it can judge and fix its own script.
 fn runScriptTool(self: *Agent, allocator: std.mem.Allocator, arguments: ?std.json.Value) zenai.provider.Client.ToolHandler.Result {
    const verify = self.active_verify.?;
+    // This call supersedes any clean run held back from the previous one.
+    self.flushPendingRun(.warn);
+
    const args = browser_tools.parseArgsOrDefault(struct { source: []const u8 = "" }, allocator, arguments) catch
        return .{ .content = "invalid run_script arguments", .is_error = true };
    const source = args.source;
    if (source.len == 0) return .{ .content = "run_script requires a non-empty \"source\" string", .is_error = true };

-    // Start each candidate from a blank page, exactly like a standalone replay —
-    // so a script that forgets to goto(...) fails here instead of silently relying
-    // on the page the session left loaded.
+    // Blank page per candidate, like a standalone replay, so a script missing
+    // goto(...) fails here instead of using the page the session left loaded.
    if (self.session.hasPage()) self.session.removePage();

    const outcome = verify.runtime.runSourceCapture(source, "candidate.js") catch
        return .{ .content = "out of memory running candidate", .is_error = true };
    if (outcome.err) |e| {
-        self.terminal.agentVerifyRun(oneLinePreview(allocator, e, 120), false);
+        self.terminal.agentVerifyRun(oneLinePreview(allocator, e, 120), .fail);
        return .{ .content = std.fmt.allocPrint(allocator, "Script threw: {s}", .{e}) catch "Script threw an error", .is_error = true };
    }

    // Keep the last source that ran cleanly — it's the verified, prose-free
    // artifact `synthesizeScript` saves, instead of the model's final message
-    // (which may wrap the script in commentary).
+    // (which may wrap the script in commentary). Hold its bullet until we know
+    // whether the model keeps this run or tries another.
    verify.last_source = self.conversation.arena.allocator().dupe(u8, source) catch source;
+    verify.pending_ok = true;

    const body = if (outcome.output.len == 0) "(completion value is empty/undefined)" else outcome.output;
-    self.terminal.agentVerifyRun(oneLinePreview(allocator, body, 120), true);
    const content = std.fmt.allocPrint(allocator, "Completion value:\n{s}", .{body}) catch body;
    return .{ .content = string.truncateWithMarker(allocator, content, tool_output_max_bytes), .is_error = false };
 }
--- a/src/agent/Spinner.zig
+++ b/src/agent/Spinner.zig
@@ -302,10 +302,12 @@ fn renderLocked(self: *Spinner) void {
            const cap = @min(max_args_cells, room);
            const cut = truncToCells(all_args, cap);
            const suffix: []const u8 = if (cut < all_args.len) ellipsis else "";
+            // No space between name and args when there are none (e.g. run_script).
+            const sep: []const u8 = if (cut == 0) "" else " ";
            break :blk std.fmt.bufPrint(
                &buf,
-                "\r" ++ ansi.yellow ++ "{s}" ++ ansi.reset ++ " " ++ ansi.dim ++ "[{s}{s} {s}{s}]" ++ ansi.reset ++ clear_eol,
-                .{ glyph, prefix, name, all_args[0..cut], suffix },
+                "\r" ++ ansi.yellow ++ "{s}" ++ ansi.reset ++ " " ++ ansi.dim ++ "[{s}{s}{s}{s}{s}]" ++ ansi.reset ++ clear_eol,
+                .{ glyph, prefix, name, sep, all_args[0..cut], suffix },
            ) catch return;
        },
    };
--- a/src/agent/Terminal.zig
+++ b/src/agent/Terminal.zig
@@ -193,8 +193,6 @@ pub fn deinit(self: *Terminal) void {
    if (self.repl_arena) |*a| a.deinit();
 }

-const bullet_line_fmt = "{s}●{s} {s}[tool: {s}]{s} {s}\n";
-
 /// Mark the start of a manual REPL tool call. Pairs with `endTool`.
 pub fn beginTool(self: *Terminal, name: []const u8, args: []const u8) void {
    self.spinner.setTool(name, args);
@@ -206,47 +204,64 @@ pub fn endTool(self: *Terminal) void {
    self.spinner.cancel();
 }

-/// Called after the tool returns. At `medium`+, commits a `● [tool: …]` line
-/// above the spinner (green/red bullet for ok/fail) so the run leaves a trace.
-/// ANSI is emitted even in non-TTY contexts — pipes that strip color see plain
-/// text via the bullet character.
+/// Bullet color for a committed `●` line: ok=green, warn=yellow, fail=red.
+pub const BulletStatus = enum {
+    ok,
+    warn,
+    fail,
+
+    fn color(self: BulletStatus) []const u8 {
+        return switch (self) {
+            .ok => ansi.green,
+            .warn => ansi.yellow,
+            .fail => ansi.red,
+        };
+    }
+};
+
+/// A completed step in a multi-phase agent operation (e.g. `/save`'s "captured
+/// the intent"). Committed above the spinner at any verbosity.
+pub fn agentStep(self: *Terminal, text: []const u8) void {
+    self.emitBullet(.ok, "{s}{s}{s}", .{ ansi.dim, text, ansi.reset });
+}
+
+/// Called after a tool returns. At `medium`+, commits a `● [tool: …]` line above
+/// the spinner (green/red bullet for ok/fail) so the run leaves a trace.
 pub fn agentToolDone(self: *Terminal, name: []const u8, args: []const u8, ok: bool) void {
    if (!self.verbosity.atLeast(.medium)) return;
-    self.emitToolBullet(name, args, ok);
+    self.emitToolBullet(name, args, if (ok) .ok else .fail);
 }

-/// Trace one `/save` candidate run. Unlike `agentToolDone` this is shown even at
-/// the REPL's default `.low` verbosity: the verify loop is an infrequent,
-/// user-initiated step the user needs to watch happen.
-pub fn agentVerifyRun(self: *Terminal, summary: []const u8, ok: bool) void {
-    self.emitToolBullet("run_script", summary, ok);
+/// Trace one `/save` candidate run — shown even at the REPL's default `.low`
+/// verbosity. `detail` carries the error on failure, empty otherwise. `status`:
+/// ok=the kept run, warn=superseded by a re-run, fail=errored.
+pub fn agentVerifyRun(self: *Terminal, detail: []const u8, status: BulletStatus) void {
+    self.emitToolBullet("run_script", detail, status);
 }

-fn emitToolBullet(self: *Terminal, name: []const u8, args: []const u8, ok: bool) void {
+fn emitToolBullet(self: *Terminal, name: []const u8, args: []const u8, status: BulletStatus) void {
+    if (args.len == 0)
+        self.emitBullet(status, "{s}[tool: {s}]{s}", .{ ansi.dim, name, ansi.reset })
+    else
+        self.emitBullet(status, "{s}[tool: {s}]{s} {s}", .{ ansi.dim, name, ansi.reset, args });
+}
+
+/// Commit a `● <body>` line above the spinner (or to stderr when it's off);
+/// `status` colors the bullet, which doubles as a plain-text marker for pipes.
+/// Shared by phase steps and tool-call traces.
+fn emitBullet(self: *Terminal, status: BulletStatus, comptime fmt: []const u8, args: anytype) void {
+    const bullet = status.color();
    if (self.spinner.isEnabled()) {
        const a = if (self.repl_arena) |*ra| ra else return;
        defer _ = a.reset(.retain_capacity);
-        const bytes = formatBulletLine(a.allocator(), name, args, ok) catch return;
-        _ = self.spinner.emitAbove(bytes);
+        const body = std.fmt.allocPrint(a.allocator(), fmt, args) catch return;
+        const line = std.fmt.allocPrint(a.allocator(), "{s}●{s} {s}\n", .{ bullet, ansi.reset, body }) catch return;
+        _ = self.spinner.emitAbove(line);
        return;
    }
-    if (self.stderr_is_tty) {
-        const bullet_color = if (ok) ansi.green else ansi.red;
-        std.debug.print(bullet_line_fmt, .{ bullet_color, ansi.reset, ansi.dim, name, ansi.reset, args });
-    } else {
-        std.debug.print(
-            "{s}{s}[tool: {s}]{s} {s}\n",
-            .{ ansi.dim, ansi.cyan, name, ansi.reset, args },
-        );
-    }
-}
-
-fn formatBulletLine(arena: std.mem.Allocator, name: []const u8, args: []const u8, ok: bool) ![]const u8 {
-    var aw: std.Io.Writer.Allocating = .init(arena);
-    const w = &aw.writer;
-    const bullet_color = if (ok) ansi.green else ansi.red;
-    try w.print(bullet_line_fmt, .{ bullet_color, ansi.reset, ansi.dim, name, ansi.reset, args });
-    return aw.written();
+    std.debug.print("{s}●{s} ", .{ bullet, ansi.reset });
+    std.debug.print(fmt, args);
+    std.debug.print("\n", .{});
 }

 const completion_buf_len = 512;