agent: enhance /save progress and verification feedback

- Add step-by-step progress reporting during script synthesis.
- Introduce yellow warning bullets for superseded verification runs.
- Refactor bullet emission in Terminal to support multiple statuses.
- Fix spinner formatting when tool arguments are empty.
This commit is contained in:
Adrià Arrufat
2026-06-09 17:35:31 +02:00
parent b141da30ca
commit 54f467d1fa
3 changed files with 110 additions and 87 deletions

View File

@@ -110,13 +110,10 @@ node_registry: CDPNode.Registry,
terminal: Terminal,
save_buffer: Recorder,
save_path: ?[]u8,
/// Backs `last_extract_json`; reset alongside `save_buffer`.
last_extract_arena: std.heap.ArenaAllocator,
/// The JSON the most recent successful `extract` returned this session — the
/// real data `/save` grounds and verifies its synthesized script against.
/// JSON the latest `extract` returned this session; grounds `/save` synthesis.
last_extract_json: ?[]const u8 = null,
/// Set for the duration of an LLM `/save` so the `run_script` tool can reach
/// the dry-run runtime it executes candidates on.
/// Set during an LLM `/save` so `handleToolCall` can route `run_script`.
active_verify: ?*Verify = null,
script_runtime_mutex: std.Thread.Mutex = .{},
active_script_runtime: ?*ScriptRuntime = null,
@@ -976,12 +973,15 @@ fn abortSave(self: *Agent, baseline: usize, reason: []const u8) void {
self.failSave(reason);
}
/// In-flight `/save` verification harness: the dry-run runtime the `run_script`
/// tool executes candidates on, plus the last source it ran (a fallback script
/// if the model finishes the loop without re-emitting it as text).
/// `/save` verification state: the runtime `run_script` executes candidates on,
/// and the last source that ran cleanly (the saved script if the model's final
/// message omits it).
const Verify = struct {
runtime: *ScriptRuntime,
last_source: ?[]const u8 = null,
/// A clean run whose bullet is held back until we know its verdict: yellow if
/// a re-run supersedes it, green if it's the one we keep.
pending_ok: bool = false,
};
/// Agent-only addendum (kept out of the shared `save_synthesis_prompt`) telling
@@ -997,8 +997,8 @@ const save_verify_addendum =
\\JavaScript source.
;
/// Cap on the captured extract sample shown in the synthesis prompt (the full
/// data still feeds the dry run); keeps a large result from dominating context.
/// Cap on the extract sample shown in the synthesis prompt, so a large result
/// doesn't dominate context.
const save_sample_cap = 8 * 1024;
/// LLM-synthesized `/save`. Pin the output shape first — derive the session's
@@ -1009,6 +1009,11 @@ fn synthesizeSave(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u8,
self.conversation.ensureSystemPrompt() catch return self.failSave("out of memory");
const baseline = self.conversation.messages.items.len;
// One spinner session for the save; cancel (not stop) leaves the phase steps
// without a per-turn "worked for" summary.
self.terminal.spinner.start();
defer self.terminal.spinner.cancel();
const anchor = prompt orelse self.one_shot_task;
const schema = self.deriveOutputSchema(arena, baseline, anchor);
if (self.cancel_requested.load(.acquire)) {
@@ -1019,18 +1024,20 @@ fn synthesizeSave(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u8,
self.synthesizeScript(arena, filename, prompt, schema);
}
/// Steps 12 of `/save`: intent (over the session) → typed output schema. Both
/// turns leave the conversation as they found it; returns null if either turn
/// produced nothing usable (the caller then synthesizes without a schema).
/// Steps 12 of `/save`: session intent → typed output schema. Null if either
/// turn produced nothing usable (the caller then synthesizes without a schema).
fn deriveOutputSchema(self: *Agent, arena: std.mem.Allocator, baseline: usize, anchor: ?[]const u8) ?[]const u8 {
const intent = self.deriveIntent(arena, baseline, anchor) orelse return null;
self.terminal.agentStep("captured the intent");
if (self.cancel_requested.load(.acquire)) return null;
return self.deriveSchema(arena, intent);
const schema = self.deriveSchema(arena, intent) orelse return null;
self.terminal.agentStep("generated output schema");
return schema;
}
/// One-sentence intent from the session turns. Runs over the live conversation
/// (so the model sees the session) but rolls back to `baseline`, keeping the
/// turn out of history. An explicit anchor is folded in as authoritative.
/// One-sentence intent from the session. Runs over the live conversation then
/// rolls back to `baseline`, keeping the turn out of history; an explicit anchor
/// is authoritative.
fn deriveIntent(self: *Agent, arena: std.mem.Allocator, baseline: usize, anchor: ?[]const u8) ?[]const u8 {
const ma = self.conversation.arena.allocator();
var out: std.Io.Writer.Allocating = .init(ma);
@@ -1040,12 +1047,11 @@ fn deriveIntent(self: *Agent, arena: std.mem.Allocator, baseline: usize, anchor:
}
self.conversation.messages.append(self.allocator, .{ .role = .user, .content = out.written() }) catch return null;
defer self.conversation.rollback(baseline);
return self.runTextTurn(&self.conversation.messages, arena, self.allocator, ma, 512, "understanding the task");
return self.runTextTurn(&self.conversation.messages, arena, self.allocator, ma, 512, "capturing the intent");
}
/// Typed output schema from the intent. Runs over a throwaway message list
/// not the conversation so the schema is derived from the logical intent
/// alone, blind to the page structure and how the data was fetched.
/// Typed output schema from the intent. Runs over a throwaway message list (not
/// the conversation) so it's derived from the intent alone, blind to the page.
fn deriveSchema(self: *Agent, arena: std.mem.Allocator, intent: []const u8) ?[]const u8 {
var msgs: std.ArrayList(zenai.provider.Message) = .empty;
const msg = std.fmt.allocPrint(arena, "{s} {s}", .{ browser_tools.save_schema_prompt, intent }) catch return null;
@@ -1054,9 +1060,8 @@ fn deriveSchema(self: *Agent, arena: std.mem.Allocator, intent: []const u8) ?[]c
return string.stripCodeFence(raw);
}
/// Run a single no-tools text turn over `messages` and return the model's text
/// duped into `dest` (so it survives any rollback of `messages`), or null on
/// cancel, error, or empty output. Shared by the intent and schema steps.
/// Single no-tools text turn; returns the model's text duped into `dest` (so it
/// survives a rollback of `messages`), or null on cancel/error/empty output.
fn runTextTurn(
self: *Agent,
messages: *std.ArrayList(zenai.provider.Message),
@@ -1066,7 +1071,6 @@ fn runTextTurn(
max_tokens: i32,
status: []const u8,
) ?[]const u8 {
self.terminal.spinner.start();
self.terminal.spinner.setStatus(status);
var result = self.ai_client.?.runTools(
self.model,
@@ -1083,11 +1087,9 @@ fn runTextTurn(
.cancel = .{ .context = @ptrCast(self), .checkFn = checkCancel },
},
) catch |err| {
self.terminal.spinner.cancel();
if (!self.cancel_requested.load(.acquire)) log.err(.app, "AI save schema turn error", .{ .err = err });
return null;
};
self.terminal.spinner.stop();
defer result.deinit();
self.total_usage.add(result.usage);
if (result.cancelled) return null;
@@ -1096,9 +1098,8 @@ fn runTextTurn(
return dest.dupe(u8, text) catch null;
}
/// Step 3 of `/save`: hand the model the builtin catalog, the full conversation,
/// the deterministic record of what ran, and the required output schema, then
/// write the idiomatic script it returns.
/// Step 3 of `/save`: synthesize the script from the catalog, conversation,
/// recorded calls, and output schema, then write it.
fn synthesizeScript(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u8, prompt: ?[]const u8, schema: ?[]const u8) void {
const provider_client = self.ai_client.?;
@@ -1110,13 +1111,12 @@ fn synthesizeScript(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u
const ma = self.conversation.arena.allocator();
const baseline = self.conversation.messages.items.len;
// When the session captured extract data, let the model test candidates on
// it via `run_script`; otherwise fall back to a single no-tools synthesis.
// With captured extract data, give the model `run_script` to test candidates;
// otherwise a single no-tools synthesis.
var verify: Verify = .{ .runtime = undefined };
var run_tools: [1]ProviderTool = undefined;
const verifying = blk: {
// Gate on a captured extract: it means the session loaded the page and
// left it in a state worth verifying against (and gives a prompt sample).
// A captured extract means there's a loaded page worth verifying against.
if (self.last_extract_json == null) break :blk false;
run_tools[0] = browser_tools.runScriptToolDef(ma) catch break :blk false;
const runtime = ScriptRuntime.init(self.allocator, self.browser.app, self.session, &self.node_registry) catch break :blk false;
@@ -1143,7 +1143,6 @@ fn synthesizeScript(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u
const user_msg = self.buildSaveSynthesisMessage(ma, prompt, schema, sample) catch return self.failSave("out of memory");
self.conversation.messages.append(self.allocator, .{ .role = .user, .content = user_msg }) catch return self.failSave("out of memory");
self.terminal.spinner.start();
self.terminal.spinner.setStatus(if (verifying) "writing and testing the script" else "writing the script");
var result = provider_client.runTools(
self.model,
@@ -1160,7 +1159,6 @@ fn synthesizeScript(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u
.cancel = .{ .context = @ptrCast(self), .checkFn = checkCancel },
},
) catch |err| {
self.terminal.spinner.cancel();
if (self.cancel_requested.load(.acquire)) {
self.resetAfterCancel(baseline);
return;
@@ -1168,7 +1166,6 @@ fn synthesizeScript(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u
log.err(.app, "AI save synthesis error", .{ .err = err });
return self.abortSave(baseline, @errorName(err));
};
self.terminal.spinner.stop();
defer result.deinit();
self.total_usage.add(result.usage);
@@ -1177,10 +1174,8 @@ fn synthesizeScript(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u
return;
}
// Prefer the last candidate that ran cleanly — it's verified, pure JS, with
// none of the commentary the model sometimes wraps its final message in. Fall
// back to the final text only when nothing ran (no extract data, or it never
// called run_script).
// Prefer the last cleanly-run candidate: verified, pure JS without the model's
// surrounding commentary. Fall back to the final text only when nothing ran.
const raw: []const u8 = blk: {
if (verifying) {
if (verify.last_source) |s| break :blk s;
@@ -1191,8 +1186,7 @@ fn synthesizeScript(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u
return self.abortSave(baseline, "the model returned no script");
};
// `raw` lives in the conversation arena, freed by the rollback below; copy
// into the command arena first (scrubbing may return its input as-is).
// `raw` is freed by the rollback below; copy into the command arena first.
const owned = arena.dupe(u8, string.stripCodeFence(raw)) catch return self.abortSave(baseline, "out of memory");
const script = browser_tools.reverseSubstituteEnvVars(arena, owned) catch return self.abortSave(baseline, "out of memory");
@@ -1206,38 +1200,50 @@ fn synthesizeScript(self: *Agent, arena: std.mem.Allocator, filename: ?[]const u
self.rememberSavePath(path);
self.resetSaveBuffers();
self.terminal.printInfo("Saved synthesized script to {s}", .{path});
self.flushPendingRun(.ok);
self.terminal.agentStep(std.fmt.allocPrint(arena, "saved synthesized script to {s}", .{path}) catch "saved synthesized script");
}
/// `run_script` tool handler: execute `source` on the dry-run runtime and hand
/// the model back the completion value (or the error), so it can judge and fix
/// its own script against real data.
/// Emit the clean run held back by `runScriptTool`, colored by `status` (warn if
/// a re-run superseded it, ok if it's the one we kept), at most once.
fn flushPendingRun(self: *Agent, status: Terminal.BulletStatus) void {
const verify = self.active_verify orelse return;
if (!verify.pending_ok) return;
verify.pending_ok = false;
self.terminal.agentVerifyRun("", status);
}
/// `run_script` handler: run the candidate live and return its completion value
/// (or error) to the model so it can judge and fix its own script.
fn runScriptTool(self: *Agent, allocator: std.mem.Allocator, arguments: ?std.json.Value) zenai.provider.Client.ToolHandler.Result {
const verify = self.active_verify.?;
// This call supersedes any clean run held back from the previous one.
self.flushPendingRun(.warn);
const args = browser_tools.parseArgsOrDefault(struct { source: []const u8 = "" }, allocator, arguments) catch
return .{ .content = "invalid run_script arguments", .is_error = true };
const source = args.source;
if (source.len == 0) return .{ .content = "run_script requires a non-empty \"source\" string", .is_error = true };
// Start each candidate from a blank page, exactly like a standalone replay
// so a script that forgets to goto(...) fails here instead of silently relying
// on the page the session left loaded.
// Blank page per candidate, like a standalone replay, so a script missing
// goto(...) fails here instead of using the page the session left loaded.
if (self.session.hasPage()) self.session.removePage();
const outcome = verify.runtime.runSourceCapture(source, "candidate.js") catch
return .{ .content = "out of memory running candidate", .is_error = true };
if (outcome.err) |e| {
self.terminal.agentVerifyRun(oneLinePreview(allocator, e, 120), false);
self.terminal.agentVerifyRun(oneLinePreview(allocator, e, 120), .fail);
return .{ .content = std.fmt.allocPrint(allocator, "Script threw: {s}", .{e}) catch "Script threw an error", .is_error = true };
}
// Keep the last source that ran cleanly — it's the verified, prose-free
// artifact `synthesizeScript` saves, instead of the model's final message
// (which may wrap the script in commentary).
// (which may wrap the script in commentary). Hold its bullet until we know
// whether the model keeps this run or tries another.
verify.last_source = self.conversation.arena.allocator().dupe(u8, source) catch source;
verify.pending_ok = true;
const body = if (outcome.output.len == 0) "(completion value is empty/undefined)" else outcome.output;
self.terminal.agentVerifyRun(oneLinePreview(allocator, body, 120), true);
const content = std.fmt.allocPrint(allocator, "Completion value:\n{s}", .{body}) catch body;
return .{ .content = string.truncateWithMarker(allocator, content, tool_output_max_bytes), .is_error = false };
}

View File

@@ -302,10 +302,12 @@ fn renderLocked(self: *Spinner) void {
const cap = @min(max_args_cells, room);
const cut = truncToCells(all_args, cap);
const suffix: []const u8 = if (cut < all_args.len) ellipsis else "";
// No space between name and args when there are none (e.g. run_script).
const sep: []const u8 = if (cut == 0) "" else " ";
break :blk std.fmt.bufPrint(
&buf,
"\r" ++ ansi.yellow ++ "{s}" ++ ansi.reset ++ " " ++ ansi.dim ++ "[{s}{s} {s}{s}]" ++ ansi.reset ++ clear_eol,
.{ glyph, prefix, name, all_args[0..cut], suffix },
"\r" ++ ansi.yellow ++ "{s}" ++ ansi.reset ++ " " ++ ansi.dim ++ "[{s}{s}{s}{s}{s}]" ++ ansi.reset ++ clear_eol,
.{ glyph, prefix, name, sep, all_args[0..cut], suffix },
) catch return;
},
};

View File

@@ -193,8 +193,6 @@ pub fn deinit(self: *Terminal) void {
if (self.repl_arena) |*a| a.deinit();
}
const bullet_line_fmt = "{s}●{s} {s}[tool: {s}]{s} {s}\n";
/// Mark the start of a manual REPL tool call. Pairs with `endTool`.
pub fn beginTool(self: *Terminal, name: []const u8, args: []const u8) void {
self.spinner.setTool(name, args);
@@ -206,47 +204,64 @@ pub fn endTool(self: *Terminal) void {
self.spinner.cancel();
}
/// Called after the tool returns. At `medium`+, commits a `● [tool: …]` line
/// above the spinner (green/red bullet for ok/fail) so the run leaves a trace.
/// ANSI is emitted even in non-TTY contexts — pipes that strip color see plain
/// text via the bullet character.
/// Bullet color for a committed `●` line: ok=green, warn=yellow, fail=red.
pub const BulletStatus = enum {
ok,
warn,
fail,
fn color(self: BulletStatus) []const u8 {
return switch (self) {
.ok => ansi.green,
.warn => ansi.yellow,
.fail => ansi.red,
};
}
};
/// A completed step in a multi-phase agent operation (e.g. `/save`'s "captured
/// the intent"). Committed above the spinner at any verbosity.
pub fn agentStep(self: *Terminal, text: []const u8) void {
self.emitBullet(.ok, "{s}{s}{s}", .{ ansi.dim, text, ansi.reset });
}
/// Called after a tool returns. At `medium`+, commits a `● [tool: …]` line above
/// the spinner (green/red bullet for ok/fail) so the run leaves a trace.
pub fn agentToolDone(self: *Terminal, name: []const u8, args: []const u8, ok: bool) void {
if (!self.verbosity.atLeast(.medium)) return;
self.emitToolBullet(name, args, ok);
self.emitToolBullet(name, args, if (ok) .ok else .fail);
}
/// Trace one `/save` candidate run. Unlike `agentToolDone` this is shown even at
/// the REPL's default `.low` verbosity: the verify loop is an infrequent,
/// user-initiated step the user needs to watch happen.
pub fn agentVerifyRun(self: *Terminal, summary: []const u8, ok: bool) void {
self.emitToolBullet("run_script", summary, ok);
/// Trace one `/save` candidate run — shown even at the REPL's default `.low`
/// verbosity. `detail` carries the error on failure, empty otherwise. `status`:
/// ok=the kept run, warn=superseded by a re-run, fail=errored.
pub fn agentVerifyRun(self: *Terminal, detail: []const u8, status: BulletStatus) void {
self.emitToolBullet("run_script", detail, status);
}
fn emitToolBullet(self: *Terminal, name: []const u8, args: []const u8, ok: bool) void {
fn emitToolBullet(self: *Terminal, name: []const u8, args: []const u8, status: BulletStatus) void {
if (args.len == 0)
self.emitBullet(status, "{s}[tool: {s}]{s}", .{ ansi.dim, name, ansi.reset })
else
self.emitBullet(status, "{s}[tool: {s}]{s} {s}", .{ ansi.dim, name, ansi.reset, args });
}
/// Commit a `● <body>` line above the spinner (or to stderr when it's off);
/// `status` colors the bullet, which doubles as a plain-text marker for pipes.
/// Shared by phase steps and tool-call traces.
fn emitBullet(self: *Terminal, status: BulletStatus, comptime fmt: []const u8, args: anytype) void {
const bullet = status.color();
if (self.spinner.isEnabled()) {
const a = if (self.repl_arena) |*ra| ra else return;
defer _ = a.reset(.retain_capacity);
const bytes = formatBulletLine(a.allocator(), name, args, ok) catch return;
_ = self.spinner.emitAbove(bytes);
const body = std.fmt.allocPrint(a.allocator(), fmt, args) catch return;
const line = std.fmt.allocPrint(a.allocator(), "{s}●{s} {s}\n", .{ bullet, ansi.reset, body }) catch return;
_ = self.spinner.emitAbove(line);
return;
}
if (self.stderr_is_tty) {
const bullet_color = if (ok) ansi.green else ansi.red;
std.debug.print(bullet_line_fmt, .{ bullet_color, ansi.reset, ansi.dim, name, ansi.reset, args });
} else {
std.debug.print(
"{s}{s}[tool: {s}]{s} {s}\n",
.{ ansi.dim, ansi.cyan, name, ansi.reset, args },
);
}
}
fn formatBulletLine(arena: std.mem.Allocator, name: []const u8, args: []const u8, ok: bool) ![]const u8 {
var aw: std.Io.Writer.Allocating = .init(arena);
const w = &aw.writer;
const bullet_color = if (ok) ansi.green else ansi.red;
try w.print(bullet_line_fmt, .{ bullet_color, ansi.reset, ansi.dim, name, ansi.reset, args });
return aw.written();
std.debug.print("{s}●{s} ", .{ bullet, ansi.reset });
std.debug.print(fmt, args);
std.debug.print("\n", .{});
}
const completion_buf_len = 512;