extract: add declarative follow option

Allows fetching sub-pages per row and resolving nested fields against them. Supports string templates with sibling placeholders (e.g., `{id}`) and element-specs. Updates the JS walker to be async.
2026-07-30 09:16:07 -04:00 · 2026-05-31 17:07:42 +02:00
parent 85facd2fc7
commit cf65a00a8f
3 changed files with 156 additions and 32 deletions
--- a/docs/agent.md
+++ b/docs/agent.md
@@ -144,6 +144,15 @@ as a single JSON object. Supported value forms:
 - Add `"limit": N` inside any array's object spec to cap matches at N
  (works for text, attribute, and `fields` shapes — e.g.
  `[{"selector": ".story .title", "limit": 5}]` for top 5 titles).
+- Add `"follow": <url>` to a spec to fetch a per-row sub-page and resolve
+  the spec's `selector`/`limit`/`fields` against *that* document instead
+  of the current element — a declarative "scrape a list, then visit each
+  row." `<url>` is either a string template whose `{name}` placeholders
+  fill from sibling fields on the same row (`"/item?id={id}"`), or an
+  element-spec read off the row (`{"selector": "a.comments", "attr":
+  "href"}`). Fetches resolve relative to the current page and run
+  sequentially; a failed fetch yields `null` for that field (`[]` when
+  the spec is an array). See the worked example below.

 Use `/extract '''…'''` (or `"""…"""`) to spread a schema across multiple
 lines. The schema is parsed in Zig before the page-side walker runs,
@@ -192,32 +201,46 @@ the end of the call. Adding a key (`lp.x = …`), updating a nested value
 update — even after a navigation, because the store lives Session-side,
 not on the page.

-**Async eval.** Top-level `await` works directly — the body runs as an
-async function, so use `return` to produce a value. `runEval` pumps the
-event loop until it settles, then surfaces the resolved value (or the
-rejection as an error). Combined with the bridge this lets a single
-`/eval` do an async `fetch` loop over `lp.*` data:
+**List → detail with `follow`.** A common scrape captures a list, then
+visits each row for more data. `/extract`'s `follow` does that in one
+declarative call — no `lp.*` round-trip, no hand-written loop. For
+example, the first five HN stories with up to three comments each:

 ```pandascript
-/eval '''
-for (const s of lp.front.stories) {
-  const html = await fetch('/item?id=' + s.id).then(r => r.text());
-  const doc = new DOMParser().parseFromString(html, 'text/html');
-  s.comments = [...doc.querySelectorAll('tr.athing.comtr')].slice(0, 3)
-    .map(r => r.querySelector('.commtext')?.textContent.trim())
-    .filter(Boolean);
-}
-'''
+/goto 'https://news.ycombinator.com/'

-/eval '''
-lp.front.stories
+/extract '''
+{
+  "stories": [{
+    "selector": "tr.athing",
+    "limit": 5,
+    "fields": {
+      "id":    {"attr": "id"},
+      "title": ".titleline > a",
+      "comments": [{
+        "follow": "/item?id={id}",
+        "selector": "tr.athing.comtr:has(td.ind img[width=\"0\"]):has(.commtext)",
+        "limit": 3,
+        "fields": {"author": ".hnuser", "text": ".commtext"}
+      }]
+    }
+  }]
+}
 '''
 ```

-A body with no explicit `return` resolves to `undefined`, which the eval
-treats as silent — so the loop above prints nothing. The final `/eval`
-yields the array, which lands on stdout as JSON: objects and arrays are
-serialized automatically, so no `JSON.stringify` is needed.
+`{id}` fills from each story's `id` field; the walker fetches
+`/item?id=<id>`, parses it, and resolves the inner `selector`/`fields`
+against that page. The whole nested result prints to stdout as one JSON
+object.
+
+**Async eval.** When a scrape needs logic `follow` can't express, `/eval`
+is the escape hatch: top-level `await` works directly — the body runs as
+an async function, so use `return` to produce a value. `runEval` pumps
+the event loop until it settles, then surfaces the resolved value (or the
+rejection as an error). A body with no explicit `return` resolves to
+`undefined`, which the eval treats as silent. Returned objects and arrays
+are serialized to JSON automatically, so no `JSON.stringify` is needed.

 The store is **script-run scoped**: it's bound to the Session that runs
 the script, and goes away when that Session does. There is no
--- a/src/browser/tools.zig
+++ b/src/browser/tools.zig
@@ -218,6 +218,7 @@ pub const Tool = enum {
                \\  [{"selector":"<sel>","attr":"<name>"}] → every match's attribute (string[])
                \\  [{"selector":"<sel>","fields":{…}}]    → array of objects, fields resolved relative to each match
                \\  add `"limit": N` inside any array's object spec to cap matches at N (works for text, attr, and fields shapes)
+                \\  add `"follow": <url>` to a spec to fetch a per-row sub-page and resolve `selector`/`limit`/`fields` against it instead of the current element. `<url>` is either a string template — `{name}` placeholders fill from sibling fields on the same row, e.g. "/item?id={id}" — or an element-spec `{"selector":"<sel>","attr":"href"}` read off the row. Fetches run sequentially; a failed fetch yields null/[] for that field.
                \\
                \\Examples (schema → result):
                \\  {"karma": "#karma"} → {"karma":"42"}
@@ -225,6 +226,7 @@ pub const Tool = enum {
                \\  {"top3": [{"selector":".story .title","limit":3}]} → {"top3":["A","B","C"]}
                \\  {"links": [{"selector":"a.title","attr":"href"}]} → {"links":["/a","/b"]}
                \\  {"stories": [{"selector":".athing","fields":{"title":".titleline","rank":".rank"}}]} → {"stories":[{"title":"Foo","rank":"1"}]}
+                \\  {"stories": [{"selector":"tr.athing","limit":5,"fields":{"id":{"attr":"id"},"comments":[{"follow":"/item?id={id}","selector":".comment","limit":3,"fields":{"author":".hnuser","text":".commtext"}}]}}]} → {"stories":[{"id":"1","comments":[{"author":"foo","text":"hi"}]}]}
                ,
                .summary = "Extract structured data via a JSON schema",
                .input_schema = minify(
@@ -682,19 +684,37 @@ pub fn extract(
 }

 // The schema literal is spliced between prefix and suffix verbatim — a format
-// string here would collide with the `{`/`}` throughout the walker body.
+// string here would collide with the `{`/`}` throughout the walker body. The
+// walker is async (and returns a Promise<string> that `runEval` pumps) because a
+// spec's `follow` key fetches a per-row sub-page; when no spec follows, the
+// `await` chain settles on microtasks alone, so non-follow schemas pay no I/O.
 const schema_walker_prefix =
-    \\JSON.stringify((function(schema){
-    \\  function valueOf(m, inner){
+    \\(async function(schema){
+    \\  function fill(t, ctx){ return t.replace(/\{(\w+)\}/g, function(_, n){ return (ctx && ctx[n] != null) ? ctx[n] : ''; }); }
+    \\  function hasFollow(v){ const s = Array.isArray(v) ? v[0] : v; return s && typeof s === 'object' && s.follow !== undefined; }
+    \\  async function followRoot(el, follow, ctx){
+    \\    const url = (typeof follow === 'string') ? fill(follow, ctx) : await ext(el, follow);
+    \\    if (!url) return null;
+    \\    try {
+    \\      const html = await fetch(url).then(function(r){ return r.text(); });
+    \\      return new DOMParser().parseFromString(html, 'text/html');
+    \\    } catch (e) { return null; }
+    \\  }
+    \\  async function rootFor(el, spec, ctx){ return spec.follow !== undefined ? await followRoot(el, spec.follow, ctx) : el; }
+    \\  async function valueOf(m, inner){
    \\    if (inner.fields) {
    \\      const r = {};
-    \\      for (const k in inner.fields) r[k] = ext(m, inner.fields[k]);
+    \\      const deferred = [];
+    \\      for (const k in inner.fields) {
+    \\        if (hasFollow(inner.fields[k])) deferred.push(k); else r[k] = await ext(m, inner.fields[k]);
+    \\      }
+    \\      for (const k of deferred) r[k] = await ext(m, inner.fields[k], r);
    \\      return r;
    \\    }
    \\    if (inner.attr) return m.getAttribute(inner.attr);
    \\    return m.textContent.trim();
    \\  }
-    \\  function ext(el, v){
+    \\  async function ext(el, v, ctx){
    \\    if (typeof v === 'string') {
    \\      if (v === '') return el.textContent.trim();
    \\      const m = el.querySelector(v);
@@ -702,26 +722,32 @@ const schema_walker_prefix =
    \\    }
    \\    if (Array.isArray(v)) {
    \\      const inner = typeof v[0] === 'string' ? { selector: v[0] } : v[0];
-    \\      let matches = Array.from(el.querySelectorAll(inner.selector));
+    \\      const root = await rootFor(el, inner, ctx);
+    \\      if (!root) return [];
+    \\      let matches = Array.from(root.querySelectorAll(inner.selector));
    \\      if (typeof inner.limit === 'number') matches = matches.slice(0, inner.limit);
-    \\      return matches.map(function(m){ return valueOf(m, inner); });
+    \\      const acc = [];
+    \\      for (const m of matches) acc.push(await valueOf(m, inner));
+    \\      return acc;
    \\    }
-    \\    const t = v.selector ? el.querySelector(v.selector) : el;
+    \\    const root = await rootFor(el, v, ctx);
+    \\    if (!root) return null;
+    \\    const t = v.selector ? root.querySelector(v.selector) : root;
    \\    if (!t) return null;
-    \\    return valueOf(t, v);
+    \\    return await valueOf(t, v);
    \\  }
    \\  const out = {};
    \\  let any = false;
    \\  for (const k in schema) {
-    \\    out[k] = ext(document, schema[k]);
+    \\    out[k] = await ext(document, schema[k]);
    \\    const v = out[k];
    \\    if (v !== null && !(Array.isArray(v) && v.length === 0)) any = true;
    \\  }
    \\  if (!any) throw new Error("extract: no schema selector matched any element — inspect the page with tree/markdown and retry with corrected selectors");
-    \\  return out;
+    \\  return JSON.stringify(out);
    \\})(
 ;
-const schema_walker_suffix = "))";
+const schema_walker_suffix = ")";

 fn execGoto(arena: std.mem.Allocator, session: *lp.Session, registry: *CDPNode.Registry, arguments: ?std.json.Value) ToolError![]const u8 {
    const args = try parseArgs(GotoParams, arena, arguments);
--- a/src/mcp/tools.zig
+++ b/src/mcp/tools.zig
@@ -954,6 +954,81 @@ test "MCP - extract: save= exposes the result as lp.<name>" {
    } }, out.written());
 }

+test "MCP - extract: follow string template fetches a sub-page and nests fields" {
+    defer testing.reset();
+    var out: std.io.Writer.Allocating = .init(testing.arena_allocator);
+    const server = try testLoadPage("http://localhost:9582/src/browser/tests/extract_follow_list.html", &out.writer);
+    defer server.deinit();
+
+    const msg =
+        \\{
+        \\  "jsonrpc": "2.0",
+        \\  "id": 1,
+        \\  "method": "tools/call",
+        \\  "params": {
+        \\    "name": "extract",
+        \\    "arguments": {
+        \\      "schema": "{\"rows\":[{\"selector\":\"tr.row\",\"fields\":{\"id\":{\"attr\":\"id\"},\"comments\":[{\"follow\":\"/src/browser/tests/extract_follow_item.html?id={id}\",\"selector\":\".comment\",\"fields\":{\"author\":\".author\",\"text\":\".text\"}}]}}]}"
+        \\    }
+        \\  }
+        \\}
+    ;
+    try router.handleMessage(server, testing.arena_allocator, msg);
+    try testing.expectJson(.{ .id = 1, .result = .{
+        .content = &.{.{ .type = "text", .text = "{\"rows\":[{\"id\":\"1\",\"comments\":[{\"author\":\"alice\",\"text\":\"hello\"},{\"author\":\"bob\",\"text\":\"world\"}]},{\"id\":\"2\",\"comments\":[{\"author\":\"alice\",\"text\":\"hello\"},{\"author\":\"bob\",\"text\":\"world\"}]}]}" }},
+    } }, out.written());
+}
+
+test "MCP - extract: follow href-spec resolves a link off the row and nests fields" {
+    defer testing.reset();
+    var out: std.io.Writer.Allocating = .init(testing.arena_allocator);
+    const server = try testLoadPage("http://localhost:9582/src/browser/tests/extract_follow_list.html", &out.writer);
+    defer server.deinit();
+
+    const msg =
+        \\{
+        \\  "jsonrpc": "2.0",
+        \\  "id": 1,
+        \\  "method": "tools/call",
+        \\  "params": {
+        \\    "name": "extract",
+        \\    "arguments": {
+        \\      "schema": "{\"rows\":[{\"selector\":\"tr.row\",\"fields\":{\"comments\":[{\"follow\":{\"selector\":\"a.link\",\"attr\":\"href\"},\"selector\":\".comment\",\"limit\":1,\"fields\":{\"author\":\".author\"}}]}}]}"
+        \\    }
+        \\  }
+        \\}
+    ;
+    try router.handleMessage(server, testing.arena_allocator, msg);
+    try testing.expectJson(.{ .id = 1, .result = .{
+        .content = &.{.{ .type = "text", .text = "{\"rows\":[{\"comments\":[{\"author\":\"alice\"}]},{\"comments\":[{\"author\":\"alice\"}]}]}" }},
+    } }, out.written());
+}
+
+test "MCP - extract: follow to a missing page yields [] without failing the extract" {
+    defer testing.reset();
+    var out: std.io.Writer.Allocating = .init(testing.arena_allocator);
+    const server = try testLoadPage("http://localhost:9582/src/browser/tests/extract_follow_list.html", &out.writer);
+    defer server.deinit();
+
+    const msg =
+        \\{
+        \\  "jsonrpc": "2.0",
+        \\  "id": 1,
+        \\  "method": "tools/call",
+        \\  "params": {
+        \\    "name": "extract",
+        \\    "arguments": {
+        \\      "schema": "{\"rows\":[{\"selector\":\"tr.row\",\"fields\":{\"id\":{\"attr\":\"id\"},\"comments\":[{\"follow\":\"/src/browser/tests/does_not_exist.html\",\"selector\":\".comment\",\"fields\":{\"author\":\".author\"}}]}}]}"
+        \\    }
+        \\  }
+        \\}
+    ;
+    try router.handleMessage(server, testing.arena_allocator, msg);
+    try testing.expectJson(.{ .id = 1, .result = .{
+        .content = &.{.{ .type = "text", .text = "{\"rows\":[{\"id\":\"1\",\"comments\":[]},{\"id\":\"2\",\"comments\":[]}]}" }},
+    } }, out.written());
+}
+
 test "MCP - eval: Promise.resolve return value is awaited" {
    defer testing.reset();
    var out: std.io.Writer.Allocating = .init(testing.arena_allocator);