From cf65a00a8fe0fd40580a243add27528127d6a18e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A0=20Arrufat?= Date: Sun, 31 May 2026 17:07:42 +0200 Subject: [PATCH] extract: add declarative follow option Allows fetching sub-pages per row and resolving nested fields against them. Supports string templates with sibling placeholders (e.g., `{id}`) and element-specs. Updates the JS walker to be async. --- docs/agent.md | 63 ++++++++++++++++++++++++------------ src/browser/tools.zig | 50 ++++++++++++++++++++++------- src/mcp/tools.zig | 75 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 156 insertions(+), 32 deletions(-) diff --git a/docs/agent.md b/docs/agent.md index 193b8dbe..b5945371 100644 --- a/docs/agent.md +++ b/docs/agent.md @@ -144,6 +144,15 @@ as a single JSON object. Supported value forms: - Add `"limit": N` inside any array's object spec to cap matches at N (works for text, attribute, and `fields` shapes — e.g. `[{"selector": ".story .title", "limit": 5}]` for top 5 titles). +- Add `"follow": <target>` to a spec to fetch a per-row sub-page and resolve + the spec's `selector`/`limit`/`fields` against *that* document instead + of the current element — a declarative "scrape a list, then visit each + row." `<target>` is either a string template whose `{name}` placeholders + fill from sibling fields on the same row (`"/item?id={id}"`), or an + element-spec read off the row (`{"selector": "a.comments", "attr": + "href"}`). Fetches resolve relative to the current page and run + sequentially; a failed fetch yields `null` (or `[]`) for that field. + See the worked example below. Use `/extract '''…'''` (or `"""…"""`) to spread a schema across multiple lines. The schema is parsed in Zig before the page-side walker runs, @@ -192,32 +201,46 @@ the end of the call. Adding a key (`lp.x = …`), updating a nested value update — even after a navigation, because the store lives Session-side, not on the page. 
-**Async eval.** Top-level `await` works directly — the body runs as an -async function, so use `return` to produce a value. `runEval` pumps the -event loop until it settles, then surfaces the resolved value (or the -rejection as an error). Combined with the bridge this lets a single -`/eval` do an async `fetch` loop over `lp.*` data: +**List → detail with `follow`.** A common scrape captures a list, then +visits each row for more data. `/extract`'s `follow` does that in one +declarative call — no `lp.*` round-trip, no hand-written loop. The HN +front page plus the top comments of each story: ```pandascript -/eval ''' -for (const s of lp.front.stories) { - const html = await fetch('/item?id=' + s.id).then(r => r.text()); - const doc = new DOMParser().parseFromString(html, 'text/html'); - s.comments = [...doc.querySelectorAll('tr.athing.comtr')].slice(0, 3) - .map(r => r.querySelector('.commtext')?.textContent.trim()) - .filter(Boolean); -} -''' +/goto 'https://news.ycombinator.com/' -/eval ''' -lp.front.stories +/extract ''' +{ + "stories": [{ + "selector": "tr.athing", + "limit": 5, + "fields": { + "id": {"attr": "id"}, + "title": ".titleline > a", + "comments": [{ + "follow": "/item?id={id}", + "selector": "tr.athing.comtr:has(td.ind img[width=\"0\"]):has(.commtext)", + "limit": 3, + "fields": {"author": ".hnuser", "text": ".commtext"} + }] + } + }] +} ''' ``` -A body with no explicit `return` resolves to `undefined`, which the eval -treats as silent — so the loop above prints nothing. The final `/eval` -yields the array, which lands on stdout as JSON: objects and arrays are -serialized automatically, so no `JSON.stringify` is needed. +`{id}` fills from each story's `id` field; the walker fetches +`/item?id=<id>`, parses it, and resolves the inner `selector`/`fields` +against that page. The whole nested result prints to stdout as one JSON +object. 
+ +**Async eval.** When a scrape needs logic `follow` can't express, `/eval` +is the escape hatch: top-level `await` works directly — the body runs as +an async function, so use `return` to produce a value. `runEval` pumps +the event loop until it settles, then surfaces the resolved value (or the +rejection as an error). A body with no explicit `return` resolves to +`undefined`, which the eval treats as silent. Returned objects and arrays +are serialized to JSON automatically, so no `JSON.stringify` is needed. The store is **script-run scoped**: it's bound to the Session that runs the script, and goes away when that Session does. There is no diff --git a/src/browser/tools.zig b/src/browser/tools.zig index d005e22f..f52a438c 100644 --- a/src/browser/tools.zig +++ b/src/browser/tools.zig @@ -218,6 +218,7 @@ pub const Tool = enum { \\ [{"selector":"","attr":""}] → every match's attribute (string[]) \\ [{"selector":"","fields":{…}}] → array of objects, fields resolved relative to each match \\ add `"limit": N` inside any array's object spec to cap matches at N (works for text, attr, and fields shapes) + \\ add `"follow": ` to a spec to fetch a per-row sub-page and resolve `selector`/`limit`/`fields` against it instead of the current element. `` is either a string template — `{name}` placeholders fill from sibling fields on the same row, e.g. "/item?id={id}" — or an element-spec `{"selector":"","attr":"href"}` read off the row. Fetches run sequentially; a failed fetch yields null/[] for that field. 
\\ \\Examples (schema → result): \\ {"karma": "#karma"} → {"karma":"42"} @@ -225,6 +226,7 @@ pub const Tool = enum { \\ {"top3": [{"selector":".story .title","limit":3}]} → {"top3":["A","B","C"]} \\ {"links": [{"selector":"a.title","attr":"href"}]} → {"links":["/a","/b"]} \\ {"stories": [{"selector":".athing","fields":{"title":".titleline","rank":".rank"}}]} → {"stories":[{"title":"Foo","rank":"1"}]} + \\ {"stories": [{"selector":"tr.athing","limit":5,"fields":{"id":{"attr":"id"},"comments":[{"follow":"/item?id={id}","selector":".comment","limit":3,"fields":{"author":".hnuser","text":".commtext"}}]}}]} → {"stories":[{"id":"1","comments":[{"author":"foo","text":"hi"}]}]} , .summary = "Extract structured data via a JSON schema", .input_schema = minify( @@ -682,19 +684,37 @@ pub fn extract( } // The schema literal is spliced between prefix and suffix verbatim — a format -// string here would collide with the `{`/`}` throughout the walker body. +// string here would collide with the `{`/`}` throughout the walker body. The +// walker is async (and returns a Promise that `runEval` pumps) because a +// spec's `follow` key fetches a per-row sub-page; the `await` chain resolves +// synchronously when no spec follows, so non-follow schemas pay no I/O. const schema_walker_prefix = - \\JSON.stringify((function(schema){ - \\ function valueOf(m, inner){ + \\(async function(schema){ + \\ function fill(t, ctx){ return t.replace(/\{(\w+)\}/g, function(_, n){ return (ctx && ctx[n] != null) ? ctx[n] : ''; }); } + \\ function hasFollow(v){ const s = Array.isArray(v) ? v[0] : v; return s && typeof s === 'object' && s.follow !== undefined; } + \\ async function followRoot(el, follow, ctx){ + \\ const url = (typeof follow === 'string') ? 
fill(follow, ctx) : await ext(el, follow); + \\ if (!url) return null; + \\ try { + \\ const html = await fetch(url).then(function(r){ return r.text(); }); + \\ return new DOMParser().parseFromString(html, 'text/html'); + \\ } catch (e) { return null; } + \\ } + \\ async function rootFor(el, spec, ctx){ return spec.follow !== undefined ? await followRoot(el, spec.follow, ctx) : el; } + \\ async function valueOf(m, inner){ \\ if (inner.fields) { \\ const r = {}; - \\ for (const k in inner.fields) r[k] = ext(m, inner.fields[k]); + \\ const deferred = []; + \\ for (const k in inner.fields) { + \\ if (hasFollow(inner.fields[k])) deferred.push(k); else r[k] = await ext(m, inner.fields[k]); + \\ } + \\ for (const k of deferred) r[k] = await ext(m, inner.fields[k], r); \\ return r; \\ } \\ if (inner.attr) return m.getAttribute(inner.attr); \\ return m.textContent.trim(); \\ } - \\ function ext(el, v){ + \\ async function ext(el, v, ctx){ \\ if (typeof v === 'string') { \\ if (v === '') return el.textContent.trim(); \\ const m = el.querySelector(v); @@ -702,26 +722,32 @@ const schema_walker_prefix = \\ } \\ if (Array.isArray(v)) { \\ const inner = typeof v[0] === 'string' ? { selector: v[0] } : v[0]; - \\ let matches = Array.from(el.querySelectorAll(inner.selector)); + \\ const root = await rootFor(el, inner, ctx); + \\ if (!root) return []; + \\ let matches = Array.from(root.querySelectorAll(inner.selector)); \\ if (typeof inner.limit === 'number') matches = matches.slice(0, inner.limit); - \\ return matches.map(function(m){ return valueOf(m, inner); }); + \\ const acc = []; + \\ for (const m of matches) acc.push(await valueOf(m, inner)); + \\ return acc; \\ } - \\ const t = v.selector ? el.querySelector(v.selector) : el; + \\ const root = await rootFor(el, v, ctx); + \\ if (!root) return null; + \\ const t = v.selector ? 
root.querySelector(v.selector) : root; \\ if (!t) return null; - \\ return valueOf(t, v); + \\ return await valueOf(t, v); \\ } \\ const out = {}; \\ let any = false; \\ for (const k in schema) { - \\ out[k] = ext(document, schema[k]); + \\ out[k] = await ext(document, schema[k]); \\ const v = out[k]; \\ if (v !== null && !(Array.isArray(v) && v.length === 0)) any = true; \\ } \\ if (!any) throw new Error("extract: no schema selector matched any element — inspect the page with tree/markdown and retry with corrected selectors"); - \\ return out; + \\ return JSON.stringify(out); \\})( ; -const schema_walker_suffix = "))"; +const schema_walker_suffix = ")"; fn execGoto(arena: std.mem.Allocator, session: *lp.Session, registry: *CDPNode.Registry, arguments: ?std.json.Value) ToolError![]const u8 { const args = try parseArgs(GotoParams, arena, arguments); diff --git a/src/mcp/tools.zig b/src/mcp/tools.zig index 89104fd9..a4f5b1eb 100644 --- a/src/mcp/tools.zig +++ b/src/mcp/tools.zig @@ -954,6 +954,81 @@ test "MCP - extract: save= exposes the result as lp." 
{ } }, out.written()); } +test "MCP - extract: follow string template fetches a sub-page and nests fields" { + defer testing.reset(); + var out: std.io.Writer.Allocating = .init(testing.arena_allocator); + const server = try testLoadPage("http://localhost:9582/src/browser/tests/extract_follow_list.html", &out.writer); + defer server.deinit(); + + const msg = + \\{ + \\ "jsonrpc": "2.0", + \\ "id": 1, + \\ "method": "tools/call", + \\ "params": { + \\ "name": "extract", + \\ "arguments": { + \\ "schema": "{\"rows\":[{\"selector\":\"tr.row\",\"fields\":{\"id\":{\"attr\":\"id\"},\"comments\":[{\"follow\":\"/src/browser/tests/extract_follow_item.html?id={id}\",\"selector\":\".comment\",\"fields\":{\"author\":\".author\",\"text\":\".text\"}}]}}]}" + \\ } + \\ } + \\} + ; + try router.handleMessage(server, testing.arena_allocator, msg); + try testing.expectJson(.{ .id = 1, .result = .{ + .content = &.{.{ .type = "text", .text = "{\"rows\":[{\"id\":\"1\",\"comments\":[{\"author\":\"alice\",\"text\":\"hello\"},{\"author\":\"bob\",\"text\":\"world\"}]},{\"id\":\"2\",\"comments\":[{\"author\":\"alice\",\"text\":\"hello\"},{\"author\":\"bob\",\"text\":\"world\"}]}]}" }}, + } }, out.written()); +} + +test "MCP - extract: follow href-spec resolves a link off the row and nests fields" { + defer testing.reset(); + var out: std.io.Writer.Allocating = .init(testing.arena_allocator); + const server = try testLoadPage("http://localhost:9582/src/browser/tests/extract_follow_list.html", &out.writer); + defer server.deinit(); + + const msg = + \\{ + \\ "jsonrpc": "2.0", + \\ "id": 1, + \\ "method": "tools/call", + \\ "params": { + \\ "name": "extract", + \\ "arguments": { + \\ "schema": "{\"rows\":[{\"selector\":\"tr.row\",\"fields\":{\"comments\":[{\"follow\":{\"selector\":\"a.link\",\"attr\":\"href\"},\"selector\":\".comment\",\"limit\":1,\"fields\":{\"author\":\".author\"}}]}}]}" + \\ } + \\ } + \\} + ; + try router.handleMessage(server, testing.arena_allocator, msg); + try 
testing.expectJson(.{ .id = 1, .result = .{ + .content = &.{.{ .type = "text", .text = "{\"rows\":[{\"comments\":[{\"author\":\"alice\"}]},{\"comments\":[{\"author\":\"alice\"}]}]}" }}, + } }, out.written()); +} + +test "MCP - extract: follow to a missing page yields [] without failing the extract" { + defer testing.reset(); + var out: std.io.Writer.Allocating = .init(testing.arena_allocator); + const server = try testLoadPage("http://localhost:9582/src/browser/tests/extract_follow_list.html", &out.writer); + defer server.deinit(); + + const msg = + \\{ + \\ "jsonrpc": "2.0", + \\ "id": 1, + \\ "method": "tools/call", + \\ "params": { + \\ "name": "extract", + \\ "arguments": { + \\ "schema": "{\"rows\":[{\"selector\":\"tr.row\",\"fields\":{\"id\":{\"attr\":\"id\"},\"comments\":[{\"follow\":\"/src/browser/tests/does_not_exist.html\",\"selector\":\".comment\",\"fields\":{\"author\":\".author\"}}]}}]}" + \\ } + \\ } + \\} + ; + try router.handleMessage(server, testing.arena_allocator, msg); + try testing.expectJson(.{ .id = 1, .result = .{ + .content = &.{.{ .type = "text", .text = "{\"rows\":[{\"id\":\"1\",\"comments\":[]},{\"id\":\"2\",\"comments\":[]}]}" }}, + } }, out.written()); +} + test "MCP - eval: Promise.resolve return value is awaited" { defer testing.reset(); var out: std.io.Writer.Allocating = .init(testing.arena_allocator);