extract: add declarative follow option

Allows fetching sub-pages per row and resolving nested fields against
them. Supports string templates with sibling placeholders (e.g., `{id}`)
and element-specs. Updates the JS walker to be async.
This commit is contained in:
Adrià Arrufat
2026-05-31 17:07:42 +02:00
parent 85facd2fc7
commit cf65a00a8f
3 changed files with 156 additions and 32 deletions

View File

@@ -144,6 +144,15 @@ as a single JSON object. Supported value forms:
- Add `"limit": N` inside any array's object spec to cap matches at N
(works for text, attribute, and `fields` shapes — e.g.
`[{"selector": ".story .title", "limit": 5}]` for top 5 titles).
- Add `"follow": <url>` to a spec to fetch a per-row sub-page and resolve
the spec's `selector`/`limit`/`fields` against *that* document instead
of the current element — a declarative "scrape a list, then visit each
row." `<url>` is either a string template whose `{name}` placeholders
fill from sibling fields on the same row (`"/item?id={id}"`), or an
element-spec read off the row (`{"selector": "a.comments", "attr":
"href"}`). Relative URLs resolve against the current page, and fetches
run sequentially; a failed fetch yields `null` (or `[]` when the spec is
an array) for that field.
See the worked example below.
Use `/extract '''…'''` (or `"""…"""`) to spread a schema across multiple
lines. The schema is parsed in Zig before the page-side walker runs,
@@ -192,32 +201,46 @@ the end of the call. Adding a key (`lp.x = …`), updating a nested value
update — even after a navigation, because the store lives Session-side,
not on the page.
**Async eval.** Top-level `await` works directly — the body runs as an
async function, so use `return` to produce a value. `runEval` pumps the
event loop until it settles, then surfaces the resolved value (or the
rejection as an error). Combined with the bridge this lets a single
`/eval` do an async `fetch` loop over `lp.*` data:
**List → detail with `follow`.** A common scrape captures a list, then
visits each row for more data. `/extract`'s `follow` does that in one
declarative call — no `lp.*` round-trip, no hand-written loop. The HN
front page plus the top comments of each story:
```pandascript
/eval '''
for (const s of lp.front.stories) {
const html = await fetch('/item?id=' + s.id).then(r => r.text());
const doc = new DOMParser().parseFromString(html, 'text/html');
s.comments = [...doc.querySelectorAll('tr.athing.comtr')].slice(0, 3)
.map(r => r.querySelector('.commtext')?.textContent.trim())
.filter(Boolean);
}
'''
/goto 'https://news.ycombinator.com/'
/eval '''
lp.front.stories
/extract '''
{
"stories": [{
"selector": "tr.athing",
"limit": 5,
"fields": {
"id": {"attr": "id"},
"title": ".titleline > a",
"comments": [{
"follow": "/item?id={id}",
"selector": "tr.athing.comtr:has(td.ind img[width=\"0\"]):has(.commtext)",
"limit": 3,
"fields": {"author": ".hnuser", "text": ".commtext"}
}]
}
}]
}
'''
```
A body with no explicit `return` resolves to `undefined`, which the eval
treats as silent — so the loop above prints nothing. The final `/eval`
yields the array, which lands on stdout as JSON: objects and arrays are
serialized automatically, so no `JSON.stringify` is needed.
`{id}` fills from each story's `id` field; the walker fetches
`/item?id=<id>`, parses it, and resolves the inner `selector`/`fields`
against that page. The whole nested result prints to stdout as one JSON
object.
**Async eval.** When a scrape needs logic `follow` can't express, `/eval`
is the escape hatch: top-level `await` works directly — the body runs as
an async function, so use `return` to produce a value. `runEval` pumps
the event loop until it settles, then surfaces the resolved value (or the
rejection as an error). A body with no explicit `return` resolves to
`undefined`, which the eval treats as silent. Returned objects and arrays
are serialized to JSON automatically, so no `JSON.stringify` is needed.
The store is **script-run scoped**: it's bound to the Session that runs
the script, and goes away when that Session does. There is no

View File

@@ -218,6 +218,7 @@ pub const Tool = enum {
\\ [{"selector":"<sel>","attr":"<name>"}] → every match's attribute (string[])
\\ [{"selector":"<sel>","fields":{…}}] → array of objects, fields resolved relative to each match
\\ add `"limit": N` inside any array's object spec to cap matches at N (works for text, attr, and fields shapes)
\\ add `"follow": <url>` to a spec to fetch a per-row sub-page and resolve `selector`/`limit`/`fields` against it instead of the current element. `<url>` is either a string template — `{name}` placeholders fill from sibling fields on the same row, e.g. "/item?id={id}" — or an element-spec `{"selector":"<sel>","attr":"href"}` read off the row. Fetches run sequentially; a failed fetch yields null/[] for that field.
\\
\\Examples (schema → result):
\\ {"karma": "#karma"} → {"karma":"42"}
@@ -225,6 +226,7 @@ pub const Tool = enum {
\\ {"top3": [{"selector":".story .title","limit":3}]} → {"top3":["A","B","C"]}
\\ {"links": [{"selector":"a.title","attr":"href"}]} → {"links":["/a","/b"]}
\\ {"stories": [{"selector":".athing","fields":{"title":".titleline","rank":".rank"}}]} → {"stories":[{"title":"Foo","rank":"1"}]}
\\ {"stories": [{"selector":"tr.athing","limit":5,"fields":{"id":{"attr":"id"},"comments":[{"follow":"/item?id={id}","selector":".comment","limit":3,"fields":{"author":".hnuser","text":".commtext"}}]}}]} → {"stories":[{"id":"1","comments":[{"author":"foo","text":"hi"}]}]}
,
.summary = "Extract structured data via a JSON schema",
.input_schema = minify(
@@ -682,19 +684,37 @@ pub fn extract(
}
// The schema literal is spliced between prefix and suffix verbatim — a format
// string here would collide with the `{`/`}` throughout the walker body.
// string here would collide with the `{`/`}` throughout the walker body. The
// walker is async (and returns a Promise<string> that `runEval` pumps) because a
// spec's `follow` key fetches a per-row sub-page; when no spec follows, the
// `await` chain settles on microtasks alone, so non-follow schemas pay no I/O.
const schema_walker_prefix =
\\JSON.stringify((function(schema){
\\ function valueOf(m, inner){
\\(async function(schema){
\\ function fill(t, ctx){ return t.replace(/\{(\w+)\}/g, function(_, n){ return (ctx && ctx[n] != null) ? ctx[n] : ''; }); }
\\ function hasFollow(v){ const s = Array.isArray(v) ? v[0] : v; return s && typeof s === 'object' && s.follow !== undefined; }
\\ async function followRoot(el, follow, ctx){
\\ const url = (typeof follow === 'string') ? fill(follow, ctx) : await ext(el, follow);
\\ if (!url) return null;
\\ try {
\\ const html = await fetch(url).then(function(r){ return r.text(); });
\\ return new DOMParser().parseFromString(html, 'text/html');
\\ } catch (e) { return null; }
\\ }
\\ async function rootFor(el, spec, ctx){ return spec.follow !== undefined ? await followRoot(el, spec.follow, ctx) : el; }
\\ async function valueOf(m, inner){
\\ if (inner.fields) {
\\ const r = {};
\\ for (const k in inner.fields) r[k] = ext(m, inner.fields[k]);
\\ const deferred = [];
\\ for (const k in inner.fields) {
\\ if (hasFollow(inner.fields[k])) deferred.push(k); else r[k] = await ext(m, inner.fields[k]);
\\ }
\\ for (const k of deferred) r[k] = await ext(m, inner.fields[k], r);
\\ return r;
\\ }
\\ if (inner.attr) return m.getAttribute(inner.attr);
\\ return m.textContent.trim();
\\ }
\\ function ext(el, v){
\\ async function ext(el, v, ctx){
\\ if (typeof v === 'string') {
\\ if (v === '') return el.textContent.trim();
\\ const m = el.querySelector(v);
@@ -702,26 +722,32 @@ const schema_walker_prefix =
\\ }
\\ if (Array.isArray(v)) {
\\ const inner = typeof v[0] === 'string' ? { selector: v[0] } : v[0];
\\ let matches = Array.from(el.querySelectorAll(inner.selector));
\\ const root = await rootFor(el, inner, ctx);
\\ if (!root) return [];
\\ let matches = Array.from(root.querySelectorAll(inner.selector));
\\ if (typeof inner.limit === 'number') matches = matches.slice(0, inner.limit);
\\ return matches.map(function(m){ return valueOf(m, inner); });
\\ const acc = [];
\\ for (const m of matches) acc.push(await valueOf(m, inner));
\\ return acc;
\\ }
\\ const t = v.selector ? el.querySelector(v.selector) : el;
\\ const root = await rootFor(el, v, ctx);
\\ if (!root) return null;
\\ const t = v.selector ? root.querySelector(v.selector) : root;
\\ if (!t) return null;
\\ return valueOf(t, v);
\\ return await valueOf(t, v);
\\ }
\\ const out = {};
\\ let any = false;
\\ for (const k in schema) {
\\ out[k] = ext(document, schema[k]);
\\ out[k] = await ext(document, schema[k]);
\\ const v = out[k];
\\ if (v !== null && !(Array.isArray(v) && v.length === 0)) any = true;
\\ }
\\ if (!any) throw new Error("extract: no schema selector matched any element — inspect the page with tree/markdown and retry with corrected selectors");
\\ return out;
\\ return JSON.stringify(out);
\\})(
;
const schema_walker_suffix = "))";
const schema_walker_suffix = ")";
fn execGoto(arena: std.mem.Allocator, session: *lp.Session, registry: *CDPNode.Registry, arguments: ?std.json.Value) ToolError![]const u8 {
const args = try parseArgs(GotoParams, arena, arguments);

View File

@@ -954,6 +954,81 @@ test "MCP - extract: save= exposes the result as lp.<name>" {
} }, out.written());
}
// Exercises `follow` given as a string template. Each `tr.row` match contributes
// its `id` attribute, `{id}` is substituted into the item URL, and the nested
// `.comment` matches (author/text) are resolved against the fetched sub-page
// instead of the list page.
// The expected output has two rows (ids "1" and "2") with identical comments —
// presumably the item fixture is static and ignores the `id` query parameter;
// verify against extract_follow_item.html.
test "MCP - extract: follow string template fetches a sub-page and nests fields" {
defer testing.reset();
var out: std.io.Writer.Allocating = .init(testing.arena_allocator);
const server = try testLoadPage("http://localhost:9582/src/browser/tests/extract_follow_list.html", &out.writer);
defer server.deinit();
// JSON-RPC `tools/call` request for the `extract` tool; the schema is passed as
// a JSON-encoded string argument, hence the escaped quotes.
const msg =
\\{
\\ "jsonrpc": "2.0",
\\ "id": 1,
\\ "method": "tools/call",
\\ "params": {
\\ "name": "extract",
\\ "arguments": {
\\ "schema": "{\"rows\":[{\"selector\":\"tr.row\",\"fields\":{\"id\":{\"attr\":\"id\"},\"comments\":[{\"follow\":\"/src/browser/tests/extract_follow_item.html?id={id}\",\"selector\":\".comment\",\"fields\":{\"author\":\".author\",\"text\":\".text\"}}]}}]}"
\\ }
\\ }
\\}
;
try router.handleMessage(server, testing.arena_allocator, msg);
// The response carries a single text content item whose text is the serialized
// extract result: every row keeps its own `id` plus the comments read off the
// followed page.
try testing.expectJson(.{ .id = 1, .result = .{
.content = &.{.{ .type = "text", .text = "{\"rows\":[{\"id\":\"1\",\"comments\":[{\"author\":\"alice\",\"text\":\"hello\"},{\"author\":\"bob\",\"text\":\"world\"}]},{\"id\":\"2\",\"comments\":[{\"author\":\"alice\",\"text\":\"hello\"},{\"author\":\"bob\",\"text\":\"world\"}]}]}" }},
} }, out.written());
}
// Exercises `follow` given as an element-spec rather than a string template:
// the URL is read from the `href` attribute of the row's `a.link` element, so
// the schema needs no sibling `id` field. `"limit": 1` caps the nested
// `.comment` matches, which is why only the first author appears per row.
test "MCP - extract: follow href-spec resolves a link off the row and nests fields" {
defer testing.reset();
var out: std.io.Writer.Allocating = .init(testing.arena_allocator);
const server = try testLoadPage("http://localhost:9582/src/browser/tests/extract_follow_list.html", &out.writer);
defer server.deinit();
// JSON-RPC `tools/call` request for the `extract` tool; the schema is passed as
// a JSON-encoded string argument, hence the escaped quotes.
const msg =
\\{
\\ "jsonrpc": "2.0",
\\ "id": 1,
\\ "method": "tools/call",
\\ "params": {
\\ "name": "extract",
\\ "arguments": {
\\ "schema": "{\"rows\":[{\"selector\":\"tr.row\",\"fields\":{\"comments\":[{\"follow\":{\"selector\":\"a.link\",\"attr\":\"href\"},\"selector\":\".comment\",\"limit\":1,\"fields\":{\"author\":\".author\"}}]}}]}"
\\ }
\\ }
\\}
;
try router.handleMessage(server, testing.arena_allocator, msg);
// Two rows are expected, each with exactly one comment object holding only the
// requested `author` field.
try testing.expectJson(.{ .id = 1, .result = .{
.content = &.{.{ .type = "text", .text = "{\"rows\":[{\"comments\":[{\"author\":\"alice\"}]},{\"comments\":[{\"author\":\"alice\"}]}]}" }},
} }, out.written());
}
// Exercises the failure path of `follow`: the template points at a page named
// does_not_exist.html (no placeholders), and each row's `comments` is expected
// to come back as an empty array while the row's own `id` is still extracted,
// so the extract as a whole succeeds.
// NOTE(review): the walker returns a null root only when `fetch` itself
// rejects and does not inspect the response status. Whether this case is
// covered by a rejected fetch or by an error body that simply has no
// `.comment` matches is not visible from this test — confirm.
test "MCP - extract: follow to a missing page yields [] without failing the extract" {
defer testing.reset();
var out: std.io.Writer.Allocating = .init(testing.arena_allocator);
const server = try testLoadPage("http://localhost:9582/src/browser/tests/extract_follow_list.html", &out.writer);
defer server.deinit();
// JSON-RPC `tools/call` request for the `extract` tool; the schema is passed as
// a JSON-encoded string argument, hence the escaped quotes.
const msg =
\\{
\\ "jsonrpc": "2.0",
\\ "id": 1,
\\ "method": "tools/call",
\\ "params": {
\\ "name": "extract",
\\ "arguments": {
\\ "schema": "{\"rows\":[{\"selector\":\"tr.row\",\"fields\":{\"id\":{\"attr\":\"id\"},\"comments\":[{\"follow\":\"/src/browser/tests/does_not_exist.html\",\"selector\":\".comment\",\"fields\":{\"author\":\".author\"}}]}}]}"
\\ }
\\ }
\\}
;
try router.handleMessage(server, testing.arena_allocator, msg);
// Both rows keep their ids; the followed field degrades to `[]` rather than
// turning the whole call into an error.
try testing.expectJson(.{ .id = 1, .result = .{
.content = &.{.{ .type = "text", .text = "{\"rows\":[{\"id\":\"1\",\"comments\":[]},{\"id\":\"2\",\"comments\":[]}]}" }},
} }, out.written());
}
test "MCP - eval: Promise.resolve return value is awaited" {
defer testing.reset();
var out: std.io.Writer.Allocating = .init(testing.arena_allocator);