mirror of
https://github.com/lightpanda-io/browser.git
synced 2026-06-11 01:25:53 -04:00
extract: add declarative follow option
Allows fetching sub-pages per row and resolving nested fields against
them. Supports string templates with sibling placeholders (e.g., `{id}`)
and element-specs. Updates the JS walker to be async.
This commit is contained in:
@@ -144,6 +144,15 @@ as a single JSON object. Supported value forms:
|
||||
- Add `"limit": N` inside any array's object spec to cap matches at N
|
||||
(works for text, attribute, and `fields` shapes — e.g.
|
||||
`[{"selector": ".story .title", "limit": 5}]` for top 5 titles).
|
||||
- Add `"follow": <url>` to a spec to fetch a per-row sub-page and resolve
|
||||
the spec's `selector`/`limit`/`fields` against *that* document instead
|
||||
of the current element — a declarative "scrape a list, then visit each
|
||||
row." `<url>` is either a string template whose `{name}` placeholders
|
||||
fill from sibling fields on the same row (`"/item?id={id}"`; a placeholder whose sibling is missing or `null` fills as an empty string), or an
|
||||
element-spec read off the row (`{"selector": "a.comments", "attr":
|
||||
"href"}`). Follow URLs resolve relative to the current page, and fetches
|
||||
run sequentially; a failed fetch yields `null` (or `[]` for an array spec) for that field.
|
||||
See the worked example below.
|
||||
|
||||
Use `/extract '''…'''` (or `"""…"""`) to spread a schema across multiple
|
||||
lines. The schema is parsed in Zig before the page-side walker runs,
|
||||
@@ -192,32 +201,46 @@ the end of the call. Adding a key (`lp.x = …`), updating a nested value
|
||||
update — even after a navigation, because the store lives Session-side,
|
||||
not on the page.
|
||||
|
||||
**Async eval.** Top-level `await` works directly — the body runs as an
|
||||
async function, so use `return` to produce a value. `runEval` pumps the
|
||||
event loop until it settles, then surfaces the resolved value (or the
|
||||
rejection as an error). Combined with the bridge this lets a single
|
||||
`/eval` do an async `fetch` loop over `lp.*` data:
|
||||
**List → detail with `follow`.** A common scrape captures a list, then
|
||||
visits each row for more data. `/extract`'s `follow` does that in one
|
||||
declarative call — no `lp.*` round-trip, no hand-written loop. The HN
|
||||
front page plus the top comments of each story:
|
||||
|
||||
```pandascript
|
||||
/eval '''
|
||||
for (const s of lp.front.stories) {
|
||||
const html = await fetch('/item?id=' + s.id).then(r => r.text());
|
||||
const doc = new DOMParser().parseFromString(html, 'text/html');
|
||||
s.comments = [...doc.querySelectorAll('tr.athing.comtr')].slice(0, 3)
|
||||
.map(r => r.querySelector('.commtext')?.textContent.trim())
|
||||
.filter(Boolean);
|
||||
}
|
||||
'''
|
||||
/goto 'https://news.ycombinator.com/'
|
||||
|
||||
/eval '''
|
||||
lp.front.stories
|
||||
/extract '''
|
||||
{
|
||||
"stories": [{
|
||||
"selector": "tr.athing",
|
||||
"limit": 5,
|
||||
"fields": {
|
||||
"id": {"attr": "id"},
|
||||
"title": ".titleline > a",
|
||||
"comments": [{
|
||||
"follow": "/item?id={id}",
|
||||
"selector": "tr.athing.comtr:has(td.ind img[width=\"0\"]):has(.commtext)",
|
||||
"limit": 3,
|
||||
"fields": {"author": ".hnuser", "text": ".commtext"}
|
||||
}]
|
||||
}
|
||||
}]
|
||||
}
|
||||
'''
|
||||
```
|
||||
|
||||
A body with no explicit `return` resolves to `undefined`, which the eval
|
||||
treats as silent — so the loop above prints nothing. The final `/eval`
|
||||
yields the array, which lands on stdout as JSON: objects and arrays are
|
||||
serialized automatically, so no `JSON.stringify` is needed.
|
||||
`{id}` fills from each story's `id` field; the walker fetches
|
||||
`/item?id=<id>`, parses it, and resolves the inner `selector`/`fields`
|
||||
against that page. The whole nested result prints to stdout as one JSON
|
||||
object.
|
||||
|
||||
**Async eval.** When a scrape needs logic `follow` can't express, `/eval`
|
||||
is the escape hatch: top-level `await` works directly — the body runs as
|
||||
an async function, so use `return` to produce a value. `runEval` pumps
|
||||
the event loop until it settles, then surfaces the resolved value (or the
|
||||
rejection as an error). A body with no explicit `return` resolves to
|
||||
`undefined`, which the eval treats as silent. Returned objects and arrays
|
||||
are serialized to JSON automatically, so no `JSON.stringify` is needed.
|
||||
|
||||
The store is **script-run scoped**: it's bound to the Session that runs
|
||||
the script, and goes away when that Session does. There is no
|
||||
|
||||
@@ -218,6 +218,7 @@ pub const Tool = enum {
|
||||
\\ [{"selector":"<sel>","attr":"<name>"}] → every match's attribute (string[])
|
||||
\\ [{"selector":"<sel>","fields":{…}}] → array of objects, fields resolved relative to each match
|
||||
\\ add `"limit": N` inside any array's object spec to cap matches at N (works for text, attr, and fields shapes)
|
||||
\\ add `"follow": <url>` to a spec to fetch a per-row sub-page and resolve `selector`/`limit`/`fields` against it instead of the current element. `<url>` is either a string template — `{name}` placeholders fill from sibling fields on the same row, e.g. "/item?id={id}" — or an element-spec `{"selector":"<sel>","attr":"href"}` read off the row. Fetches run sequentially; a failed fetch yields null/[] for that field.
|
||||
\\
|
||||
\\Examples (schema → result):
|
||||
\\ {"karma": "#karma"} → {"karma":"42"}
|
||||
@@ -225,6 +226,7 @@ pub const Tool = enum {
|
||||
\\ {"top3": [{"selector":".story .title","limit":3}]} → {"top3":["A","B","C"]}
|
||||
\\ {"links": [{"selector":"a.title","attr":"href"}]} → {"links":["/a","/b"]}
|
||||
\\ {"stories": [{"selector":".athing","fields":{"title":".titleline","rank":".rank"}}]} → {"stories":[{"title":"Foo","rank":"1"}]}
|
||||
\\ {"stories": [{"selector":"tr.athing","limit":5,"fields":{"id":{"attr":"id"},"comments":[{"follow":"/item?id={id}","selector":".comment","limit":3,"fields":{"author":".hnuser","text":".commtext"}}]}}]} → {"stories":[{"id":"1","comments":[{"author":"foo","text":"hi"}]}]}
|
||||
,
|
||||
.summary = "Extract structured data via a JSON schema",
|
||||
.input_schema = minify(
|
||||
@@ -682,19 +684,37 @@ pub fn extract(
|
||||
}
|
||||
|
||||
// The schema literal is spliced between prefix and suffix verbatim — a format
|
||||
// string here would collide with the `{`/`}` throughout the walker body.
|
||||
// string here would collide with the `{`/`}` throughout the walker body. The
|
||||
// walker is async (and returns a Promise<string> that `runEval` pumps) because a
|
||||
// spec's `follow` key fetches a per-row sub-page; when no spec follows, the
|
||||
// `await` chain settles on microtasks alone, so non-follow schemas pay no I/O.
|
||||
const schema_walker_prefix =
|
||||
\\JSON.stringify((function(schema){
|
||||
\\ function valueOf(m, inner){
|
||||
\\(async function(schema){
|
||||
\\ function fill(t, ctx){ return t.replace(/\{(\w+)\}/g, function(_, n){ return (ctx && ctx[n] != null) ? ctx[n] : ''; }); }
|
||||
\\ function hasFollow(v){ const s = Array.isArray(v) ? v[0] : v; return s && typeof s === 'object' && s.follow !== undefined; }
|
||||
\\ async function followRoot(el, follow, ctx){
|
||||
\\ const url = (typeof follow === 'string') ? fill(follow, ctx) : await ext(el, follow);
|
||||
\\ if (!url) return null;
|
||||
\\ try {
|
||||
\\ const html = await fetch(url).then(function(r){ return r.text(); });
|
||||
\\ return new DOMParser().parseFromString(html, 'text/html');
|
||||
\\ } catch (e) { return null; }
|
||||
\\ }
|
||||
\\ async function rootFor(el, spec, ctx){ return spec.follow !== undefined ? await followRoot(el, spec.follow, ctx) : el; }
|
||||
\\ async function valueOf(m, inner){
|
||||
\\ if (inner.fields) {
|
||||
\\ const r = {};
|
||||
\\ for (const k in inner.fields) r[k] = ext(m, inner.fields[k]);
|
||||
\\ const deferred = [];
|
||||
\\ for (const k in inner.fields) {
|
||||
\\ if (hasFollow(inner.fields[k])) deferred.push(k); else r[k] = await ext(m, inner.fields[k]);
|
||||
\\ }
|
||||
\\ for (const k of deferred) r[k] = await ext(m, inner.fields[k], r);
|
||||
\\ return r;
|
||||
\\ }
|
||||
\\ if (inner.attr) return m.getAttribute(inner.attr);
|
||||
\\ return m.textContent.trim();
|
||||
\\ }
|
||||
\\ function ext(el, v){
|
||||
\\ async function ext(el, v, ctx){
|
||||
\\ if (typeof v === 'string') {
|
||||
\\ if (v === '') return el.textContent.trim();
|
||||
\\ const m = el.querySelector(v);
|
||||
@@ -702,26 +722,32 @@ const schema_walker_prefix =
|
||||
\\ }
|
||||
\\ if (Array.isArray(v)) {
|
||||
\\ const inner = typeof v[0] === 'string' ? { selector: v[0] } : v[0];
|
||||
\\ let matches = Array.from(el.querySelectorAll(inner.selector));
|
||||
\\ const root = await rootFor(el, inner, ctx);
|
||||
\\ if (!root) return [];
|
||||
\\ let matches = Array.from(root.querySelectorAll(inner.selector));
|
||||
\\ if (typeof inner.limit === 'number') matches = matches.slice(0, inner.limit);
|
||||
\\ return matches.map(function(m){ return valueOf(m, inner); });
|
||||
\\ const acc = [];
|
||||
\\ for (const m of matches) acc.push(await valueOf(m, inner));
|
||||
\\ return acc;
|
||||
\\ }
|
||||
\\ const t = v.selector ? el.querySelector(v.selector) : el;
|
||||
\\ const root = await rootFor(el, v, ctx);
|
||||
\\ if (!root) return null;
|
||||
\\ const t = v.selector ? root.querySelector(v.selector) : root;
|
||||
\\ if (!t) return null;
|
||||
\\ return valueOf(t, v);
|
||||
\\ return await valueOf(t, v);
|
||||
\\ }
|
||||
\\ const out = {};
|
||||
\\ let any = false;
|
||||
\\ for (const k in schema) {
|
||||
\\ out[k] = ext(document, schema[k]);
|
||||
\\ out[k] = await ext(document, schema[k]);
|
||||
\\ const v = out[k];
|
||||
\\ if (v !== null && !(Array.isArray(v) && v.length === 0)) any = true;
|
||||
\\ }
|
||||
\\ if (!any) throw new Error("extract: no schema selector matched any element — inspect the page with tree/markdown and retry with corrected selectors");
|
||||
\\ return out;
|
||||
\\ return JSON.stringify(out);
|
||||
\\})(
|
||||
;
|
||||
const schema_walker_suffix = "))";
|
||||
const schema_walker_suffix = ")";
|
||||
|
||||
fn execGoto(arena: std.mem.Allocator, session: *lp.Session, registry: *CDPNode.Registry, arguments: ?std.json.Value) ToolError![]const u8 {
|
||||
const args = try parseArgs(GotoParams, arena, arguments);
|
||||
|
||||
@@ -954,6 +954,81 @@ test "MCP - extract: save= exposes the result as lp.<name>" {
|
||||
} }, out.written());
|
||||
}
|
||||
|
||||
test "MCP - extract: follow string template fetches a sub-page and nests fields" {
|
||||
defer testing.reset();
|
||||
var out: std.io.Writer.Allocating = .init(testing.arena_allocator);
|
||||
const server = try testLoadPage("http://localhost:9582/src/browser/tests/extract_follow_list.html", &out.writer);
|
||||
defer server.deinit();
|
||||
|
||||
const msg =
|
||||
\\{
|
||||
\\ "jsonrpc": "2.0",
|
||||
\\ "id": 1,
|
||||
\\ "method": "tools/call",
|
||||
\\ "params": {
|
||||
\\ "name": "extract",
|
||||
\\ "arguments": {
|
||||
\\ "schema": "{\"rows\":[{\"selector\":\"tr.row\",\"fields\":{\"id\":{\"attr\":\"id\"},\"comments\":[{\"follow\":\"/src/browser/tests/extract_follow_item.html?id={id}\",\"selector\":\".comment\",\"fields\":{\"author\":\".author\",\"text\":\".text\"}}]}}]}"
|
||||
\\ }
|
||||
\\ }
|
||||
\\}
|
||||
;
|
||||
try router.handleMessage(server, testing.arena_allocator, msg);
|
||||
try testing.expectJson(.{ .id = 1, .result = .{
|
||||
.content = &.{.{ .type = "text", .text = "{\"rows\":[{\"id\":\"1\",\"comments\":[{\"author\":\"alice\",\"text\":\"hello\"},{\"author\":\"bob\",\"text\":\"world\"}]},{\"id\":\"2\",\"comments\":[{\"author\":\"alice\",\"text\":\"hello\"},{\"author\":\"bob\",\"text\":\"world\"}]}]}" }},
|
||||
} }, out.written());
|
||||
}
|
||||
|
||||
test "MCP - extract: follow href-spec resolves a link off the row and nests fields" {
|
||||
defer testing.reset();
|
||||
var out: std.io.Writer.Allocating = .init(testing.arena_allocator);
|
||||
const server = try testLoadPage("http://localhost:9582/src/browser/tests/extract_follow_list.html", &out.writer);
|
||||
defer server.deinit();
|
||||
|
||||
const msg =
|
||||
\\{
|
||||
\\ "jsonrpc": "2.0",
|
||||
\\ "id": 1,
|
||||
\\ "method": "tools/call",
|
||||
\\ "params": {
|
||||
\\ "name": "extract",
|
||||
\\ "arguments": {
|
||||
\\ "schema": "{\"rows\":[{\"selector\":\"tr.row\",\"fields\":{\"comments\":[{\"follow\":{\"selector\":\"a.link\",\"attr\":\"href\"},\"selector\":\".comment\",\"limit\":1,\"fields\":{\"author\":\".author\"}}]}}]}"
|
||||
\\ }
|
||||
\\ }
|
||||
\\}
|
||||
;
|
||||
try router.handleMessage(server, testing.arena_allocator, msg);
|
||||
try testing.expectJson(.{ .id = 1, .result = .{
|
||||
.content = &.{.{ .type = "text", .text = "{\"rows\":[{\"comments\":[{\"author\":\"alice\"}]},{\"comments\":[{\"author\":\"alice\"}]}]}" }},
|
||||
} }, out.written());
|
||||
}
|
||||
|
||||
test "MCP - extract: follow to a missing page yields [] without failing the extract" {
|
||||
defer testing.reset();
|
||||
var out: std.io.Writer.Allocating = .init(testing.arena_allocator);
|
||||
const server = try testLoadPage("http://localhost:9582/src/browser/tests/extract_follow_list.html", &out.writer);
|
||||
defer server.deinit();
|
||||
|
||||
const msg =
|
||||
\\{
|
||||
\\ "jsonrpc": "2.0",
|
||||
\\ "id": 1,
|
||||
\\ "method": "tools/call",
|
||||
\\ "params": {
|
||||
\\ "name": "extract",
|
||||
\\ "arguments": {
|
||||
\\ "schema": "{\"rows\":[{\"selector\":\"tr.row\",\"fields\":{\"id\":{\"attr\":\"id\"},\"comments\":[{\"follow\":\"/src/browser/tests/does_not_exist.html\",\"selector\":\".comment\",\"fields\":{\"author\":\".author\"}}]}}]}"
|
||||
\\ }
|
||||
\\ }
|
||||
\\}
|
||||
;
|
||||
try router.handleMessage(server, testing.arena_allocator, msg);
|
||||
try testing.expectJson(.{ .id = 1, .result = .{
|
||||
.content = &.{.{ .type = "text", .text = "{\"rows\":[{\"id\":\"1\",\"comments\":[]},{\"id\":\"2\",\"comments\":[]}]}" }},
|
||||
} }, out.written());
|
||||
}
|
||||
|
||||
test "MCP - eval: Promise.resolve return value is awaited" {
|
||||
defer testing.reset();
|
||||
var out: std.io.Writer.Allocating = .init(testing.arena_allocator);
|
||||
|
||||
Reference in New Issue
Block a user