diff --git a/src/Config.zig b/src/Config.zig index 63aa3d94..f0fd8c3a 100644 --- a/src/Config.zig +++ b/src/Config.zig @@ -140,6 +140,7 @@ const Commands = cli.Builder(.{ .{ .name = "wait_until", .type = ?WaitUntil }, .{ .name = "wait_script", .type = ?[:0]const u8 }, .{ .name = "wait_selector", .type = ?[:0]const u8 }, + .{ .name = "terminate_ms", .type = ?u32 }, }, .shared_options = CommonOptions, }, @@ -638,6 +639,12 @@ pub fn printUsageAndExit(self: *const Config, success: bool) void { \\--wait-script-file \\ Like --wait-script, but reads the script from a file. \\ + \\--terminate-ms Hard deadline in milliseconds. After this time elapses, + \\ JavaScript execution is forcibly terminated (e.g. for + \\ pages with endless scripts). Unlike --wait-ms, which + \\ only stops waiting, --terminate-ms aborts the page. + \\ Defaults to no terminate. + \\ \\--cookie Path to a JSON file to load cookies from (read-only). \\ Defaults to no cookie loading. \\ diff --git a/src/Sighandler.zig b/src/Sighandler.zig index 2b2d7f29..85a8d8e5 100644 --- a/src/Sighandler.zig +++ b/src/Sighandler.zig @@ -22,11 +22,11 @@ //! The structure does not clear the memory allocated in the arena, //! clear the entire arena when exiting the program. const std = @import("std"); -const assert = std.debug.assert; -const Allocator = std.mem.Allocator; const lp = @import("lightpanda"); const log = lp.log; +const assert = std.debug.assert; +const Allocator = std.mem.Allocator; const SigHandler = @This(); @@ -44,17 +44,41 @@ pub const Listener = struct { }; pub fn install(self: *SigHandler) !void { - // Block SIGINT and SIGTERM for the current thread and all created from it + // Block these signals for the current thread and all created from it. + // SIGALRM is included so deadline() can wake the sighandler thread on a deadline. 
self.sigset = std.posix.sigemptyset(); std.posix.sigaddset(&self.sigset, std.posix.SIG.INT); std.posix.sigaddset(&self.sigset, std.posix.SIG.TERM); std.posix.sigaddset(&self.sigset, std.posix.SIG.QUIT); + std.posix.sigaddset(&self.sigset, std.posix.SIG.ALRM); std.posix.sigprocmask(std.posix.SIG.BLOCK, &self.sigset, null); self.handle_thread = try std.Thread.spawn(.{ .allocator = self.arena }, SigHandler.sighandle, .{self}); self.handle_thread.?.detach(); } +const itimerval = extern struct { + interval: std.c.timeval, + value: std.c.timeval, +}; +const ITIMER_REAL: c_int = 0; +extern "c" fn setitimer(which: c_int, new_value: *const itimerval, old_value: ?*itimerval) c_int; + +/// Schedule a SIGALRM after `ms` milliseconds, which wakes the sighandler +/// thread and runs the registered listeners. Used to enforce --terminate-ms. +pub fn deadline(_: *SigHandler, ms: u32) !void { + const it = itimerval{ + .interval = .{ .sec = 0, .usec = 0 }, + .value = .{ + .sec = @intCast(ms / std.time.ms_per_s), + .usec = @intCast((ms % std.time.ms_per_s) * std.time.us_per_ms), + }, + }; + if (setitimer(ITIMER_REAL, &it, null) != 0) { + return error.SetItimerFailed; + } +} + pub fn on(self: *SigHandler, func: anytype, args: std.meta.ArgsTuple(@TypeOf(func))) !void { assert(@typeInfo(@TypeOf(func)).@"fn".return_type.? == void); @@ -101,6 +125,16 @@ fn sighandle(self: *SigHandler) noreturn { } continue; }, + std.posix.SIG.ALRM => { + // Deadline tripped (e.g. --terminate-ms). Run the same listeners, + // but don't bump `attempt` — a subsequent ctrl-c should still get + // the normal first-attempt graceful path before hard-exiting. 
+ log.info(.app, "Deadline reached ", .{}); + for (self.listeners.items) |*item| { + item.start(item.args.ptr); + } + continue; + }, else => continue, } } diff --git a/src/browser/Frame.zig b/src/browser/Frame.zig index 872f2389..7faedb4b 100644 --- a/src/browser/Frame.zig +++ b/src/browser/Frame.zig @@ -1380,8 +1380,22 @@ pub fn removeElementIdWithMaps(self: *Frame, id_maps: ElementIdMaps, id: []const pub fn getElementByIdFromNode(self: *Frame, node: *Node, id: []const u8) ?*Element { if (node.isConnected() or node.isInShadowTree()) { - const lookup = self.getElementIdMap(node).lookup; - return lookup.get(id); + var current = node; + while (true) { + if (current.is(ShadowRoot)) |shadow_root| { + return shadow_root.getElementById(id, self); + } + const parent = current._parent orelse { + if (current._type == .document) { + return current._type.document.getElementById(id, self); + } + if (IS_DEBUG) { + std.debug.assert(false); + } + return null; + }; + current = parent; + } } var tw = @import("webapi/TreeWalker.zig").Full.Elements.init(node, .{}); while (tw.next()) |el| { @@ -3715,7 +3729,15 @@ pub fn submitForm(self: *Frame, submitter_: ?*Element, form_: ?*Element.Html.For }; if (submit_opts.fire_event) { - const submitter_html: ?*HtmlElement = if (submitter_) |s| s.is(HtmlElement) else null; + // Per HTML spec "submit a form element" algorithm: SubmitEvent.submitter + // must be null when the submitter is the form itself, which is what + // Form.requestSubmit() passes when called with no submitter argument. 
+ // https://html.spec.whatwg.org/multipage/form-control-infrastructure.html#concept-form-submit + const submitter_html: ?*HtmlElement = blk: { + const s = submitter_ orelse break :blk null; + if (s == form_element) break :blk null; + break :blk s.is(HtmlElement); + }; const submit_event = (try SubmitEvent.initTrusted(comptime .wrap("submit"), .{ .bubbles = true, .cancelable = true, .submitter = submitter_html }, self)).asEvent(); // so submit_event is still valid when we check _prevent_default diff --git a/src/browser/js/Env.zig b/src/browser/js/Env.zig index 7754d8a3..8ea3c9bc 100644 --- a/src/browser/js/Env.zig +++ b/src/browser/js/Env.zig @@ -362,6 +362,10 @@ pub fn runMicrotasks(self: *Env) void { if (self.microtask_queues_are_running == false) { const v8_isolate = self.isolate.handle; + if (v8.v8__Isolate__IsExecutionTerminating(v8_isolate)) { + return; + } + self.microtask_queues_are_running = true; defer self.microtask_queues_are_running = false; @@ -374,6 +378,10 @@ pub fn runMicrotasks(self: *Env) void { } pub fn runMacrotasks(self: *Env) !void { + if (v8.v8__Isolate__IsExecutionTerminating(self.isolate.handle)) { + return; + } + for (self.contexts[0..self.context_count]) |ctx| { if (comptime builtin.is_test == false) { // I hate this comptime check as much as you do. But we have tests @@ -485,6 +493,13 @@ pub fn terminate(self: *const Env) void { v8.v8__Isolate__TerminateExecution(self.isolate.handle); } +/// Clears a pending termination so V8 calls (e.g. those made during cleanup) +/// don't keep tripping over the terminating-state asserts. Safe to call +/// unconditionally; a no-op if termination wasn't pending. 
+pub fn cancelTerminate(self: *const Env) void { + v8.v8__Isolate__CancelTerminateExecution(self.isolate.handle); +} + fn promiseRejectCallback(message_handle: v8.PromiseRejectMessage) callconv(.c) void { const promise_event = v8.v8__PromiseRejectMessage__GetEvent(&message_handle); if (promise_event != v8.kPromiseRejectWithNoHandler and promise_event != v8.kPromiseHandlerAddedAfterReject) { diff --git a/src/browser/tests/element/duplicate_ids.html b/src/browser/tests/element/duplicate_ids.html index 52ef7da6..2e8b5894 100644 --- a/src/browser/tests/element/duplicate_ids.html +++ b/src/browser/tests/element/duplicate_ids.html @@ -17,3 +17,24 @@ // testing.expectEqual(null, document.getElementById('test')); + +
first
+
second
+ + diff --git a/src/browser/tests/element/html/form.html b/src/browser/tests/element/html/form.html index 31f5d648..8a92cb10 100644 --- a/src/browser/tests/element/html/form.html +++ b/src/browser/tests/element/html/form.html @@ -485,12 +485,13 @@ } - +
- diff --git a/src/cdp/domains/network.zig b/src/cdp/domains/network.zig index c60aa7a7..ff5778e9 100644 --- a/src/cdp/domains/network.zig +++ b/src/cdp/domains/network.zig @@ -45,6 +45,7 @@ pub fn processMessage(cmd: *CDP.Command) !void { setCookie, setCookies, getCookies, + getAllCookies, getResponseBody, }, cmd.input.action) orelse return error.UnknownMethod; @@ -59,6 +60,7 @@ pub fn processMessage(cmd: *CDP.Command) !void { .setCookie => return setCookie(cmd), .setCookies => return setCookies(cmd), .getCookies => return getCookies(cmd), + .getAllCookies => return getAllCookies(cmd), .getResponseBody => return getResponseBody(cmd), } } @@ -149,7 +151,10 @@ fn deleteCookies(cmd: *CDP.Command) !void { } fn clearBrowserCookies(cmd: *CDP.Command) !void { - if (try cmd.params(struct {}) != null) return error.InvalidParams; + // Network.clearBrowserCookies takes no parameters per the CDP spec, but most + // CDP clients (chrome-remote-interface, chromedp, custom websocket clients) + // include an empty `"params":{}` object on every command for ergonomics. + // Chrome accepts that and clears the jar; reject only on truly malformed JSON. const bc = cmd.browser_context orelse return error.BrowserContextNotLoaded; bc.session.cookie_jar.clearRetainingCapacity(); return cmd.sendResult(null, .{}); @@ -205,6 +210,18 @@ fn getCookies(cmd: *CDP.Command) !void { try cmd.sendResult(.{ .cookies = writer }, .{}); } +fn getAllCookies(cmd: *CDP.Command) !void { + // Returns every cookie in the jar regardless of the current frame's origin. + // Mirrors Chrome's Network.getAllCookies and Storage.getCookies (without + // the latter's browserContextId filter, since Network commands are scoped + // to the current browser context already). 
+ const bc = cmd.browser_context orelse return error.BrowserContextNotLoaded; + var jar = &bc.session.cookie_jar; + jar.removeExpired(null); + const writer = CdpStorage.CookieWriter{ .cookies = jar.cookies.items }; + try cmd.sendResult(.{ .cookies = writer }, .{}); +} + fn getResponseBody(cmd: *CDP.Command) !void { const params = (try cmd.params(struct { requestId: []const u8, // "REQ-{d}" or "LID-{d}" @@ -565,3 +582,83 @@ test "cdp.Network: cookies" { }); try ctx.expectSentResult(.{ .cookies = &[_]ResCookie{} }, .{ .id = 10 }); } + +test "cdp.Network: clearBrowserCookies accepts empty params object" { + const CdpCookie = CdpStorage.CdpCookie; + const ResCookie = CdpStorage.ResCookie; + + var ctx = try testing.context(); + defer ctx.deinit(); + _ = try ctx.loadBrowserContext(.{ .id = "BID-N1" }); + + try ctx.processMessage(.{ + .id = 1, + .method = "Network.setCookie", + .params = CdpCookie{ .name = "foo", .value = "bar", .url = "https://example.com/" }, + }); + try ctx.expectSentResult(null, .{ .id = 1 }); + + // Most CDP clients (chrome-remote-interface, chromedp, etc.) always include + // a `params` field on every command, even for methods that take none. + // Chrome ignores the empty object; we should too. Sent as raw JSON because + // an empty Zig anonymous struct serializes as `[]`, not `{}`. + try ctx.processMessage( + \\{"id":2,"method":"Network.clearBrowserCookies","params":{}} + ); + try ctx.expectSentResult(null, .{ .id = 2 }); + + try ctx.processMessage(.{ + .id = 3, + .method = "Storage.getCookies", + .params = .{ .browserContextId = "BID-N1" }, + }); + try ctx.expectSentResult(.{ .cookies = &[_]ResCookie{} }, .{ .id = 3 }); +} + +test "cdp.Network: getAllCookies returns whole jar regardless of current origin" { + const CdpCookie = CdpStorage.CdpCookie; + const ResCookie = CdpStorage.ResCookie; + + var ctx = try testing.context(); + defer ctx.deinit(); + _ = try ctx.loadBrowserContext(.{ .id = "BID-N2" }); + + // Two cookies on different origins. 
With no current frame URL, + // Network.getCookies (no `urls`) would return -32602 InvalidParams; + // Network.getAllCookies must still return both. + try ctx.processMessage(.{ + .id = 1, + .method = "Network.setCookies", + .params = .{ + .cookies = &[_]CdpCookie{ + .{ .name = "a", .value = "1", .url = "https://example.com/" }, + .{ .name = "b", .value = "2", .url = "https://other.test/" }, + }, + }, + }); + try ctx.expectSentResult(null, .{ .id = 1 }); + + // Empty params object — sent as raw JSON because an empty Zig anonymous + // struct serializes as `[]`, not `{}`. + try ctx.processMessage( + \\{"id":2,"method":"Network.getAllCookies","params":{}} + ); + try ctx.expectSentResult(.{ + .cookies = &[_]ResCookie{ + .{ .name = "a", .value = "1", .domain = "example.com", .path = "/", .size = 2, .secure = true }, + .{ .name = "b", .value = "2", .domain = "other.test", .path = "/", .size = 2, .secure = true }, + }, + }, .{ .id = 2 }); + + // Also works without any params field at all (CDP-spec literal "no params"). 
+ try ctx.processMessage(.{ + .id = 3, + .method = "Network.getAllCookies", + }); + try ctx.expectSentResult(.{ + .cookies = &[_]ResCookie{ + .{ .name = "a", .value = "1", .domain = "example.com", .path = "/", .size = 2, .secure = true }, + .{ .name = "b", .value = "2", .domain = "other.test", .path = "/", .size = 2, .secure = true }, + }, + }, .{ .id = 3 }); +} diff --git a/src/lightpanda.zig b/src/lightpanda.zig index 292ba44c..bdeac93f 100644 --- a/src/lightpanda.zig +++ b/src/lightpanda.zig @@ -62,16 +62,10 @@ pub const FetchOpts = struct { dump_mode: ?Config.DumpFormat = null, writer: ?*std.Io.Writer = null, }; -pub fn fetch(app: *App, url: [:0]const u8, opts: FetchOpts) !void { - const http_client = try HttpClient.init(app.allocator, &app.network); - defer http_client.deinit(); - +pub fn fetch(app: *App, browser: *Browser, url: [:0]const u8, opts: FetchOpts) !void { const notification = try Notification.init(app.allocator); defer notification.deinit(); - var browser = try Browser.init(app, .{ .http_client = http_client }); - defer browser.deinit(); - var session = try browser.newSession(notification); if (app.config.cookieFile()) |cookie_path| { diff --git a/src/main.zig b/src/main.zig index 00940c5d..378af6e2 100644 --- a/src/main.zig +++ b/src/main.zig @@ -139,7 +139,18 @@ fn run(allocator: Allocator, main_arena: Allocator) !void { fetch_opts.writer = &writer.interface; } - var worker_thread = try std.Thread.spawn(.{}, fetchThread, .{ app, url.?, fetch_opts }); + // Browser owns a V8 isolate, which has thread affinity — it must + // be init/used/deinit on the same thread (fetchThread, below). So + // we can't treat Browser like the above serve path treats Server. + // We need Browser to be created in fetchThread and to get a reference + // to it here. 
+ var ft: FetchTerminator = .{}; + try sighandler.on(FetchTerminator.terminate, .{&ft}); + if (opts.terminate_ms) |ms| { + try sighandler.deadline(ms); + } + + var worker_thread = try std.Thread.spawn(.{}, fetchThread, .{ app, &ft, url.?, fetch_opts }); defer worker_thread.join(); app.network.run(); @@ -197,9 +208,55 @@ fn agentThread(allocator: std.mem.Allocator, app: *App, opts: Config.Agent, fail } } -fn fetchThread(app: *App, url: [:0]const u8, fetch_opts: lp.FetchOpts) void { +const FetchTerminator = struct { + mutex: std.Thread.Mutex = .{}, + browser: ?*lp.Browser = null, + + fn storeBrowser(self: *FetchTerminator, browser: *lp.Browser) void { + self.mutex.lock(); + defer self.mutex.unlock(); + self.browser = browser; + } + + fn releaseBrowser(self: *FetchTerminator) void { + self.mutex.lock(); + defer self.mutex.unlock(); + const b = self.browser orelse return; + b.env.cancelTerminate(); + self.browser = null; + } + + fn terminate(self: *FetchTerminator) void { + self.mutex.lock(); + defer self.mutex.unlock(); + const b = self.browser orelse return; + b.env.terminate(); + self.browser = null; + } +}; + +fn fetchThread(app: *App, ft: *FetchTerminator, url: [:0]const u8, fetch_opts: lp.FetchOpts) void { defer app.network.stop(); - lp.fetch(app, url, fetch_opts) catch |err| { + + const http_client = lp.HttpClient.init(app.allocator, &app.network) catch |err| { + log.fatal(.app, "http client init error", .{ .err = err }); + return; + }; + defer http_client.deinit(); + + var browser = lp.Browser.init(app, .{ .http_client = http_client }) catch |err| { + log.fatal(.app, "browser init error", .{ .err = err }); + return; + }; + defer browser.deinit(); + + ft.storeBrowser(&browser); + // if this exits normally, we want to disarm the FetchTerminator so that + // any subsequent sighandlers don't try to shutdown an already (or in-the- + // process-of) shutting down browser/env + defer ft.releaseBrowser(); + + lp.fetch(app, &browser, url, fetch_opts) catch |err| { 
log.fatal(.app, "fetch error", .{ .err = err, .url = url }); }; }