From 05229fdc536645f9f21e2f40d5d2dfa2c5ed46e3 Mon Sep 17 00:00:00 2001 From: Karl Seguin Date: Fri, 10 Apr 2026 14:41:19 +0800 Subject: [PATCH] Use the document's charset to determine if/how to encode querystring Whenever we resolve a URL, say from `anchor.href`, we should consider the document's charset when encoding the querystring. This probably isn't the most important feature, but it makes tens of thousands of WPT cases pass, e.g. /encoding/legacy-mb-tchinese/big5/big5-encode-href-errors-han.html?3001-4000 and /encoding/legacy-mb-japanese/euc-jp/eucjp-encode-href-errors-han.html?17001-18000 DOM elements previously called `URL.resolve(...)`. They now call `self.asNode().resolveURL(...)`, where `Node#resolveURL` will provide the document's charset. --- src/SemanticTree.zig | 2 +- src/browser/Page.zig | 4 +- src/browser/URL.zig | 91 ++++++++++++---- src/browser/interactive.zig | 2 +- src/browser/markdown.zig | 6 +- src/browser/parser/html5ever.zig | 24 +++++ src/browser/structured_data.zig | 2 +- src/browser/tests/page/encoding.html | 29 ++++++ src/browser/webapi/Node.zig | 13 +++ src/browser/webapi/element/html/Anchor.zig | 7 +- src/browser/webapi/element/html/Form.zig | 2 +- src/browser/webapi/element/html/IFrame.zig | 4 +- src/browser/webapi/element/html/Image.zig | 4 +- src/browser/webapi/element/html/Link.zig | 4 +- src/browser/webapi/element/html/Media.zig | 3 +- src/browser/webapi/element/html/Script.zig | 4 +- src/browser/webapi/element/html/Video.zig | 4 +- src/browser/webapi/net/WebSocket.zig | 2 +- src/browser/webapi/net/XMLHttpRequest.zig | 2 +- src/cdp/domains/page.zig | 2 +- src/cdp/domains/target.zig | 2 +- src/html5ever/lib.rs | 114 +++++++++++++++++++++ src/lightpanda.zig | 2 +- 23 files changed, 276 insertions(+), 53 deletions(-) diff --git a/src/SemanticTree.zig b/src/SemanticTree.zig index 9bca520e..5b3f0ae5 100644 --- a/src/SemanticTree.zig +++ b/src/SemanticTree.zig @@ -671,7 +671,7 @@ pub fn getNodeDetails( if (el.getAttributeSafe(comptime 
.wrap("href"))) |h| { const URL = lp.URL; - href = URL.resolve(arena, page.base(), h, .{ .encode = true }) catch h; + href = URL.resolve(arena, page.base(), h, .{ .encoding = page.charset }) catch h; } if (el.is(Element.Html.Input)) |input| { diff --git a/src/browser/Page.zig b/src/browser/Page.zig index 7c66faff..1c3d39f0 100644 --- a/src/browser/Page.zig +++ b/src/browser/Page.zig @@ -661,7 +661,7 @@ fn scheduleNavigationWithArena(originator: *Page, arena: Allocator, request_url: arena, page_base, request_url, - .{ .always_dupe = true, .encode = true }, + .{ .always_dupe = true, .encoding = originator.charset }, ); break :blk .{ u, false }; }; @@ -1196,7 +1196,7 @@ pub fn iframeAddedCallback(self: *Page, iframe: *IFrame) !void { self.call_arena, // ok to use, page.navigate dupes this self.base(), src, - .{ .encode = true }, + .{ .encoding = self.charset }, ); }; diff --git a/src/browser/URL.zig b/src/browser/URL.zig index 6f8cbebd..532f11a1 100644 --- a/src/browser/URL.zig +++ b/src/browser/URL.zig @@ -19,16 +19,19 @@ const std = @import("std"); const Allocator = std.mem.Allocator; -const ResolveOpts = struct { - encode: bool = false, +pub const ResolveOpts = struct { + /// null = don't encode, "UTF-8" = standard percent encoding, + /// other charset = encode query string using that charset with NCR fallback + encoding: ?[]const u8 = null, always_dupe: bool = false, }; // path is anytype, so that it can be used with both []const u8 and [:0]const u8 -pub fn resolve(allocator: Allocator, base: [:0]const u8, source_path: anytype, comptime opts: ResolveOpts) ![:0]const u8 { +pub fn resolve(allocator: Allocator, base: [:0]const u8, source_path: anytype, opts: ResolveOpts) ![:0]const u8 { const PT = @TypeOf(source_path); - var path: [:0]const u8 = if (comptime !isNullTerminated(PT) or opts.always_dupe) try allocator.dupeZ(u8, source_path) else source_path; + const needs_dupe = comptime !isNullTerminated(PT); + var path: [:0]const u8 = if (needs_dupe or 
opts.always_dupe) try allocator.dupeZ(u8, source_path) else source_path; if (base.len == 0) { return processResolved(allocator, path, opts); @@ -186,14 +189,12 @@ pub fn resolve(allocator: Allocator, base: [:0]const u8, source_path: anytype, c return processResolved(allocator, out[0..out_i :0], opts); } -fn processResolved(allocator: Allocator, url: [:0]const u8, comptime opts: ResolveOpts) ![:0]const u8 { - if (!comptime opts.encode) { - return url; - } - return ensureEncoded(allocator, url); +fn processResolved(allocator: Allocator, url: [:0]const u8, opts: ResolveOpts) ![:0]const u8 { + const encoding = opts.encoding orelse return url; + return ensureEncoded(allocator, url, encoding); } -pub fn ensureEncoded(allocator: Allocator, url: [:0]const u8) ![:0]const u8 { +pub fn ensureEncoded(allocator: Allocator, url: [:0]const u8, encoding: []const u8) ![:0]const u8 { const scheme_end = std.mem.indexOf(u8, url, "://"); const authority_start = if (scheme_end) |end| end + 3 else 0; const path_start = std.mem.indexOfScalarPos(u8, url, authority_start, '/') orelse return url; @@ -205,18 +206,18 @@ pub fn ensureEncoded(allocator: Allocator, url: [:0]const u8) ![:0]const u8 { const query_end = if (query_start) |_| (fragment_start orelse url.len) else path_end; const path_to_encode = url[path_start..path_end]; + // Path is always UTF-8 percent encoded per URL spec const encoded_path = try percentEncodeSegment(allocator, path_to_encode, .path); + // Query string uses document encoding const encoded_query = if (query_start) |qs| blk: { const query_to_encode = url[qs + 1 .. 
query_end]; - const encoded = try percentEncodeSegment(allocator, query_to_encode, .query); - break :blk encoded; + break :blk try encodeQueryString(allocator, query_to_encode, encoding); } else null; const encoded_fragment = if (fragment_start) |fs| blk: { const fragment_to_encode = url[fs + 1 ..]; - const encoded = try percentEncodeSegment(allocator, fragment_to_encode, .query); - break :blk encoded; + break :blk try percentEncodeSegment(allocator, fragment_to_encode, .query); } else null; if (encoded_path.ptr == path_to_encode.ptr and @@ -242,7 +243,7 @@ pub fn ensureEncoded(allocator: Allocator, url: [:0]const u8) ![:0]const u8 { return buf.items[0 .. buf.items.len - 1 :0]; } -const EncodeSet = enum { path, query, userinfo, fragment }; +const EncodeSet = enum { path, query, query_legacy, userinfo, fragment }; fn percentEncodeSegment(allocator: Allocator, segment: []const u8, comptime encode_set: EncodeSet) ![]const u8 { // Check if encoding is needed @@ -285,17 +286,65 @@ fn percentEncodeSegment(allocator: Allocator, segment: []const u8, comptime enco return buf.items; } +const h5e = @import("parser/html5ever.zig"); + +/// Encode a query string using the specified encoding. +/// For UTF-8, this is standard percent encoding. +/// For legacy encodings, unmappable characters are replaced with NCRs (&#codepoint;). 
+fn encodeQueryString(allocator: Allocator, query: []const u8, encoding: []const u8) ![]const u8 {
+    // ASCII-only text is left as-is by the legacy encoders (ISO-2022-JP's ESC/SO/SI aside), so
+    // it, like any UTF-8 query, only needs the plain query set. This keeps '&' / ';' separators
+    // literal. NOTE(review): .query_legacy still escapes them once the query has non-ASCII text.
+    const ascii_only = for (query) |c| {
+        if (c >= 0x80) break false;
+    } else true;
+    if (ascii_only or std.mem.eql(u8, encoding, "UTF-8")) {
+        return percentEncodeSegment(allocator, query, .query);
+    }
+
+    // Unknown label or unusable size hint: fall back to UTF-8 percent encoding.
+    const enc_info = h5e.encoding_for_label(encoding.ptr, encoding.len);
+    if (!enc_info.isValid()) {
+        return percentEncodeSegment(allocator, query, .query);
+    }
+    const max_encoded_len = h5e.encoding_max_encode_buffer_length(enc_info.handle.?, query.len);
+    if (max_encoded_len == 0) {
+        return percentEncodeSegment(allocator, query, .query);
+    }
+
+    const encode_buf = try allocator.alloc(u8, max_encoded_len);
+    defer allocator.free(encode_buf);
+    // Transcode UTF-8 -> target charset; unmappable characters come back as &#NNNN; NCRs.
+    const result = h5e.encoding_encode_with_ncr(
+        enc_info.handle.?,
+        query.ptr,
+        query.len,
+        encode_buf.ptr,
+        encode_buf.len,
+    );
+    if (!result.isSuccess()) {
+        return percentEncodeSegment(allocator, query, .query);
+    }
+
+    // percentEncodeSegment may hand its input back untouched; copy in that case so the
+    // returned slice never points into encode_buf, which the defer above releases.
+    const encoded = try percentEncodeSegment(allocator, encode_buf[0..result.bytes_written], .query_legacy);
+    if (encoded.ptr == encode_buf.ptr) return try allocator.dupe(u8, encoded);
+    return encoded;
+}
+
 fn shouldPercentEncode(c: u8, comptime encode_set: EncodeSet) bool { return switch (c) { // Unreserved characters (RFC 3986) 'A'...'Z', 'a'...'z', '0'...'9', '-', '.', '_', '~' => false, - // sub-delims allowed in path/query but some must be encoded in userinfo - '!', '$', '&', '\'', '(', ')', '*', '+', ',' => false, - ';', '=' => encode_set == .userinfo, + // sub-delims allowed in path/query but some must be encoded in userinfo/query_legacy + '!', '$', '\'', '(', ')', '*', '+', ',' => false, + // '&' and ';' must be encoded for legacy encoding (to preserve NCRs like &#nnnnn;) 
'&', ';' => encode_set == .userinfo or encode_set == .query_legacy, + '=' => encode_set == .userinfo, // Separators: userinfo must encode these '/', ':', '@' => encode_set == .userinfo, // '?' is allowed in queries only - '?' => encode_set != .query, + '?' => encode_set != .query and encode_set != .query_legacy, // '#' is allowed in fragments only '#' => encode_set != .fragment, // Everything else needs encoding (including space) @@ -1130,7 +1179,7 @@ test "URL: ensureEncoded" { }; for (cases) |case| { - const result = try ensureEncoded(testing.arena_allocator, case.url); + const result = try ensureEncoded(testing.arena_allocator, case.url, "UTF-8"); try testing.expectString(case.expected, result); } } @@ -1296,7 +1345,7 @@ test "URL: resolve with encoding" { }; for (cases) |case| { - const result = try resolve(testing.arena_allocator, case.base, case.path, .{ .encode = true }); + const result = try resolve(testing.arena_allocator, case.base, case.path, .{ .encoding = "UTF-8" }); try testing.expectString(case.expected, result); } } diff --git a/src/browser/interactive.zig b/src/browser/interactive.zig index a0b4528a..225633c7 100644 --- a/src/browser/interactive.zig +++ b/src/browser/interactive.zig @@ -182,7 +182,7 @@ pub fn collectInteractiveElements( .id = el.getAttributeSafe(comptime .wrap("id")), .class = el.getAttributeSafe(comptime .wrap("class")), .href = if (el.getAttributeSafe(comptime .wrap("href"))) |href| - URL.resolve(arena, page.base(), href, .{ .encode = true }) catch href + URL.resolve(arena, page.base(), href, .{ .encoding = page.charset }) catch href else null, .input_type = getInputType(el), diff --git a/src/browser/markdown.zig b/src/browser/markdown.zig index 5a83dfdc..437dbee6 100644 --- a/src/browser/markdown.zig +++ b/src/browser/markdown.zig @@ -278,7 +278,8 @@ const Context = struct { } try self.writer.writeAll("]("); if (el.getAttributeSafe(comptime .wrap("src"))) |src| { - const absolute_src = URL.resolve(self.page.call_arena, 
self.page.base(), src, .{ .encode = true }) catch src; + const page = self.page; + const absolute_src = URL.resolve(page.call_arena, page.base(), src, .{ .encoding = page.charset }) catch src; try self.writer.writeAll(absolute_src); } try self.writer.writeAll(")"); @@ -286,13 +287,14 @@ const Context = struct { return; }, .anchor => { + const page = self.page; const info = analyzeContent(el.asNode()); const label = getAnchorLabel(el); const href_raw = el.getAttributeSafe(comptime .wrap("href")); if (!info.has_visible and label == null and href_raw == null) return; - const href = if (href_raw) |h| URL.resolve(self.page.call_arena, self.page.base(), h, .{ .encode = true }) catch h else null; + const href = if (href_raw) |h| URL.resolve(page.call_arena, page.base(), h, .{ .encoding = page.charset }) catch h else null; if (info.has_block) { try self.renderChildren(el.asNode()); diff --git a/src/browser/parser/html5ever.zig b/src/browser/parser/html5ever.zig index cb673789..829ac429 100644 --- a/src/browser/parser/html5ever.zig +++ b/src/browser/parser/html5ever.zig @@ -278,3 +278,27 @@ pub extern "c" fn encoding_decoder_decode( ) DecodeResult; pub extern "c" fn encoding_decoder_free(decoder: *anyopaque) void; + +// Encoding API (UTF-8 to legacy encoding with NCR fallback) +pub const EncodeResult = extern struct { + status: u8, + bytes_read: usize, + bytes_written: usize, + + pub fn isSuccess(self: *const EncodeResult) bool { + return self.status == 0; + } +}; + +pub extern "c" fn encoding_encode_with_ncr( + handle: *anyopaque, + input: ?[*]const u8, + input_len: usize, + output: [*]u8, + output_capacity: usize, +) EncodeResult; + +pub extern "c" fn encoding_max_encode_buffer_length( + handle: *anyopaque, + input_len: usize, +) usize; diff --git a/src/browser/structured_data.zig b/src/browser/structured_data.zig index 9b6e7fbe..cad1d9d8 100644 --- a/src/browser/structured_data.zig +++ b/src/browser/structured_data.zig @@ -288,7 +288,7 @@ fn collectLink( ) !void { const 
rel = el.getAttributeSafe(comptime .wrap("rel")) orelse return; const raw_href = el.getAttributeSafe(comptime .wrap("href")) orelse return; - const href = URL.resolve(arena, page.base(), raw_href, .{ .encode = true }) catch raw_href; + const href = URL.resolve(arena, page.base(), raw_href, .{ .encoding = page.charset }) catch raw_href; if (std.ascii.eqlIgnoreCase(rel, "alternate")) { try alternate.append(arena, .{ diff --git a/src/browser/tests/page/encoding.html b/src/browser/tests/page/encoding.html index 19e0134f..b740a465 100644 --- a/src/browser/tests/page/encoding.html +++ b/src/browser/tests/page/encoding.html @@ -77,3 +77,32 @@ }); } + + diff --git a/src/browser/webapi/Node.zig b/src/browser/webapi/Node.zig index 0e7c2ffe..5871abee 100644 --- a/src/browser/webapi/Node.zig +++ b/src/browser/webapi/Node.zig @@ -22,6 +22,7 @@ const String = @import("../../string.zig").String; const js = @import("../js/js.zig"); const Page = @import("../Page.zig"); +const URL = @import("../URL.zig"); const reflect = @import("../reflect.zig"); const EventTarget = @import("EventTarget.zig"); @@ -511,6 +512,18 @@ pub fn ownerPage(self: *const Node, default: *Page) *Page { return doc._page orelse default; } +pub const ResolveURLOpts = struct { + allocator: ?Allocator = null, +}; + +// Resolve a URL relative to this node's owning document. +// Uses the document's charset for query string encoding (with NCR fallback for unmappable chars). 
+pub fn resolveURL(self: *const Node, url: anytype, page: *Page, opts: ResolveURLOpts) ![:0]const u8 { + const owner_page = self.ownerPage(page); + const allocator = opts.allocator orelse page.call_arena; + return URL.resolve(allocator, owner_page.base(), url, .{ .encoding = owner_page.charset }); +} + pub fn isSameDocumentAs(self: *const Node, other: *const Node, page: *const Page) bool { // Get the root document for each node const self_doc = if (self._type == .document) self._type.document else self.ownerDocument(page); diff --git a/src/browser/webapi/element/html/Anchor.zig b/src/browser/webapi/element/html/Anchor.zig index 33c8bded..e4207e84 100644 --- a/src/browser/webapi/element/html/Anchor.zig +++ b/src/browser/webapi/element/html/Anchor.zig @@ -39,12 +39,11 @@ pub fn asNode(self: *Anchor) *Node { } pub fn getHref(self: *Anchor, page: *Page) ![]const u8 { - const element = self.asElement(); - const href = element.getAttributeSafe(comptime .wrap("href")) orelse return ""; + const href = self.asElement().getAttributeSafe(comptime .wrap("href")) orelse return ""; if (href.len == 0) { return ""; } - return URL.resolve(page.call_arena, page.base(), href, .{ .encode = true }); + return self.asNode().resolveURL(href, page, .{}); } pub fn setHref(self: *Anchor, value: []const u8, page: *Page) !void { @@ -203,7 +202,7 @@ fn getResolvedHref(self: *Anchor, page: *Page) !?[:0]const u8 { if (href.len == 0) { return null; } - return try URL.resolve(page.call_arena, page.base(), href, .{}); + return try self.asNode().resolveURL(href, page, .{}); } pub const JsApi = struct { diff --git a/src/browser/webapi/element/html/Form.zig b/src/browser/webapi/element/html/Form.zig index e8857e48..6628306b 100644 --- a/src/browser/webapi/element/html/Form.zig +++ b/src/browser/webapi/element/html/Form.zig @@ -97,7 +97,7 @@ pub fn getAction(self: *Form, page: *Page) ![]const u8 { if (action.len == 0) { return page.url; } - return URL.resolve(page.call_arena, page.base(), action, .{ 
.encode = true }); + return element.asNode().resolveURL(action, page, .{}); } pub fn setAction(self: *Form, value: []const u8, page: *Page) !void { diff --git a/src/browser/webapi/element/html/IFrame.zig b/src/browser/webapi/element/html/IFrame.zig index e596f4ac..3b276dcd 100644 --- a/src/browser/webapi/element/html/IFrame.zig +++ b/src/browser/webapi/element/html/IFrame.zig @@ -48,9 +48,9 @@ pub fn getContentDocument(self: *const IFrame) ?*Document { return window._document; } -pub fn getSrc(self: *const IFrame, page: *Page) ![:0]const u8 { +pub fn getSrc(self: *IFrame, page: *Page) ![:0]const u8 { if (self._src.len == 0) return ""; - return try URL.resolve(page.call_arena, page.base(), self._src, .{ .encode = true }); + return self.asNode().resolveURL(self._src, page, .{}); } pub fn setSrc(self: *IFrame, src: []const u8, page: *Page) !void { diff --git a/src/browser/webapi/element/html/Image.zig b/src/browser/webapi/element/html/Image.zig index b6731144..e3b57cd9 100644 --- a/src/browser/webapi/element/html/Image.zig +++ b/src/browser/webapi/element/html/Image.zig @@ -40,9 +40,7 @@ pub fn getSrc(self: *const Image, page: *Page) ![]const u8 { if (src.len == 0) { return ""; } - - // Always resolve the src against the page URL - return URL.resolve(page.call_arena, page.base(), src, .{ .encode = true }); + return element.asConstNode().resolveURL(src, page, .{}); } pub fn setSrc(self: *Image, value: []const u8, page: *Page) !void { diff --git a/src/browser/webapi/element/html/Link.zig b/src/browser/webapi/element/html/Link.zig index ed3839f2..5b6ce0c6 100644 --- a/src/browser/webapi/element/html/Link.zig +++ b/src/browser/webapi/element/html/Link.zig @@ -44,9 +44,7 @@ pub fn getHref(self: *Link, page: *Page) ![]const u8 { if (href.len == 0) { return ""; } - - // Always resolve the href against the page URL - return URL.resolve(page.call_arena, page.base(), href, .{ .encode = true }); + return element.asNode().resolveURL(href, page, .{}); } pub fn setHref(self: *Link, 
value: []const u8, page: *Page) !void { diff --git a/src/browser/webapi/element/html/Media.zig b/src/browser/webapi/element/html/Media.zig index 71013e71..6d62013f 100644 --- a/src/browser/webapi/element/html/Media.zig +++ b/src/browser/webapi/element/html/Media.zig @@ -235,8 +235,7 @@ pub fn getSrc(self: *const Media, page: *Page) ![]const u8 { if (src.len == 0) { return ""; } - const URL = @import("../../URL.zig"); - return URL.resolve(page.call_arena, page.base(), src, .{ .encode = true }); + return element.asConstNode().resolveURL(src, page, .{}); } pub fn setSrc(self: *Media, value: []const u8, page: *Page) !void { diff --git a/src/browser/webapi/element/html/Script.zig b/src/browser/webapi/element/html/Script.zig index d5e83b4f..77b6b7ef 100644 --- a/src/browser/webapi/element/html/Script.zig +++ b/src/browser/webapi/element/html/Script.zig @@ -45,9 +45,9 @@ pub fn asNode(self: *Script) *Node { return self.asElement().asNode(); } -pub fn getSrc(self: *const Script, page: *Page) ![]const u8 { +pub fn getSrc(self: *Script, page: *Page) ![]const u8 { if (self._src.len == 0) return ""; - return try URL.resolve(page.call_arena, page.base(), self._src, .{ .encode = true }); + return self.asNode().resolveURL(self._src, page, .{}); } pub fn setSrc(self: *Script, src: []const u8, page: *Page) !void { diff --git a/src/browser/webapi/element/html/Video.zig b/src/browser/webapi/element/html/Video.zig index 63ccda4a..8fabb3ae 100644 --- a/src/browser/webapi/element/html/Video.zig +++ b/src/browser/webapi/element/html/Video.zig @@ -57,9 +57,7 @@ pub fn getPoster(self: *const Video, page: *Page) ![]const u8 { if (poster.len == 0) { return ""; } - - const URL = @import("../../URL.zig"); - return URL.resolve(page.call_arena, page.base(), poster, .{ .encode = true }); + return element.asConstNode().resolveURL(poster, page, .{}); } pub fn setPoster(self: *Video, value: []const u8, page: *Page) !void { diff --git a/src/browser/webapi/net/WebSocket.zig 
b/src/browser/webapi/net/WebSocket.zig index 1244a61e..c5228627 100644 --- a/src/browser/webapi/net/WebSocket.zig +++ b/src/browser/webapi/net/WebSocket.zig @@ -108,7 +108,7 @@ pub fn init(url: []const u8, protocols_: ?[]const u8, page: *Page) !*WebSocket { const arena = try page.getArena(.{ .debug = "WebSocket" }); errdefer page.releaseArena(arena); - const resolved_url = try URL.resolve(arena, page.base(), url, .{ .always_dupe = true, .encode = true }); + const resolved_url = try URL.resolve(arena, page.base(), url, .{ .always_dupe = true, .encoding = page.charset }); const http_client = page._session.browser.http_client; const conn = http_client.network.newConnection() orelse { diff --git a/src/browser/webapi/net/XMLHttpRequest.zig b/src/browser/webapi/net/XMLHttpRequest.zig index 62e05a17..8a56d370 100644 --- a/src/browser/webapi/net/XMLHttpRequest.zig +++ b/src/browser/webapi/net/XMLHttpRequest.zig @@ -210,7 +210,7 @@ pub fn open(self: *XMLHttpRequest, method_: []const u8, url: [:0]const u8) !void const page = self._page; self._method = try parseMethod(method_); - self._url = try URL.resolve(self._arena, page.base(), url, .{ .always_dupe = true, .encode = true }); + self._url = try URL.resolve(self._arena, page.base(), url, .{ .always_dupe = true, .encoding = page.charset }); try self.stateChanged(.opened, page); } diff --git a/src/cdp/domains/page.zig b/src/cdp/domains/page.zig index 267cada8..beb86c6b 100644 --- a/src/cdp/domains/page.zig +++ b/src/cdp/domains/page.zig @@ -286,7 +286,7 @@ fn navigate(cmd: *CDP.Command) !void { page = try session.replacePage(); } - const encoded_url = try URL.ensureEncoded(page.call_arena, params.url); + const encoded_url = try URL.ensureEncoded(page.call_arena, params.url, "UTF-8"); try page.navigate(encoded_url, .{ .reason = .address_bar, .cdp_id = cmd.input.id, diff --git a/src/cdp/domains/target.zig b/src/cdp/domains/target.zig index bce7e00d..822659f7 100644 --- a/src/cdp/domains/target.zig +++ 
b/src/cdp/domains/target.zig @@ -220,7 +220,7 @@ fn createTarget(cmd: *CDP.Command) !void { } if (!std.mem.eql(u8, "about:blank", params.url)) { - const encoded_url = try URL.ensureEncoded(page.call_arena, params.url); + const encoded_url = try URL.ensureEncoded(page.call_arena, params.url, "UTF-8"); try page.navigate( encoded_url, .{ .reason = .address_bar, .kind = .{ .push = null } }, diff --git a/src/html5ever/lib.rs b/src/html5ever/lib.rs index c684c039..9d14e784 100644 --- a/src/html5ever/lib.rs +++ b/src/html5ever/lib.rs @@ -334,6 +334,120 @@ pub extern "C" fn encoding_decoder_free(decoder_ptr: *mut c_void) { } } +// === Encoding API (UTF-8 to legacy encoding with NCR fallback) === + +/// Result of encoding operation +#[repr(C)] +pub struct EncodeResult { + /// 0 = success, 1 = output buffer too small + pub status: u8, + /// Number of input bytes consumed + pub bytes_read: usize, + /// Number of bytes written to output buffer + pub bytes_written: usize, +} + +/// Encode UTF-8 to a legacy encoding, replacing unencodable characters with +/// HTML decimal numeric character references (&#codepoint;). +/// +/// This is used for URL query string encoding per WHATWG URL spec. +/// encoding_rs's encode_from_utf8 already produces NCRs for unmappable chars. 
+#[no_mangle] +pub extern "C" fn encoding_encode_with_ncr( + handle: *const c_void, + input: *const c_uchar, + input_len: usize, + output: *mut c_uchar, + output_capacity: usize, +) -> EncodeResult { + if handle.is_null() || output.is_null() { + return EncodeResult { + status: 1, + bytes_read: 0, + bytes_written: 0, + }; + } + + let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) }; + + let input_str = if input.is_null() || input_len == 0 { + "" + } else { + let bytes = unsafe { std::slice::from_raw_parts(input, input_len) }; + match std::str::from_utf8(bytes) { + Ok(s) => s, + Err(_) => { + return EncodeResult { + status: 1, + bytes_read: 0, + bytes_written: 0, + }; + } + } + }; + + // For UTF-8 encoding, just copy directly (no NCR needed) + if encoding == encoding_rs::UTF_8 { + if input_len > output_capacity { + return EncodeResult { + bytes_read: 0, + bytes_written: 0, + status: 1, + }; + } + let output_slice = unsafe { std::slice::from_raw_parts_mut(output, output_capacity) }; + output_slice[..input_len].copy_from_slice(input_str.as_bytes()); + return EncodeResult { + bytes_read: input_len, + bytes_written: input_len, + status: 0, + }; + } + + let output_slice = unsafe { std::slice::from_raw_parts_mut(output, output_capacity) }; + let mut encoder = encoding.new_encoder(); + + // encode_from_utf8 automatically produces NCRs for unmappable characters + let (result, bytes_read, bytes_written, _had_unmappables) = + encoder.encode_from_utf8(input_str, output_slice, true); + + match result { + encoding_rs::CoderResult::InputEmpty => EncodeResult { + bytes_read, + bytes_written, + status: 0, + }, + encoding_rs::CoderResult::OutputFull => EncodeResult { + bytes_read, + bytes_written, + status: 1, + }, + } +} + +/// Calculate maximum output buffer size needed for encoding with NCR fallback. +/// Worst case: every character becomes &#codepoint; where codepoint is up to 7 digits. 
+#[no_mangle]
+pub extern "C" fn encoding_max_encode_buffer_length(
+    handle: *const c_void,
+    input_len: usize,
+) -> usize {
+    if handle.is_null() {
+        return 0;
+    }
+    let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) };
+    let encoder = encoding.new_encoder();
+    // `encode_from_utf8` substitutes an NCR for every unmappable character, and the
+    // longest NCR (`&#1114111;`) is 10 bytes, so budget 10 bytes per input byte on top
+    // of the no-unmappables bound; doubling it was too small (U+07FF: 2 bytes in, 7 out).
+    // Saturating arithmetic avoids an overflow panic across the FFI boundary.
+    let ncr_worst_case = input_len.saturating_mul(10);
+    encoder
+        .max_buffer_length_from_utf8_if_no_unmappables(input_len)
+        .map(|len| len.saturating_add(ncr_worst_case))
+        .unwrap_or(ncr_worst_case)
+}
+
 #[no_mangle] pub extern "C" fn html5ever_parse_fragment( html: *mut c_uchar, diff --git a/src/lightpanda.zig b/src/lightpanda.zig index 4d6c23fb..b0356e93 100644 --- a/src/lightpanda.zig +++ b/src/lightpanda.zig @@ -107,7 +107,7 @@ pub fn fetch(app: *App, url: [:0]const u8, opts: FetchOpts) !void { // } // } - const encoded_url = try URL.ensureEncoded(page.call_arena, url); + const encoded_url = try URL.ensureEncoded(page.call_arena, url, "UTF-8"); _ = try page.navigate(encoded_url, .{ .reason = .address_bar, .kind = .{ .push = null },