Use the document's charset to determine if/how to encode querystring

Whenever we resolve a URL, say from `anchor.href`, we should consider the document's charset when encoding the querystring. This probably isn't the most important feature, but it makes tens of thousands of WPT cases pass, e.g. /encoding/legacy-mb-tchinese/big5/big5-encode-href-errors-han.html?3001-4000 and /encoding/legacy-mb-japanese/euc-jp/eucjp-encode-href-errors-han.html?17001-18000. DOM elements previously called `URL.resolve(...)`. They now call `self.asNode().resolveURL(...)`, where `Node#resolveURL` will provide the document's charset.
2026-06-11 09:35:59 -04:00 · 2026-04-10 14:41:19 +08:00
parent f7c1710c23
commit 05229fdc53
23 changed files with 276 additions and 53 deletions
--- a/src/SemanticTree.zig
+++ b/src/SemanticTree.zig
@@ -671,7 +671,7 @@ pub fn getNodeDetails(

        if (el.getAttributeSafe(comptime .wrap("href"))) |h| {
            const URL = lp.URL;
-            href = URL.resolve(arena, page.base(), h, .{ .encode = true }) catch h;
+            href = URL.resolve(arena, page.base(), h, .{ .encoding = page.charset }) catch h;
        }

        if (el.is(Element.Html.Input)) |input| {
--- a/src/browser/Page.zig
+++ b/src/browser/Page.zig
@@ -661,7 +661,7 @@ fn scheduleNavigationWithArena(originator: *Page, arena: Allocator, request_url:
            arena,
            page_base,
            request_url,
-            .{ .always_dupe = true, .encode = true },
+            .{ .always_dupe = true, .encoding = originator.charset },
        );
        break :blk .{ u, false };
    };
@@ -1196,7 +1196,7 @@ pub fn iframeAddedCallback(self: *Page, iframe: *IFrame) !void {
            self.call_arena, // ok to use, page.navigate dupes this
            self.base(),
            src,
-            .{ .encode = true },
+            .{ .encoding = self.charset },
        );
    };

--- a/src/browser/URL.zig
+++ b/src/browser/URL.zig
@@ -19,16 +19,19 @@
 const std = @import("std");
 const Allocator = std.mem.Allocator;

-const ResolveOpts = struct {
-    encode: bool = false,
+pub const ResolveOpts = struct {
+    /// null = don't encode, "UTF-8" = standard percent encoding,
+    /// other charset = encode query string using that charset with NCR fallback
+    encoding: ?[]const u8 = null,
    always_dupe: bool = false,
 };

 // path is anytype, so that it can be used with both []const u8 and [:0]const u8
-pub fn resolve(allocator: Allocator, base: [:0]const u8, source_path: anytype, comptime opts: ResolveOpts) ![:0]const u8 {
+pub fn resolve(allocator: Allocator, base: [:0]const u8, source_path: anytype, opts: ResolveOpts) ![:0]const u8 {
    const PT = @TypeOf(source_path);

-    var path: [:0]const u8 = if (comptime !isNullTerminated(PT) or opts.always_dupe) try allocator.dupeZ(u8, source_path) else source_path;
+    const needs_dupe = comptime !isNullTerminated(PT);
+    var path: [:0]const u8 = if (needs_dupe or opts.always_dupe) try allocator.dupeZ(u8, source_path) else source_path;

    if (base.len == 0) {
        return processResolved(allocator, path, opts);
@@ -186,14 +189,12 @@ pub fn resolve(allocator: Allocator, base: [:0]const u8, source_path: anytype, c
    return processResolved(allocator, out[0..out_i :0], opts);
 }

-fn processResolved(allocator: Allocator, url: [:0]const u8, comptime opts: ResolveOpts) ![:0]const u8 {
-    if (!comptime opts.encode) {
-        return url;
-    }
-    return ensureEncoded(allocator, url);
+fn processResolved(allocator: Allocator, url: [:0]const u8, opts: ResolveOpts) ![:0]const u8 {
+    const encoding = opts.encoding orelse return url;
+    return ensureEncoded(allocator, url, encoding);
 }

-pub fn ensureEncoded(allocator: Allocator, url: [:0]const u8) ![:0]const u8 {
+pub fn ensureEncoded(allocator: Allocator, url: [:0]const u8, encoding: []const u8) ![:0]const u8 {
    const scheme_end = std.mem.indexOf(u8, url, "://");
    const authority_start = if (scheme_end) |end| end + 3 else 0;
    const path_start = std.mem.indexOfScalarPos(u8, url, authority_start, '/') orelse return url;
@@ -205,18 +206,18 @@ pub fn ensureEncoded(allocator: Allocator, url: [:0]const u8) ![:0]const u8 {
    const query_end = if (query_start) |_| (fragment_start orelse url.len) else path_end;

    const path_to_encode = url[path_start..path_end];
+    // Path is always UTF-8 percent encoded per URL spec
    const encoded_path = try percentEncodeSegment(allocator, path_to_encode, .path);

+    // Query string uses document encoding
    const encoded_query = if (query_start) |qs| blk: {
        const query_to_encode = url[qs + 1 .. query_end];
-        const encoded = try percentEncodeSegment(allocator, query_to_encode, .query);
-        break :blk encoded;
+        break :blk try encodeQueryString(allocator, query_to_encode, encoding);
    } else null;

    const encoded_fragment = if (fragment_start) |fs| blk: {
        const fragment_to_encode = url[fs + 1 ..];
-        const encoded = try percentEncodeSegment(allocator, fragment_to_encode, .query);
-        break :blk encoded;
+        break :blk try percentEncodeSegment(allocator, fragment_to_encode, .query);
    } else null;

    if (encoded_path.ptr == path_to_encode.ptr and
@@ -242,7 +243,7 @@ pub fn ensureEncoded(allocator: Allocator, url: [:0]const u8) ![:0]const u8 {
    return buf.items[0 .. buf.items.len - 1 :0];
 }

-const EncodeSet = enum { path, query, userinfo, fragment };
+const EncodeSet = enum { path, query, query_legacy, userinfo, fragment };

 fn percentEncodeSegment(allocator: Allocator, segment: []const u8, comptime encode_set: EncodeSet) ![]const u8 {
    // Check if encoding is needed
@@ -285,17 +286,65 @@ fn percentEncodeSegment(allocator: Allocator, segment: []const u8, comptime enco
    return buf.items;
 }

+const h5e = @import("parser/html5ever.zig");
+
+/// Encode a query string using the document encoding named by `encoding`.
+/// UTF-8 (any letter case) gets standard percent encoding; so does any failure below (best-effort fallback).
+/// For legacy encodings, unmappable characters become NCRs (&#codepoint;) which are then percent-encoded.
+fn encodeQueryString(allocator: Allocator, query: []const u8, encoding: []const u8) ![]const u8 {
+    // Match UTF-8 case-insensitively: the legacy path would percent-encode every literal '&' and ';'
+    if (std.ascii.eqlIgnoreCase(encoding, "UTF-8")) {
+        return percentEncodeSegment(allocator, query, .query);
+    }
+
+    // For legacy encodings, first encode to the target charset with NCR fallback
+    const enc_info = h5e.encoding_for_label(encoding.ptr, encoding.len);
+    if (!enc_info.isValid()) {
+        // Unknown encoding, fall back to UTF-8
+        return percentEncodeSegment(allocator, query, .query);
+    }
+
+    // Calculate max buffer size for encoded output
+    const max_encoded_len = h5e.encoding_max_encode_buffer_length(enc_info.handle.?, query.len);
+    if (max_encoded_len == 0) {
+        return percentEncodeSegment(allocator, query, .query);
+    }
+
+    const encode_buf = try allocator.alloc(u8, max_encoded_len);
+    defer allocator.free(encode_buf);
+
+    // Encode UTF-8 to legacy encoding with NCR fallback
+    const result = h5e.encoding_encode_with_ncr(
+        enc_info.handle.?,
+        query.ptr,
+        query.len,
+        encode_buf.ptr,
+        encode_buf.len,
+    );
+
+    if (!result.isSuccess()) {
+        // Encoding failed, fall back to UTF-8
+        return percentEncodeSegment(allocator, query, .query);
+    }
+
+    // Now percent-encode the result using query_legacy to preserve NCRs
+    const encoded_bytes = encode_buf[0..result.bytes_written];
+    return percentEncodeSegment(allocator, encoded_bytes, .query_legacy);
+}
+
 fn shouldPercentEncode(c: u8, comptime encode_set: EncodeSet) bool {
    return switch (c) {
        // Unreserved characters (RFC 3986)
        'A'...'Z', 'a'...'z', '0'...'9', '-', '.', '_', '~' => false,
-        // sub-delims allowed in path/query but some must be encoded in userinfo
-        '!', '$', '&', '\'', '(', ')', '*', '+', ',' => false,
-        ';', '=' => encode_set == .userinfo,
+        // sub-delims allowed in path/query but some must be encoded in userinfo/query_legacy
+        '!', '$', '\'', '(', ')', '*', '+', ',' => false,
+        // '&' and ';' must be encoded for legacy encoding (to preserve NCRs like &#nnnnn;)
+        '&', ';' => encode_set == .userinfo or encode_set == .query_legacy,
+        '=' => encode_set == .userinfo,
        // Separators: userinfo must encode these
        '/', ':', '@' => encode_set == .userinfo,
        // '?' is allowed in queries only
-        '?' => encode_set != .query,
+        '?' => encode_set != .query and encode_set != .query_legacy,
        // '#' is allowed in fragments only
        '#' => encode_set != .fragment,
        // Everything else needs encoding (including space)
@@ -1130,7 +1179,7 @@ test "URL: ensureEncoded" {
    };

    for (cases) |case| {
-        const result = try ensureEncoded(testing.arena_allocator, case.url);
+        const result = try ensureEncoded(testing.arena_allocator, case.url, "UTF-8");
        try testing.expectString(case.expected, result);
    }
 }
@@ -1296,7 +1345,7 @@ test "URL: resolve with encoding" {
    };

    for (cases) |case| {
-        const result = try resolve(testing.arena_allocator, case.base, case.path, .{ .encode = true });
+        const result = try resolve(testing.arena_allocator, case.base, case.path, .{ .encoding = "UTF-8" });
        try testing.expectString(case.expected, result);
    }
 }
--- a/src/browser/interactive.zig
+++ b/src/browser/interactive.zig
@@ -182,7 +182,7 @@ pub fn collectInteractiveElements(
            .id = el.getAttributeSafe(comptime .wrap("id")),
            .class = el.getAttributeSafe(comptime .wrap("class")),
            .href = if (el.getAttributeSafe(comptime .wrap("href"))) |href|
-                URL.resolve(arena, page.base(), href, .{ .encode = true }) catch href
+                URL.resolve(arena, page.base(), href, .{ .encoding = page.charset }) catch href
            else
                null,
            .input_type = getInputType(el),
--- a/src/browser/markdown.zig
+++ b/src/browser/markdown.zig
@@ -278,7 +278,8 @@ const Context = struct {
                }
                try self.writer.writeAll("](");
                if (el.getAttributeSafe(comptime .wrap("src"))) |src| {
-                    const absolute_src = URL.resolve(self.page.call_arena, self.page.base(), src, .{ .encode = true }) catch src;
+                    const page = self.page;
+                    const absolute_src = URL.resolve(page.call_arena, page.base(), src, .{ .encoding = page.charset }) catch src;
                    try self.writer.writeAll(absolute_src);
                }
                try self.writer.writeAll(")");
@@ -286,13 +287,14 @@ const Context = struct {
                return;
            },
            .anchor => {
+                const page = self.page;
                const info = analyzeContent(el.asNode());
                const label = getAnchorLabel(el);
                const href_raw = el.getAttributeSafe(comptime .wrap("href"));

                if (!info.has_visible and label == null and href_raw == null) return;

-                const href = if (href_raw) |h| URL.resolve(self.page.call_arena, self.page.base(), h, .{ .encode = true }) catch h else null;
+                const href = if (href_raw) |h| URL.resolve(page.call_arena, page.base(), h, .{ .encoding = page.charset }) catch h else null;

                if (info.has_block) {
                    try self.renderChildren(el.asNode());
--- a/src/browser/parser/html5ever.zig
+++ b/src/browser/parser/html5ever.zig
@@ -278,3 +278,27 @@ pub extern "c" fn encoding_decoder_decode(
 ) DecodeResult;

 pub extern "c" fn encoding_decoder_free(decoder: *anyopaque) void;
+
+// Encoding API (UTF-8 to legacy encoding with NCR fallback)
+pub const EncodeResult = extern struct {
+    status: u8,
+    bytes_read: usize,
+    bytes_written: usize,
+
+    pub fn isSuccess(self: *const EncodeResult) bool {
+        return self.status == 0;
+    }
+};
+
+pub extern "c" fn encoding_encode_with_ncr(
+    handle: *anyopaque,
+    input: ?[*]const u8,
+    input_len: usize,
+    output: [*]u8,
+    output_capacity: usize,
+) EncodeResult;
+
+pub extern "c" fn encoding_max_encode_buffer_length(
+    handle: *anyopaque,
+    input_len: usize,
+) usize;
--- a/src/browser/structured_data.zig
+++ b/src/browser/structured_data.zig
@@ -288,7 +288,7 @@ fn collectLink(
 ) !void {
    const rel = el.getAttributeSafe(comptime .wrap("rel")) orelse return;
    const raw_href = el.getAttributeSafe(comptime .wrap("href")) orelse return;
-    const href = URL.resolve(arena, page.base(), raw_href, .{ .encode = true }) catch raw_href;
+    const href = URL.resolve(arena, page.base(), raw_href, .{ .encoding = page.charset }) catch raw_href;

    if (std.ascii.eqlIgnoreCase(rel, "alternate")) {
        try alternate.append(arena, .{
--- a/src/browser/tests/page/encoding.html
+++ b/src/browser/tests/page/encoding.html
@@ -77,3 +77,32 @@
    });
  }
 </script>
+
+<script id="anchor_href_encoding_with_ncr">
+  {
+    // Test that anchor.href encodes unmappable characters as NCRs in non-UTF-8 documents.
+    // When a character can't be represented in the document's encoding, it should become &#nnnnn;
+    // Per WHATWG URL Standard, query strings use document encoding with NCR fallback.
+    const iframe = document.createElement('iframe');
+    document.body.appendChild(iframe);
+    iframe.src = 'encoding/gbk.html';
+
+    testing.onload(() => {
+      testing.expectEqual('GBK', iframe.contentDocument.characterSet);
+
+      // Test 1: U+3D34 (㴴) - a Han character NOT in GBK, should become NCR &#15668;
+      const anchor = iframe.contentDocument.createElement('a');
+      iframe.contentDocument.body.appendChild(anchor);
+      anchor.href = 'http://example.com/?q=\u3D34';
+      // The NCR &#15668; percent-encoded is %26%2315668%3B
+      testing.expectEqual('http://example.com/?q=%26%2315668%3B', anchor.href);
+
+      // Test 2: U+4E2D (中) - IS in GBK, should encode to GBK bytes D6D0 then percent-encode
+      const anchor2 = iframe.contentDocument.createElement('a');
+      iframe.contentDocument.body.appendChild(anchor2);
+      anchor2.href = 'http://example.com/?q=\u4E2D';
+      // GBK encoding of 中 is D6 D0, percent-encoded as %D6%D0
+      testing.expectEqual('http://example.com/?q=%D6%D0', anchor2.href);
+    });
+  }
+</script>
--- a/src/browser/webapi/Node.zig
+++ b/src/browser/webapi/Node.zig
@@ -22,6 +22,7 @@ const String = @import("../../string.zig").String;

 const js = @import("../js/js.zig");
 const Page = @import("../Page.zig");
+const URL = @import("../URL.zig");
 const reflect = @import("../reflect.zig");

 const EventTarget = @import("EventTarget.zig");
@@ -511,6 +512,18 @@ pub fn ownerPage(self: *const Node, default: *Page) *Page {
    return doc._page orelse default;
 }

+pub const ResolveURLOpts = struct {
+    allocator: ?Allocator = null,
+};
+
+// Resolve a URL relative to this node's owning document.
+// Uses the document's charset for query string encoding (with NCR fallback for unmappable chars).
+pub fn resolveURL(self: *const Node, url: anytype, page: *Page, opts: ResolveURLOpts) ![:0]const u8 {
+    const owner_page = self.ownerPage(page);
+    const allocator = opts.allocator orelse page.call_arena;
+    return URL.resolve(allocator, owner_page.base(), url, .{ .encoding = owner_page.charset });
+}
+
 pub fn isSameDocumentAs(self: *const Node, other: *const Node, page: *const Page) bool {
    // Get the root document for each node
    const self_doc = if (self._type == .document) self._type.document else self.ownerDocument(page);
--- a/src/browser/webapi/element/html/Anchor.zig
+++ b/src/browser/webapi/element/html/Anchor.zig
@@ -39,12 +39,11 @@ pub fn asNode(self: *Anchor) *Node {
 }

 pub fn getHref(self: *Anchor, page: *Page) ![]const u8 {
-    const element = self.asElement();
-    const href = element.getAttributeSafe(comptime .wrap("href")) orelse return "";
+    const href = self.asElement().getAttributeSafe(comptime .wrap("href")) orelse return "";
    if (href.len == 0) {
        return "";
    }
-    return URL.resolve(page.call_arena, page.base(), href, .{ .encode = true });
+    return self.asNode().resolveURL(href, page, .{});
 }

 pub fn setHref(self: *Anchor, value: []const u8, page: *Page) !void {
@@ -203,7 +202,7 @@ fn getResolvedHref(self: *Anchor, page: *Page) !?[:0]const u8 {
    if (href.len == 0) {
        return null;
    }
-    return try URL.resolve(page.call_arena, page.base(), href, .{});
+    return try self.asNode().resolveURL(href, page, .{});
 }

 pub const JsApi = struct {
--- a/src/browser/webapi/element/html/Form.zig
+++ b/src/browser/webapi/element/html/Form.zig
@@ -97,7 +97,7 @@ pub fn getAction(self: *Form, page: *Page) ![]const u8 {
    if (action.len == 0) {
        return page.url;
    }
-    return URL.resolve(page.call_arena, page.base(), action, .{ .encode = true });
+    return element.asNode().resolveURL(action, page, .{});
 }

 pub fn setAction(self: *Form, value: []const u8, page: *Page) !void {
--- a/src/browser/webapi/element/html/IFrame.zig
+++ b/src/browser/webapi/element/html/IFrame.zig
@@ -48,9 +48,9 @@ pub fn getContentDocument(self: *const IFrame) ?*Document {
    return window._document;
 }

-pub fn getSrc(self: *const IFrame, page: *Page) ![:0]const u8 {
+pub fn getSrc(self: *IFrame, page: *Page) ![:0]const u8 {
    if (self._src.len == 0) return "";
-    return try URL.resolve(page.call_arena, page.base(), self._src, .{ .encode = true });
+    return self.asNode().resolveURL(self._src, page, .{});
 }

 pub fn setSrc(self: *IFrame, src: []const u8, page: *Page) !void {
--- a/src/browser/webapi/element/html/Image.zig
+++ b/src/browser/webapi/element/html/Image.zig
@@ -40,9 +40,7 @@ pub fn getSrc(self: *const Image, page: *Page) ![]const u8 {
    if (src.len == 0) {
        return "";
    }
-
-    // Always resolve the src against the page URL
-    return URL.resolve(page.call_arena, page.base(), src, .{ .encode = true });
+    return element.asConstNode().resolveURL(src, page, .{});
 }

 pub fn setSrc(self: *Image, value: []const u8, page: *Page) !void {
--- a/src/browser/webapi/element/html/Link.zig
+++ b/src/browser/webapi/element/html/Link.zig
@@ -44,9 +44,7 @@ pub fn getHref(self: *Link, page: *Page) ![]const u8 {
    if (href.len == 0) {
        return "";
    }
-
-    // Always resolve the href against the page URL
-    return URL.resolve(page.call_arena, page.base(), href, .{ .encode = true });
+    return element.asNode().resolveURL(href, page, .{});
 }

 pub fn setHref(self: *Link, value: []const u8, page: *Page) !void {
--- a/src/browser/webapi/element/html/Media.zig
+++ b/src/browser/webapi/element/html/Media.zig
@@ -235,8 +235,7 @@ pub fn getSrc(self: *const Media, page: *Page) ![]const u8 {
    if (src.len == 0) {
        return "";
    }
-    const URL = @import("../../URL.zig");
-    return URL.resolve(page.call_arena, page.base(), src, .{ .encode = true });
+    return element.asConstNode().resolveURL(src, page, .{});
 }

 pub fn setSrc(self: *Media, value: []const u8, page: *Page) !void {
--- a/src/browser/webapi/element/html/Script.zig
+++ b/src/browser/webapi/element/html/Script.zig
@@ -45,9 +45,9 @@ pub fn asNode(self: *Script) *Node {
    return self.asElement().asNode();
 }

-pub fn getSrc(self: *const Script, page: *Page) ![]const u8 {
+pub fn getSrc(self: *Script, page: *Page) ![]const u8 {
    if (self._src.len == 0) return "";
-    return try URL.resolve(page.call_arena, page.base(), self._src, .{ .encode = true });
+    return self.asNode().resolveURL(self._src, page, .{});
 }

 pub fn setSrc(self: *Script, src: []const u8, page: *Page) !void {
--- a/src/browser/webapi/element/html/Video.zig
+++ b/src/browser/webapi/element/html/Video.zig
@@ -57,9 +57,7 @@ pub fn getPoster(self: *const Video, page: *Page) ![]const u8 {
    if (poster.len == 0) {
        return "";
    }
-
-    const URL = @import("../../URL.zig");
-    return URL.resolve(page.call_arena, page.base(), poster, .{ .encode = true });
+    return element.asConstNode().resolveURL(poster, page, .{});
 }

 pub fn setPoster(self: *Video, value: []const u8, page: *Page) !void {
--- a/src/browser/webapi/net/WebSocket.zig
+++ b/src/browser/webapi/net/WebSocket.zig
@@ -108,7 +108,7 @@ pub fn init(url: []const u8, protocols_: ?[]const u8, page: *Page) !*WebSocket {
    const arena = try page.getArena(.{ .debug = "WebSocket" });
    errdefer page.releaseArena(arena);

-    const resolved_url = try URL.resolve(arena, page.base(), url, .{ .always_dupe = true, .encode = true });
+    const resolved_url = try URL.resolve(arena, page.base(), url, .{ .always_dupe = true, .encoding = page.charset });

    const http_client = page._session.browser.http_client;
    const conn = http_client.network.newConnection() orelse {
--- a/src/browser/webapi/net/XMLHttpRequest.zig
+++ b/src/browser/webapi/net/XMLHttpRequest.zig
@@ -210,7 +210,7 @@ pub fn open(self: *XMLHttpRequest, method_: []const u8, url: [:0]const u8) !void

    const page = self._page;
    self._method = try parseMethod(method_);
-    self._url = try URL.resolve(self._arena, page.base(), url, .{ .always_dupe = true, .encode = true });
+    self._url = try URL.resolve(self._arena, page.base(), url, .{ .always_dupe = true, .encoding = page.charset });
    try self.stateChanged(.opened, page);
 }

--- a/src/cdp/domains/page.zig
+++ b/src/cdp/domains/page.zig
@@ -286,7 +286,7 @@ fn navigate(cmd: *CDP.Command) !void {
        page = try session.replacePage();
    }

-    const encoded_url = try URL.ensureEncoded(page.call_arena, params.url);
+    const encoded_url = try URL.ensureEncoded(page.call_arena, params.url, "UTF-8");
    try page.navigate(encoded_url, .{
        .reason = .address_bar,
        .cdp_id = cmd.input.id,
--- a/src/cdp/domains/target.zig
+++ b/src/cdp/domains/target.zig
@@ -220,7 +220,7 @@ fn createTarget(cmd: *CDP.Command) !void {
    }

    if (!std.mem.eql(u8, "about:blank", params.url)) {
-        const encoded_url = try URL.ensureEncoded(page.call_arena, params.url);
+        const encoded_url = try URL.ensureEncoded(page.call_arena, params.url, "UTF-8");
        try page.navigate(
            encoded_url,
            .{ .reason = .address_bar, .kind = .{ .push = null } },
--- a/src/html5ever/lib.rs
+++ b/src/html5ever/lib.rs
@@ -334,6 +334,120 @@ pub extern "C" fn encoding_decoder_free(decoder_ptr: *mut c_void) {
    }
 }

+// === Encoding API (UTF-8 to legacy encoding with NCR fallback) ===
+
+/// Result of encoding operation
+#[repr(C)]
+pub struct EncodeResult {
+    /// 0 = success, 1 = output buffer too small
+    pub status: u8,
+    /// Number of input bytes consumed
+    pub bytes_read: usize,
+    /// Number of bytes written to output buffer
+    pub bytes_written: usize,
+}
+
+/// Encode UTF-8 to a legacy encoding, replacing unencodable characters with
+/// HTML decimal numeric character references (&#codepoint;).
+///
+/// This is used for URL query string encoding per WHATWG URL spec.
+/// encoding_rs's encode_from_utf8 already produces NCRs for unmappable chars.
+#[no_mangle]
+pub extern "C" fn encoding_encode_with_ncr(
+    handle: *const c_void,
+    input: *const c_uchar,
+    input_len: usize,
+    output: *mut c_uchar,
+    output_capacity: usize,
+) -> EncodeResult {
+    if handle.is_null() || output.is_null() {
+        return EncodeResult {
+            status: 1,
+            bytes_read: 0,
+            bytes_written: 0,
+        };
+    }
+
+    let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) };
+
+    let input_str = if input.is_null() || input_len == 0 {
+        ""
+    } else {
+        let bytes = unsafe { std::slice::from_raw_parts(input, input_len) };
+        match std::str::from_utf8(bytes) {
+            Ok(s) => s,
+            Err(_) => {
+                return EncodeResult {
+                    status: 1,
+                    bytes_read: 0,
+                    bytes_written: 0,
+                };
+            }
+        }
+    };
+
+    // For UTF-8 encoding, just copy directly (no NCR needed)
+    if encoding == encoding_rs::UTF_8 {
+        if input_len > output_capacity {
+            return EncodeResult {
+                bytes_read: 0,
+                bytes_written: 0,
+                status: 1,
+            };
+        }
+        let output_slice = unsafe { std::slice::from_raw_parts_mut(output, output_capacity) };
+        output_slice[..input_len].copy_from_slice(input_str.as_bytes());
+        return EncodeResult {
+            bytes_read: input_len,
+            bytes_written: input_len,
+            status: 0,
+        };
+    }
+
+    let output_slice = unsafe { std::slice::from_raw_parts_mut(output, output_capacity) };
+    let mut encoder = encoding.new_encoder();
+
+    // encode_from_utf8 automatically produces NCRs for unmappable characters
+    let (result, bytes_read, bytes_written, _had_unmappables) =
+        encoder.encode_from_utf8(input_str, output_slice, true);
+
+    match result {
+        encoding_rs::CoderResult::InputEmpty => EncodeResult {
+            bytes_read,
+            bytes_written,
+            status: 0,
+        },
+        encoding_rs::CoderResult::OutputFull => EncodeResult {
+            bytes_read,
+            bytes_written,
+            status: 1,
+        },
+    }
+}
+
+/// Calculate an output buffer size that is large enough for `encoding_encode_with_ncr`
+/// even when every character is unmappable. Returns 0 for a null handle or on size overflow.
+#[no_mangle]
+pub extern "C" fn encoding_max_encode_buffer_length(
+    handle: *const c_void,
+    input_len: usize,
+) -> usize {
+    if handle.is_null() {
+        return 0;
+    }
+    let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) };
+    let encoder = encoding.new_encoder();
+    // The encoder's own bound assumes no unmappables; NCR growth must be added on top.
+    // Worst ratio is a 2-byte UTF-8 char becoming a 7-byte NCR (e.g. &#1024;), i.e. under 4x.
+    encoder
+        .max_buffer_length_from_utf8_if_no_unmappables(input_len)
+        .and_then(|len| {
+            // +10 leaves room for one maximal NCR (&#1114111;) held in reserve at the buffer tail
+            len.checked_add(input_len.checked_mul(4)?)?.checked_add(10)
+        })
+        .unwrap_or(0) // overflow: 0 tells the caller no usable size exists
+}
+
 #[no_mangle]
 pub extern "C" fn html5ever_parse_fragment(
    html: *mut c_uchar,
--- a/src/lightpanda.zig
+++ b/src/lightpanda.zig
@@ -107,7 +107,7 @@ pub fn fetch(app: *App, url: [:0]const u8, opts: FetchOpts) !void {
    //     }
    // }

-    const encoded_url = try URL.ensureEncoded(page.call_arena, url);
+    const encoded_url = try URL.ensureEncoded(page.call_arena, url, "UTF-8");
    _ = try page.navigate(encoded_url, .{
        .reason = .address_bar,
        .kind = .{ .push = null },