From 828715b751b19190f36c13f5e9fdcb7dee308730 Mon Sep 17 00:00:00 2001
From: Karl Seguin <k@openmymind.io>
Date: Fri, 10 Apr 2026 10:36:29 +0800
Subject: [PATCH 1/5] Improve TextDecoder to support all necessary encoding
 types

Uses the newly added encoding_rs to implement TextDecoder for all encodings.
Claude wrote 100% of the Rust binding.

Improves various WPT tests, e.g. /encoding/api-basics.any.html.
---
 src/browser/parser/html5ever.zig             |  62 +++++++
 src/browser/tests/encoding/text_decoder.html |  73 +++++++-
 src/browser/tests/encoding/text_encoder.html |   3 +
 src/browser/tests/testing.js                 |   8 +-
 src/browser/webapi/encoding/TextDecoder.zig  | 149 ++++++++++++---
 src/browser/webapi/encoding/TextEncoder.zig  |  19 +-
 src/html5ever/lib.rs                         | 183 +++++++++++++++++++
 7 files changed, 458 insertions(+), 39 deletions(-)
diff --git a/src/browser/parser/html5ever.zig b/src/browser/parser/html5ever.zig
index f6f81583..cb673789 100644
--- a/src/browser/parser/html5ever.zig
+++ b/src/browser/parser/html5ever.zig
@@ -216,3 +216,65 @@ pub extern "c" fn xml5ever_parse_document(
     appendBeforeSiblingCallback: *const fn (ctx: *anyopaque, sibling_ref: *anyopaque, NodeOrText) callconv(.c) void,
     appendBasedOnParentNodeCallback: *const fn (ctx: *anyopaque, element_ref: *anyopaque, prev_element_ref: *anyopaque, NodeOrText) callconv(.c) void,
 ) void;
+
+// General encoding api
+pub const EncodingInfo = extern struct {
+    found: u8,
+    handle: ?*anyopaque,
+    name_len: usize,
+    // Null when the lookup fails, so it must be optional on the Zig side.
+    name_ptr: ?[*]const u8,
+
+    pub fn isValid(self: *const EncodingInfo) bool {
+        return self.found != 0;
+    }
+
+    // Canonical name reported by the lookup, or "" when there is none.
+    pub fn name(self: *const EncodingInfo) []const u8 {
+        const ptr = self.name_ptr orelse return "";
+        return ptr[0..self.name_len];
+    }
+};
+
+pub const DecodeResult = extern struct {
+    had_errors: u8,
+    bytes_read: usize,
+    bytes_written: usize,
+
+    pub fn hadErrors(self: *const DecodeResult) bool {
+        return self.had_errors != 0;
+    }
+};
+
+pub extern "c" fn encoding_for_label(
+    label: [*]const u8,
+    label_len: usize,
+) EncodingInfo;
+
+pub extern "c" fn encoding_max_utf8_buffer_length(
+    handle: *anyopaque,
+    input_len: usize,
+) usize;
+
+pub extern "c" fn encoding_decode(
+    handle: *anyopaque,
+    input: ?[*]const u8,
+    input_len: usize,
+    output: [*]u8,
+    output_len: usize,
+    is_last: u8,
+) DecodeResult;
+
+// Streaming decoder API
+pub extern "c" fn encoding_decoder_new(handle: *anyopaque) ?*anyopaque;
+
+pub extern "c" fn encoding_decoder_decode(
+    decoder: *anyopaque,
+    input: ?[*]const u8,
+    input_len: usize,
+    output: [*]u8,
+    output_len: usize,
+    is_last: u8,
+) DecodeResult;
+
+pub extern "c" fn encoding_decoder_free(decoder: *anyopaque) void;
diff --git a/src/browser/tests/encoding/text_decoder.html b/src/browser/tests/encoding/text_decoder.html
index 2b01852e..6314c924 100644
--- a/src/browser/tests/encoding/text_decoder.html
+++ b/src/browser/tests/encoding/text_decoder.html
@@ -11,7 +11,6 @@
   testing.expectEqual('', d1.decode());
   testing.expectEqual('香料', d1.decode(new Uint8Array([233, 166, 153, 230, 150, 153])));
   testing.expectEqual('香料', d1.decode(new Uint8Array([0xEF, 0xBB, 0xBF, 233, 166, 153, 230, 150, 153])));
-  testing.expectEqual('�4', d1.decode(new Uint8Array([249, 52])));
 
   {
     const buffer = new ArrayBuffer(6);
@@ -38,7 +37,7 @@
   }
 
   let d2 = new TextDecoder('utf8', {fatal: true})
-  testing.expectError('Error: InvalidUtf8', () => {
+  testing.expectError('TypeError', () => {
     let data  = new Uint8Array([241, 241, 159, 172]);
     d2.decode(data);
   });
@@ -46,8 +45,8 @@
 
 <script id=stream>
   let d3 = new TextDecoder();
-  testing.expectEqual('', d2.decode(new Uint8Array([226, 153]), { stream: true }));
-  testing.expectEqual('♥', d2.decode(new Uint8Array([165]), { stream: true }));
+  testing.expectEqual('', d3.decode(new Uint8Array([226, 153]), { stream: true }));
+  testing.expectEqual('♥', d3.decode(new Uint8Array([165]), { stream: true }));
 </script>
 
 <script id=slice>
@@ -60,5 +59,69 @@
   arr1[4] = 84;
   arr1[5] = 85;
   arr1[6] = 86;
-  testing.expectEqual('RST', d3.decode(new Uint8Array(buf1, 2, 3)));
+  let d4 = new TextDecoder();
+  testing.expectEqual('RST', d4.decode(new Uint8Array(buf1, 2, 3)));
+</script>
+
+<script id=legacy_encodings>
+  // GBK (Chinese)
+  let gbk = new TextDecoder('gbk');
+  testing.expectEqual('gbk', gbk.encoding);
+  testing.expectEqual('中文', gbk.decode(new Uint8Array([0xD6, 0xD0, 0xCE, 0xC4])));
+
+  // Shift_JIS (Japanese)
+  let sjis = new TextDecoder('shift_jis');
+  testing.expectEqual('shift_jis', sjis.encoding);
+  testing.expectEqual('日本語', sjis.decode(new Uint8Array([0x93, 0xFA, 0x96, 0x7B, 0x8C, 0xEA])));
+
+  // EUC-JP (Japanese)
+  let eucjp = new TextDecoder('euc-jp');
+  testing.expectEqual('euc-jp', eucjp.encoding);
+
+  // ISO-8859-1 (Latin-1)
+  let latin1 = new TextDecoder('iso-8859-1');
+  testing.expectEqual('windows-1252', latin1.encoding); // ISO-8859-1 maps to windows-1252 per spec
+  testing.expectEqual('café', latin1.decode(new Uint8Array([0x63, 0x61, 0x66, 0xe9])));
+
+  // Big5 (Traditional Chinese)
+  let big5 = new TextDecoder('big5');
+  testing.expectEqual('big5', big5.encoding);
+
+  // UTF-16LE
+  let utf16le = new TextDecoder('utf-16le');
+  testing.expectEqual('utf-16le', utf16le.encoding);
+  testing.expectEqual('AB', utf16le.decode(new Uint8Array([0x41, 0x00, 0x42, 0x00])));
+
+  // UTF-16BE
+  let utf16be = new TextDecoder('utf-16be');
+  testing.expectEqual('utf-16be', utf16be.encoding);
+  testing.expectEqual('AB', utf16be.decode(new Uint8Array([0x00, 0x41, 0x00, 0x42])));
+</script>
+
+<script id=invalid_label>
+  // Test invalid encoding label
+  try {
+    new TextDecoder('invalid-encoding');
+    testing.fail();
+  } catch (e) {
+    testing.expectEqual(true, e.toString().includes('RangeError'));
+  }
+
+  // Test 'replacement' encoding is rejected
+  try {
+    new TextDecoder('replacement');
+    testing.fail();
+  } catch (e) {
+    testing.expectEqual(true, e.toString().includes('RangeError'));
+  }
+</script>
+
+<script id=label_variations>
+  // Case insensitive
+  let upper = new TextDecoder('UTF-8');
+  testing.expectEqual('utf-8', upper.encoding);
+
+  // Leading/trailing whitespace
+  let ws = new TextDecoder('  utf-8  ');
+  testing.expectEqual('utf-8', ws.encoding);
 </script>
diff --git a/src/browser/tests/encoding/text_encoder.html b/src/browser/tests/encoding/text_encoder.html
index 540f60c1..99fd1959 100644
--- a/src/browser/tests/encoding/text_encoder.html
+++ b/src/browser/tests/encoding/text_encoder.html
@@ -5,6 +5,9 @@
 <script id=TextEncoder>
   var encoder = new TextEncoder();
   testing.expectEqual('utf-8', encoder.encoding);
+  testing.expectEqual([], Array.from(encoder.encode()));
+  testing.expectEqual([110, 117, 108, 108], Array.from(encoder.encode(null)));
+  testing.expectEqual([], Array.from(encoder.encode(undefined)));
   testing.expectEqual([226, 130, 172], Array.from(encoder.encode('€')));
   testing.expectEqual([111,118,101,114,32,57,48,48,48], encoder.encode("over 9000"));
 </script>
diff --git a/src/browser/tests/testing.js b/src/browser/tests/testing.js
index 037d15cf..12d0f761 100644
--- a/src/browser/tests/testing.js
+++ b/src/browser/tests/testing.js
@@ -37,7 +37,13 @@
 
   function expectError(expected, fn) {
     withError((err) => {
-      expectEqual(true, err.toString().includes(expected));
+      if (!err.toString().includes(expected)) {
+        console.error(`Expected error to contain: ${expected}, was: ${err.toString()}`);
+        expectEqual(true, false);
+      } else {
+        // record a passing assertion so the successful case is counted
+        expectTrue(true);
+      }
     }, fn);
   }
 
diff --git a/src/browser/webapi/encoding/TextDecoder.zig b/src/browser/webapi/encoding/TextDecoder.zig
index c117df09..1467aa86 100644
--- a/src/browser/webapi/encoding/TextDecoder.zig
+++ b/src/browser/webapi/encoding/TextDecoder.zig
@@ -1,4 +1,4 @@
-// Copyright (C) 2023-2025  Lightpanda (Selecy SAS)
+// Copyright (C) 2023-2026  Lightpanda (Selecy SAS)
 //
 // Francis Bouvier <francis@lightpanda.io>
 // Pierre Tachoire <pierre@lightpanda.io>
@@ -19,6 +19,7 @@
 const std = @import("std");
 const lp = @import("lightpanda");
 const js = @import("../../js/js.zig");
+const html5ever = @import("../../parser/html5ever.zig");
 
 const Page = @import("../../Page.zig");
 const Session = @import("../../Session.zig");
@@ -30,13 +31,11 @@ _rc: lp.RC(u8) = .{},
 _fatal: bool,
 _arena: Allocator,
 _ignore_bom: bool,
-_stream: std.ArrayList(u8),
-
-const Label = enum {
-    utf8,
-    @"utf-8",
-    @"unicode-1-1-utf-8",
-};
+_bom_seen: bool,
+_decoder: ?*anyopaque, // Persistent streaming decoder
+_encoding_handle: *anyopaque,
+_encoding_name: []const u8,
+_lowercase_name: []const u8, // Cached lowercase version of encoding name
 
 const InitOpts = struct {
     fatal: bool = false,
@@ -44,8 +43,17 @@ const InitOpts = struct {
 };
 
 pub fn init(label_: ?[]const u8, opts_: ?InitOpts, page: *Page) !*TextDecoder {
-    if (label_) |label| {
-        _ = std.meta.stringToEnum(Label, label) orelse return error.RangeError;
+    const label = label_ orelse "utf-8";
+
+    const info = html5ever.encoding_for_label(label.ptr, label.len);
+    if (!info.isValid()) {
+        return error.RangeError;
+    }
+
+    // Check for "replacement" encoding - it's not usable for decoding per spec
+    const enc_name = info.name();
+    if (std.mem.eql(u8, enc_name, "replacement")) {
+        return error.RangeError;
     }
 
     const arena = try page.getArena(.{ .debug = "TextDecoder" });
@@ -55,14 +63,21 @@ pub fn init(label_: ?[]const u8, opts_: ?InitOpts, page: *Page) !*TextDecoder {
     const self = try arena.create(TextDecoder);
     self.* = .{
         ._arena = arena,
-        ._stream = .empty,
         ._fatal = opts.fatal,
         ._ignore_bom = opts.ignoreBOM,
+        ._encoding_handle = info.handle.?,
+        ._decoder = null,
+        ._bom_seen = false,
+        ._lowercase_name = "", // Will be lazily allocated
+        ._encoding_name = enc_name, // Points to static Rust memory
     };
     return self;
 }
 
 pub fn deinit(self: *TextDecoder, session: *Session) void {
+    if (self._decoder) |decoder| {
+        html5ever.encoding_decoder_free(decoder);
+    }
     session.releaseArena(self._arena);
 }
 
@@ -82,34 +97,110 @@ pub fn getFatal(self: *const TextDecoder) bool {
     return self._fatal;
 }
 
+pub fn getEncoding(self: *TextDecoder) ![]const u8 {
+    // Spec requires lowercase encoding name
+    // Allocate buffer for lowercase name on first access
+    if (self._lowercase_name.len > 0) {
+        return self._lowercase_name;
+    }
+    self._lowercase_name = try std.ascii.allocLowerString(self._arena, self._encoding_name);
+    return self._lowercase_name;
+}
+
 const DecodeOpts = struct {
     stream: bool = false,
 };
+
 pub fn decode(self: *TextDecoder, input_: ?[]const u8, opts_: ?DecodeOpts) ![]const u8 {
-    var input = input_ orelse return "";
     const opts: DecodeOpts = opts_ orelse .{};
+    const input = input_ orelse "";
 
-    if (self._stream.items.len > 0) {
-        try self._stream.appendSlice(self._arena, input);
-        input = self._stream.items;
-    }
-
-    if (self._fatal and !std.unicode.utf8ValidateSlice(input)) {
-        if (opts.stream) {
-            if (self._stream.items.len == 0) {
-                try self._stream.appendSlice(self._arena, input);
-            }
-            return "";
+    // Only streaming calls create the persistent decoder. A non-streaming call
+    // reuses one left by earlier `stream: true` calls so the bytes it buffered
+    // are flushed rather than dropped, then releases it via the defer below.
+    defer if (!opts.stream) self.releaseDecoder();
+    if (opts.stream) {
+        if (self._decoder == null) {
+            self._decoder = html5ever.encoding_decoder_new(self._encoding_handle) orelse
+                return error.OutOfMemory;
         }
-        return error.InvalidUtf8;
     }
 
-    self._stream.clearRetainingCapacity();
-    if (self._ignore_bom == false and std.mem.startsWith(u8, input, &.{ 0xEF, 0xBB, 0xBF })) {
-        return input[3..];
+    return self._decode(input, self._decoder, !opts.stream);
+}
+
+// Frees the persistent streaming decoder, if there is one.
+fn releaseDecoder(self: *TextDecoder) void {
+    if (self._decoder) |decoder| {
+        html5ever.encoding_decoder_free(decoder);
+        self._decoder = null;
+    }
+}
+
+// Decodes one chunk; with a streaming decoder, is_last flushes its buffered bytes.
+fn _decode(self: *TextDecoder, input: []const u8, streaming_decoder: ?*anyopaque, is_last: bool) ![]const u8 {
+    if (input.len == 0 and (streaming_decoder == null or !is_last)) {
+        return "";
     }
 
-    return input;
+    // Calculate max output size
+    const max_out = html5ever.encoding_max_utf8_buffer_length(
+        self._encoding_handle,
+        input.len,
+    );
+
+    if (max_out == 0) {
+        return "";
+    }
+
+    const output = try self._arena.alloc(u8, max_out);
+
+    // Decode using either streaming or one-shot decoder
+    const result = if (streaming_decoder) |decoder|
+        html5ever.encoding_decoder_decode(
+            decoder,
+            input.ptr,
+            input.len,
+            output.ptr,
+            output.len,
+            @intFromBool(is_last), // flush pending bytes on the final chunk
+        )
+    else
+        html5ever.encoding_decode(
+            self._encoding_handle,
+            input.ptr,
+            input.len,
+            output.ptr,
+            output.len,
+            1, // is_last = true for one-shot
+        );
+
+    // Handle errors in fatal mode
+    if (self._fatal and result.hadErrors()) {
+        // Reset decoder on error
+        self.releaseDecoder();
+        self._bom_seen = false;
+        return error.TypeError;
+    }
+
+    var decoded: []const u8 = output[0..result.bytes_written];
+
+    // Handle BOM stripping
+    if (!self._bom_seen and !self._ignore_bom) {
+        decoded = stripBom(decoded);
+        self._bom_seen = true;
+    }
+
+    return decoded;
+}
+
+fn stripBom(data: []const u8) []const u8 {
+    // UTF-8 BOM in decoded output appears as U+FEFF (EF BB BF in UTF-8)
+    const bom = "\u{FEFF}";
+    if (std.mem.startsWith(u8, data, bom)) {
+        return data[bom.len..];
+    }
+    return data;
 }
 
 pub const JsApi = struct {
@@ -123,7 +214,7 @@ pub const JsApi = struct {
 
     pub const constructor = bridge.constructor(TextDecoder.init, .{});
     pub const decode = bridge.function(TextDecoder.decode, .{});
-    pub const encoding = bridge.property("utf-8", .{ .template = false });
+    pub const encoding = bridge.accessor(TextDecoder.getEncoding, null, .{});
     pub const fatal = bridge.accessor(TextDecoder.getFatal, null, .{});
     pub const ignoreBOM = bridge.accessor(TextDecoder.getIgnoreBOM, null, .{});
 };
diff --git a/src/browser/webapi/encoding/TextEncoder.zig b/src/browser/webapi/encoding/TextEncoder.zig
index a6bff48e..112d2e32 100644
--- a/src/browser/webapi/encoding/TextEncoder.zig
+++ b/src/browser/webapi/encoding/TextEncoder.zig
@@ -1,4 +1,4 @@
-// Copyright (C) 2023-2025  Lightpanda (Selecy SAS)
+// Copyright (C) 2023-2026  Lightpanda (Selecy SAS)
 //
 // Francis Bouvier <francis@lightpanda.io>
 // Pierre Tachoire <pierre@lightpanda.io>
@@ -26,12 +26,23 @@ pub fn init() TextEncoder {
     return .{};
 }
 
-pub fn encode(_: *const TextEncoder, v: []const u8) !js.TypedArray(u8) {
-    if (!std.unicode.utf8ValidateSlice(v)) {
+pub fn encode(_: *const TextEncoder, v_: ?js.Value) !js.TypedArray(u8) {
+    const v = v_ orelse return .{ .values = "" };
+
+    if (v.isUndefined()) {
+        return .{ .values = "" };
+    }
+
+    if (v.isNull()) {
+        return .{ .values = "null" };
+    }
+
+    const str = try v.toStringSlice();
+    if (!std.unicode.utf8ValidateSlice(str)) {
         return error.InvalidUtf8;
     }
 
-    return .{ .values = v };
+    return .{ .values = str };
 }
 
 pub const JsApi = struct {
diff --git a/src/html5ever/lib.rs b/src/html5ever/lib.rs
index 6fab9763..c684c039 100644
--- a/src/html5ever/lib.rs
+++ b/src/html5ever/lib.rs
@@ -151,6 +151,189 @@ pub extern "C" fn html5ever_parse_document_with_encoding(
         .one(StrTendril::from(decoded.as_ref()));
 }
 
+// === Encoding API for TextDecoder ===
+
+/// Result of encoding label lookup
+#[repr(C)]
+pub struct EncodingInfo {
+    /// 0 = not found, 1 = found
+    pub found: u8,
+    /// Opaque handle to the encoding (actually &'static Encoding); null when not found
+    pub handle: *const c_void,
+    /// Length of canonical name in bytes; 0 when not found
+    pub name_len: usize,
+    /// Pointer to canonical encoding name as returned by Encoding::name() (static, not lowercased); null when not found
+    pub name_ptr: *const c_uchar,
+}
+
+/// Look up an encoding by its label (case-insensitive, whitespace-trimmed)
+#[no_mangle]
+pub extern "C" fn encoding_for_label(
+    label: *const c_uchar,
+    label_len: usize,
+) -> EncodingInfo {
+    if label.is_null() || label_len == 0 {
+        return EncodingInfo {
+            found: 0,
+            name_len: 0,
+            handle: std::ptr::null(),
+            name_ptr: std::ptr::null(),
+        };
+    }
+
+    let label_bytes = unsafe { std::slice::from_raw_parts(label, label_len) };
+
+    match Encoding::for_label(label_bytes) {
+        Some(encoding) => {
+            let name = encoding.name();
+            EncodingInfo {
+                found: 1,
+                name_len: name.len(),
+                name_ptr: name.as_ptr(),
+                handle: encoding as *const _ as *const c_void,
+            }
+        }
+        None => EncodingInfo {
+            found: 0,
+            name_len: 0,
+            name_ptr: std::ptr::null(),
+            handle: std::ptr::null(),
+        },
+    }
+}
+
+/// Worst-case UTF-8 buffer size for decoding input_len bytes; 0 if handle is null or no size could be computed
+#[no_mangle]
+pub extern "C" fn encoding_max_utf8_buffer_length(
+    handle: *const c_void,
+    input_len: usize,
+) -> usize {
+    if handle.is_null() {
+        return 0;
+    }
+    let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) };
+    let decoder = encoding.new_decoder();
+    decoder.max_utf8_buffer_length(input_len).unwrap_or(0)
+}
+
+/// Result of decoding operation
+#[repr(C)]
+pub struct DecodeResult {
+    /// 0 = no errors, 1 = had malformed sequences (replaced with U+FFFD)
+    pub had_errors: u8,
+    /// Number of input bytes consumed
+    pub bytes_read: usize,
+    /// Number of UTF-8 bytes written to output buffer
+    pub bytes_written: usize,
+}
+
+/// Decode bytes from source encoding to UTF-8 with a decoder created per call
+/// (no state survives the call); set is_last=1 for a complete, one-shot decode
+#[no_mangle]
+pub extern "C" fn encoding_decode(
+    handle: *const c_void,
+    input: *const c_uchar,
+    input_len: usize,
+    output: *mut c_uchar,
+    output_len: usize,
+    is_last: u8,
+) -> DecodeResult {
+    if handle.is_null() || output.is_null() {
+        return DecodeResult {
+            had_errors: 1,
+            bytes_read: 0,
+            bytes_written: 0,
+        };
+    }
+
+    let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) };
+    let input_bytes = if input.is_null() || input_len == 0 {
+        &[]
+    } else {
+        unsafe { std::slice::from_raw_parts(input, input_len) }
+    };
+    let output_slice = unsafe { std::slice::from_raw_parts_mut(output, output_len) };
+
+    let mut decoder = encoding.new_decoder();
+    let last = is_last != 0;
+
+    let (result, bytes_read, bytes_written, had_errors) =
+        decoder.decode_to_utf8(input_bytes, output_slice, last);
+
+    // If output buffer was too small, we still report what we could process
+    let _ = result; // CoderResult::InputEmpty or CoderResult::OutputFull
+
+    DecodeResult {
+        had_errors: if had_errors { 1 } else { 0 },
+        bytes_read,
+        bytes_written,
+    }
+}
+
+// === Streaming Decoder API ===
+
+use encoding_rs::Decoder;
+
+/// Create a streaming decoder that maintains state across calls
+#[no_mangle]
+pub extern "C" fn encoding_decoder_new(handle: *const c_void) -> *mut c_void {
+    if handle.is_null() {
+        return std::ptr::null_mut();
+    }
+    let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) };
+    let decoder = Box::new(encoding.new_decoder());
+    Box::into_raw(decoder) as *mut c_void
+}
+
+/// Decode using a streaming decoder (maintains state for incomplete sequences)
+#[no_mangle]
+pub extern "C" fn encoding_decoder_decode(
+    decoder_ptr: *mut c_void,
+    input: *const c_uchar,
+    input_len: usize,
+    output: *mut c_uchar,
+    output_len: usize,
+    is_last: u8,
+) -> DecodeResult {
+    if decoder_ptr.is_null() || output.is_null() {
+        return DecodeResult {
+            had_errors: 1,
+            bytes_read: 0,
+            bytes_written: 0,
+        };
+    }
+
+    let decoder: &mut Decoder = unsafe { &mut *(decoder_ptr as *mut Decoder) };
+    let input_bytes = if input.is_null() || input_len == 0 {
+        &[]
+    } else {
+        unsafe { std::slice::from_raw_parts(input, input_len) }
+    };
+    let output_slice = unsafe { std::slice::from_raw_parts_mut(output, output_len) };
+
+    let last = is_last != 0;
+    let (result, bytes_read, bytes_written, had_errors) =
+        decoder.decode_to_utf8(input_bytes, output_slice, last);
+
+    let _ = result;
+
+    DecodeResult {
+        had_errors: if had_errors { 1 } else { 0 },
+        bytes_read,
+        bytes_written,
+    }
+}
+
+/// Free a streaming decoder
+#[no_mangle]
+pub extern "C" fn encoding_decoder_free(decoder_ptr: *mut c_void) {
+    if !decoder_ptr.is_null() {
+        unsafe {
+            drop(Box::from_raw(decoder_ptr as *mut Decoder));
+        }
+    }
+}
+
 #[no_mangle]
 pub extern "C" fn html5ever_parse_fragment(
     html: *mut c_uchar,

From f7c1710c2354d19a1625b2ae969cc36bb9f578a4 Mon Sep 17 00:00:00 2001
From: Karl Seguin <k@openmymind.io>
Date: Fri, 10 Apr 2026 12:02:06 +0800
Subject: [PATCH 2/5] Expose correct charset

document.characterSet, document.charset and document.inputEncoding now expose
the correct charset.
---
 src/browser/Page.zig                     | 41 +++++++++++-------------
 src/browser/tests/document/document.html |  4 +++
 src/browser/tests/page/encoding.html     |  4 +++
 src/browser/webapi/Document.zig          | 11 +++++--
 4 files changed, 34 insertions(+), 26 deletions(-)

diff --git a/src/browser/Page.zig b/src/browser/Page.zig
index f12b606b..7c66faff 100644
--- a/src/browser/Page.zig
+++ b/src/browser/Page.zig
@@ -207,6 +207,9 @@ base_url: ?[:0]const u8 = null,
 // referer header cache.
 referer_header: ?[:0]const u8 = null,
 
+// Document charset (canonical name from encoding_rs, static lifetime)
+charset: []const u8 = "UTF-8",
+
 // Arbitrary buffer. Need to temporarily lowercase a value? Use this. No lifetime
 // guarantee - it's valid until someone else uses it.
 buf: [BUF_SIZE]u8 = undefined,
@@ -962,9 +965,13 @@ fn pageDataCallback(response: HttpClient.Response, data: []const u8) !void {
 
         switch (mime.content_type) {
             .text_html => {
-                self._parse_state = .{ .html = .{
-                    .mime = mime,
-                } };
+                // Normalize and store the charset using encoding_rs canonical names
+                const charset_str = mime.charsetString();
+                const info = h5e.encoding_for_label(charset_str.ptr, charset_str.len);
+                if (info.isValid()) {
+                    self.charset = info.name();
+                }
+                self._parse_state = .{ .html = .empty };
             },
             .application_json, .text_javascript, .text_css, .text_plain => {
                 var arr: std.ArrayList(u8) = .empty;
@@ -979,7 +986,7 @@ fn pageDataCallback(response: HttpClient.Response, data: []const u8) !void {
     }
 
     switch (self._parse_state) {
-        .html => |*html| try html.buf.appendSlice(self.arena, data),
+        .html => |*html| try html.appendSlice(self.arena, data),
         .text => |*buf| {
             // we have to escape the data...
             var v = data;
@@ -1028,12 +1035,13 @@ fn pageDoneCallback(ctx: *anyopaque) !void {
     var parser = Parser.init(parse_arena, self.document.asNode(), self);
 
     switch (self._parse_state) {
-        .html => |*html_state| {
-            const raw_html = html_state.buf.items;
-            if (html_state.needsEncodingConversion()) {
-                parser.parseWithEncoding(raw_html, html_state.mime.charsetString());
-            } else {
+        .html => |*html_buf| {
+            const raw_html = html_buf.items;
+
+            if (std.mem.eql(u8, self.charset, "UTF-8")) {
                 parser.parse(raw_html);
+            } else {
+                parser.parseWithEncoding(raw_html, self.charset);
             }
             self._script_manager.staticScriptsDone();
             self._parse_state = .complete;
@@ -3164,21 +3172,11 @@ const ParseState = union(enum) {
     pre,
     complete,
     err: anyerror,
-    html: Html,
+    html: std.ArrayList(u8),
     text: std.ArrayList(u8),
     image: std.ArrayList(u8),
     raw: std.ArrayList(u8),
     raw_done: []const u8,
-
-    const Html = struct {
-        mime: Mime,
-        buf: std.ArrayList(u8) = .empty,
-
-        fn needsEncodingConversion(self: *const Html) bool {
-            const charset = self.mime.charsetString();
-            return !std.ascii.eqlIgnoreCase(charset, "utf-8") and !std.ascii.eqlIgnoreCase(charset, "utf8");
-        }
-    };
 };
 
 const LoadState = enum {
@@ -3628,9 +3626,6 @@ fn asUint(comptime string: anytype) std.meta.Int(
 
 const testing = @import("../testing.zig");
 test "WebApi: Page" {
-    const filter: testing.LogFilter = .init(&.{ .http, .js });
-    defer filter.deinit();
-
     try testing.htmlRunner("page", .{});
 }
 
diff --git a/src/browser/tests/document/document.html b/src/browser/tests/document/document.html
index 74d8ff30..ede2b507 100644
--- a/src/browser/tests/document/document.html
+++ b/src/browser/tests/document/document.html
@@ -18,6 +18,10 @@
   testing.expectEqual("visible", document.visibilityState);
   testing.expectEqual(false, document.prerendering);
   testing.expectEqual(undefined, Document.prerendering);
+  // characterSet should return canonical encoding name
+  testing.expectEqual("UTF-8", document.characterSet);
+  testing.expectEqual("UTF-8", document.charset);
+  testing.expectEqual("UTF-8", document.inputEncoding);
 </script>
 
 <script id=headAndbody>
diff --git a/src/browser/tests/page/encoding.html b/src/browser/tests/page/encoding.html
index af532b82..19e0134f 100644
--- a/src/browser/tests/page/encoding.html
+++ b/src/browser/tests/page/encoding.html
@@ -11,6 +11,10 @@
     testing.onload(() => {
       // GBK-encoded "中文" should be decoded to UTF-8
       testing.expectEqual('中文', iframe.contentDocument.getElementById('test').textContent);
+      // document.characterSet should return canonical encoding name
+      testing.expectEqual('GBK', iframe.contentDocument.characterSet);
+      testing.expectEqual('GBK', iframe.contentDocument.charset);
+      testing.expectEqual('GBK', iframe.contentDocument.inputEncoding);
     });
   }
 </script>
diff --git a/src/browser/webapi/Document.zig b/src/browser/webapi/Document.zig
index cf15c49b..6b48e4c3 100644
--- a/src/browser/webapi/Document.zig
+++ b/src/browser/webapi/Document.zig
@@ -1068,10 +1068,15 @@ pub const JsApi = struct {
     pub const hasFocus = bridge.function(Document.hasFocus, .{});
 
     pub const prerendering = bridge.property(false, .{ .template = false });
-    pub const characterSet = bridge.property("UTF-8", .{ .template = false });
-    pub const charset = bridge.property("UTF-8", .{ .template = false });
-    pub const inputEncoding = bridge.property("UTF-8", .{ .template = false });
+    pub const characterSet = bridge.accessor(getCharacterSet, null, .{});
+    pub const charset = bridge.accessor(getCharacterSet, null, .{});
+    pub const inputEncoding = bridge.accessor(getCharacterSet, null, .{});
     pub const compatMode = bridge.property("CSS1Compat", .{ .template = false });
+
+    fn getCharacterSet(self: *const Document) []const u8 {
+        const doc_page = self._page orelse return "UTF-8";
+        return doc_page.charset;
+    }
     pub const referrer = bridge.property("", .{ .template = false });
 };
 

From 05229fdc536645f9f21e2f40d5d2dfa2c5ed46e3 Mon Sep 17 00:00:00 2001
From: Karl Seguin <k@openmymind.io>
Date: Fri, 10 Apr 2026 14:41:19 +0800
Subject: [PATCH 3/5] Use the document's charset to determine if/how to encode
 querystring

Whenever we resolve a URL, say from `anchor.href`, we should consider the
document's charset when encoding the querystring. This probably isn't the
most important feature, but it makes tens of thousands of WPT cases pass, e.g

/encoding/legacy-mb-tchinese/big5/big5-encode-href-errors-han.html?3001-4000 and
/encoding/legacy-mb-japanese/euc-jp/eucjp-encode-href-errors-han.html?17001-18000

DOM elements previously called `URL.resolveURL(...)`. They now call
`self.asNode().resolveURL(...)`, where `Node#resolveURL` will provide the
document's charset.
---
 src/SemanticTree.zig                       |   2 +-
 src/browser/Page.zig                       |   4 +-
 src/browser/URL.zig                        |  91 ++++++++++++----
 src/browser/interactive.zig                |   2 +-
 src/browser/markdown.zig                   |   6 +-
 src/browser/parser/html5ever.zig           |  24 +++++
 src/browser/structured_data.zig            |   2 +-
 src/browser/tests/page/encoding.html       |  29 ++++++
 src/browser/webapi/Node.zig                |  13 +++
 src/browser/webapi/element/html/Anchor.zig |   7 +-
 src/browser/webapi/element/html/Form.zig   |   2 +-
 src/browser/webapi/element/html/IFrame.zig |   4 +-
 src/browser/webapi/element/html/Image.zig  |   4 +-
 src/browser/webapi/element/html/Link.zig   |   4 +-
 src/browser/webapi/element/html/Media.zig  |   3 +-
 src/browser/webapi/element/html/Script.zig |   4 +-
 src/browser/webapi/element/html/Video.zig  |   4 +-
 src/browser/webapi/net/WebSocket.zig       |   2 +-
 src/browser/webapi/net/XMLHttpRequest.zig  |   2 +-
 src/cdp/domains/page.zig                   |   2 +-
 src/cdp/domains/target.zig                 |   2 +-
 src/html5ever/lib.rs                       | 114 +++++++++++++++++++++
 src/lightpanda.zig                         |   2 +-
 23 files changed, 276 insertions(+), 53 deletions(-)

diff --git a/src/SemanticTree.zig b/src/SemanticTree.zig
index 9bca520e..5b3f0ae5 100644
--- a/src/SemanticTree.zig
+++ b/src/SemanticTree.zig
@@ -671,7 +671,7 @@ pub fn getNodeDetails(
 
         if (el.getAttributeSafe(comptime .wrap("href"))) |h| {
             const URL = lp.URL;
-            href = URL.resolve(arena, page.base(), h, .{ .encode = true }) catch h;
+            href = URL.resolve(arena, page.base(), h, .{ .encoding = page.charset }) catch h;
         }
 
         if (el.is(Element.Html.Input)) |input| {
diff --git a/src/browser/Page.zig b/src/browser/Page.zig
index 7c66faff..1c3d39f0 100644
--- a/src/browser/Page.zig
+++ b/src/browser/Page.zig
@@ -661,7 +661,7 @@ fn scheduleNavigationWithArena(originator: *Page, arena: Allocator, request_url:
             arena,
             page_base,
             request_url,
-            .{ .always_dupe = true, .encode = true },
+            .{ .always_dupe = true, .encoding = originator.charset },
         );
         break :blk .{ u, false };
     };
@@ -1196,7 +1196,7 @@ pub fn iframeAddedCallback(self: *Page, iframe: *IFrame) !void {
             self.call_arena, // ok to use, page.navigate dupes this
             self.base(),
             src,
-            .{ .encode = true },
+            .{ .encoding = self.charset },
         );
     };
 
diff --git a/src/browser/URL.zig b/src/browser/URL.zig
index 6f8cbebd..532f11a1 100644
--- a/src/browser/URL.zig
+++ b/src/browser/URL.zig
@@ -19,16 +19,19 @@
 const std = @import("std");
 const Allocator = std.mem.Allocator;
 
-const ResolveOpts = struct {
-    encode: bool = false,
+pub const ResolveOpts = struct {
+    /// null = don't encode, "UTF-8" = standard percent encoding,
+    /// other charset = encode query string using that charset with NCR fallback
+    encoding: ?[]const u8 = null,
     always_dupe: bool = false,
 };
 
 // path is anytype, so that it can be used with both []const u8 and [:0]const u8
-pub fn resolve(allocator: Allocator, base: [:0]const u8, source_path: anytype, comptime opts: ResolveOpts) ![:0]const u8 {
+pub fn resolve(allocator: Allocator, base: [:0]const u8, source_path: anytype, opts: ResolveOpts) ![:0]const u8 {
     const PT = @TypeOf(source_path);
 
-    var path: [:0]const u8 = if (comptime !isNullTerminated(PT) or opts.always_dupe) try allocator.dupeZ(u8, source_path) else source_path;
+    const needs_dupe = comptime !isNullTerminated(PT);
+    var path: [:0]const u8 = if (needs_dupe or opts.always_dupe) try allocator.dupeZ(u8, source_path) else source_path;
 
     if (base.len == 0) {
         return processResolved(allocator, path, opts);
@@ -186,14 +189,12 @@ pub fn resolve(allocator: Allocator, base: [:0]const u8, source_path: anytype, c
     return processResolved(allocator, out[0..out_i :0], opts);
 }
 
-fn processResolved(allocator: Allocator, url: [:0]const u8, comptime opts: ResolveOpts) ![:0]const u8 {
-    if (!comptime opts.encode) {
-        return url;
-    }
-    return ensureEncoded(allocator, url);
+fn processResolved(allocator: Allocator, url: [:0]const u8, opts: ResolveOpts) ![:0]const u8 {
+    const encoding = opts.encoding orelse return url;
+    return ensureEncoded(allocator, url, encoding);
 }
 
-pub fn ensureEncoded(allocator: Allocator, url: [:0]const u8) ![:0]const u8 {
+pub fn ensureEncoded(allocator: Allocator, url: [:0]const u8, encoding: []const u8) ![:0]const u8 {
     const scheme_end = std.mem.indexOf(u8, url, "://");
     const authority_start = if (scheme_end) |end| end + 3 else 0;
     const path_start = std.mem.indexOfScalarPos(u8, url, authority_start, '/') orelse return url;
@@ -205,18 +206,18 @@ pub fn ensureEncoded(allocator: Allocator, url: [:0]const u8) ![:0]const u8 {
     const query_end = if (query_start) |_| (fragment_start orelse url.len) else path_end;
 
     const path_to_encode = url[path_start..path_end];
+    // Path is always UTF-8 percent encoded per URL spec
     const encoded_path = try percentEncodeSegment(allocator, path_to_encode, .path);
 
+    // Query string uses document encoding
     const encoded_query = if (query_start) |qs| blk: {
         const query_to_encode = url[qs + 1 .. query_end];
-        const encoded = try percentEncodeSegment(allocator, query_to_encode, .query);
-        break :blk encoded;
+        break :blk try encodeQueryString(allocator, query_to_encode, encoding);
     } else null;
 
     const encoded_fragment = if (fragment_start) |fs| blk: {
         const fragment_to_encode = url[fs + 1 ..];
-        const encoded = try percentEncodeSegment(allocator, fragment_to_encode, .query);
-        break :blk encoded;
+        break :blk try percentEncodeSegment(allocator, fragment_to_encode, .query);
     } else null;
 
     if (encoded_path.ptr == path_to_encode.ptr and
@@ -242,7 +243,7 @@ pub fn ensureEncoded(allocator: Allocator, url: [:0]const u8) ![:0]const u8 {
     return buf.items[0 .. buf.items.len - 1 :0];
 }
 
-const EncodeSet = enum { path, query, userinfo, fragment };
+const EncodeSet = enum { path, query, query_legacy, userinfo, fragment };
 
 fn percentEncodeSegment(allocator: Allocator, segment: []const u8, comptime encode_set: EncodeSet) ![]const u8 {
     // Check if encoding is needed
@@ -285,17 +286,65 @@ fn percentEncodeSegment(allocator: Allocator, segment: []const u8, comptime enco
     return buf.items;
 }
 
+const h5e = @import("parser/html5ever.zig");
+
+/// Percent-encode a URL query string using the document's character encoding.
+///
+/// UTF-8, an unknown label, or any encoder failure yields plain UTF-8 percent-encoding.
+/// For a legacy encoding each piece between literal `&` / `;` separators is transcoded
+/// on its own: characters the charset cannot represent become NCRs (`&#codepoint;`)
+/// whose `&` and `;` get escaped, while separators already present in `query` are
+/// emitted as-is (escaping those would fuse the parameters into a single value).
+/// Intermediate pieces are not freed one by one, which suits arena-style allocators.
+fn encodeQueryString(allocator: Allocator, query: []const u8, encoding: []const u8) ![]const u8 {
+    if (std.ascii.eqlIgnoreCase(encoding, "UTF-8")) return percentEncodeSegment(allocator, query, .query);
+
+    const info = h5e.encoding_for_label(encoding.ptr, encoding.len);
+    if (!info.isValid()) return percentEncodeSegment(allocator, query, .query);
+    const handle = info.handle orelse return percentEncodeSegment(allocator, query, .query);
+
+    const capacity = h5e.encoding_max_encode_buffer_length(handle, query.len);
+    if (capacity == 0) return percentEncodeSegment(allocator, query, .query);
+    const scratch = try allocator.alloc(u8, capacity);
+    defer allocator.free(scratch);
+
+    // One slot per piece plus one per separator, joined with a single copy at the end.
+    var separators: usize = 0;
+    for (query) |c| separators += @intFromBool(c == '&' or c == ';');
+    const parts = try allocator.alloc([]const u8, 2 * separators + 1);
+    defer allocator.free(parts);
+
+    var start: usize = 0;
+    var slot: usize = 0;
+    while (slot < parts.len) : (slot += 2) {
+        const end = std.mem.indexOfAnyPos(u8, query, start, "&;") orelse query.len;
+        const result = h5e.encoding_encode_with_ncr(handle, query[start..end].ptr, end - start, scratch.ptr, scratch.len);
+        // Encoder failure (e.g. scratch too small): fall back to UTF-8 for the whole query.
+        if (!result.isSuccess()) return percentEncodeSegment(allocator, query, .query);
+        const raw = scratch[0..result.bytes_written];
+        const escaped = try percentEncodeSegment(allocator, raw, .query_legacy);
+        // percentEncodeSegment returns its input when nothing needs escaping; copy that
+        // case out because `scratch` is reused for the next piece and freed on return.
+        parts[slot] = if (escaped.ptr == raw.ptr) try allocator.dupe(u8, raw) else escaped;
+        if (end < query.len) parts[slot + 1] = query[end .. end + 1];
+        start = end + 1;
+    }
+    return try std.mem.concat(allocator, u8, parts);
+}
+
 fn shouldPercentEncode(c: u8, comptime encode_set: EncodeSet) bool {
     return switch (c) {
         // Unreserved characters (RFC 3986)
         'A'...'Z', 'a'...'z', '0'...'9', '-', '.', '_', '~' => false,
-        // sub-delims allowed in path/query but some must be encoded in userinfo
-        '!', '$', '&', '\'', '(', ')', '*', '+', ',' => false,
-        ';', '=' => encode_set == .userinfo,
+        // sub-delims allowed in path/query but some must be encoded in userinfo/query_legacy
+        '!', '$', '\'', '(', ')', '*', '+', ',' => false,
+        // '&' and ';' must be encoded for legacy encoding (to preserve NCRs like &#nnnnn;)
+        '&', ';' => encode_set == .userinfo or encode_set == .query_legacy,
+        '=' => encode_set == .userinfo,
         // Separators: userinfo must encode these
         '/', ':', '@' => encode_set == .userinfo,
         // '?' is allowed in queries only
-        '?' => encode_set != .query,
+        '?' => encode_set != .query and encode_set != .query_legacy,
         // '#' is allowed in fragments only
         '#' => encode_set != .fragment,
         // Everything else needs encoding (including space)
@@ -1130,7 +1179,7 @@ test "URL: ensureEncoded" {
     };
 
     for (cases) |case| {
-        const result = try ensureEncoded(testing.arena_allocator, case.url);
+        const result = try ensureEncoded(testing.arena_allocator, case.url, "UTF-8");
         try testing.expectString(case.expected, result);
     }
 }
@@ -1296,7 +1345,7 @@ test "URL: resolve with encoding" {
     };
 
     for (cases) |case| {
-        const result = try resolve(testing.arena_allocator, case.base, case.path, .{ .encode = true });
+        const result = try resolve(testing.arena_allocator, case.base, case.path, .{ .encoding = "UTF-8" });
         try testing.expectString(case.expected, result);
     }
 }
diff --git a/src/browser/interactive.zig b/src/browser/interactive.zig
index a0b4528a..225633c7 100644
--- a/src/browser/interactive.zig
+++ b/src/browser/interactive.zig
@@ -182,7 +182,7 @@ pub fn collectInteractiveElements(
             .id = el.getAttributeSafe(comptime .wrap("id")),
             .class = el.getAttributeSafe(comptime .wrap("class")),
             .href = if (el.getAttributeSafe(comptime .wrap("href"))) |href|
-                URL.resolve(arena, page.base(), href, .{ .encode = true }) catch href
+                URL.resolve(arena, page.base(), href, .{ .encoding = page.charset }) catch href
             else
                 null,
             .input_type = getInputType(el),
diff --git a/src/browser/markdown.zig b/src/browser/markdown.zig
index 5a83dfdc..437dbee6 100644
--- a/src/browser/markdown.zig
+++ b/src/browser/markdown.zig
@@ -278,7 +278,8 @@ const Context = struct {
                 }
                 try self.writer.writeAll("](");
                 if (el.getAttributeSafe(comptime .wrap("src"))) |src| {
-                    const absolute_src = URL.resolve(self.page.call_arena, self.page.base(), src, .{ .encode = true }) catch src;
+                    const page = self.page;
+                    const absolute_src = URL.resolve(page.call_arena, page.base(), src, .{ .encoding = page.charset }) catch src;
                     try self.writer.writeAll(absolute_src);
                 }
                 try self.writer.writeAll(")");
@@ -286,13 +287,14 @@ const Context = struct {
                 return;
             },
             .anchor => {
+                const page = self.page;
                 const info = analyzeContent(el.asNode());
                 const label = getAnchorLabel(el);
                 const href_raw = el.getAttributeSafe(comptime .wrap("href"));
 
                 if (!info.has_visible and label == null and href_raw == null) return;
 
-                const href = if (href_raw) |h| URL.resolve(self.page.call_arena, self.page.base(), h, .{ .encode = true }) catch h else null;
+                const href = if (href_raw) |h| URL.resolve(page.call_arena, page.base(), h, .{ .encoding = page.charset }) catch h else null;
 
                 if (info.has_block) {
                     try self.renderChildren(el.asNode());
diff --git a/src/browser/parser/html5ever.zig b/src/browser/parser/html5ever.zig
index cb673789..829ac429 100644
--- a/src/browser/parser/html5ever.zig
+++ b/src/browser/parser/html5ever.zig
@@ -278,3 +278,27 @@ pub extern "c" fn encoding_decoder_decode(
 ) DecodeResult;
 
 pub extern "c" fn encoding_decoder_free(decoder: *anyopaque) void;
+
+// Encoding API (UTF-8 to legacy encoding with NCR fallback)
+pub const EncodeResult = extern struct {
+    status: u8,
+    bytes_read: usize,
+    bytes_written: usize,
+
+    pub fn isSuccess(self: *const EncodeResult) bool {
+        return self.status == 0;
+    }
+};
+
+pub extern "c" fn encoding_encode_with_ncr(
+    handle: *anyopaque,
+    input: ?[*]const u8,
+    input_len: usize,
+    output: [*]u8,
+    output_capacity: usize,
+) EncodeResult;
+
+pub extern "c" fn encoding_max_encode_buffer_length(
+    handle: *anyopaque,
+    input_len: usize,
+) usize;
diff --git a/src/browser/structured_data.zig b/src/browser/structured_data.zig
index 9b6e7fbe..cad1d9d8 100644
--- a/src/browser/structured_data.zig
+++ b/src/browser/structured_data.zig
@@ -288,7 +288,7 @@ fn collectLink(
 ) !void {
     const rel = el.getAttributeSafe(comptime .wrap("rel")) orelse return;
     const raw_href = el.getAttributeSafe(comptime .wrap("href")) orelse return;
-    const href = URL.resolve(arena, page.base(), raw_href, .{ .encode = true }) catch raw_href;
+    const href = URL.resolve(arena, page.base(), raw_href, .{ .encoding = page.charset }) catch raw_href;
 
     if (std.ascii.eqlIgnoreCase(rel, "alternate")) {
         try alternate.append(arena, .{
diff --git a/src/browser/tests/page/encoding.html b/src/browser/tests/page/encoding.html
index 19e0134f..b740a465 100644
--- a/src/browser/tests/page/encoding.html
+++ b/src/browser/tests/page/encoding.html
@@ -77,3 +77,32 @@
     });
   }
 </script>
+
+<script id="anchor_href_encoding_with_ncr">
+  {
+    // Test that anchor.href encodes unmappable characters as NCRs in non-UTF-8 documents.
+    // When a character can't be represented in the document's encoding, it should become &#nnnnn;
+    // Per WHATWG URL Standard, query strings use document encoding with NCR fallback.
+    const iframe = document.createElement('iframe');
+    document.body.appendChild(iframe);
+    iframe.src = 'encoding/gbk.html';
+
+    testing.onload(() => {
+      testing.expectEqual('GBK', iframe.contentDocument.characterSet);
+
+      // Test 1: U+3D34 (㴴) - a Han character NOT in GBK, should become NCR &#15668;
+      const anchor = iframe.contentDocument.createElement('a');
+      iframe.contentDocument.body.appendChild(anchor);
+      anchor.href = 'http://example.com/?q=\u3D34';
+      // The NCR &#15668; percent-encoded is %26%2315668%3B
+      testing.expectEqual('http://example.com/?q=%26%2315668%3B', anchor.href);
+
+      // Test 2: U+4E2D (中) - IS in GBK, should encode to GBK bytes D6D0 then percent-encode
+      const anchor2 = iframe.contentDocument.createElement('a');
+      iframe.contentDocument.body.appendChild(anchor2);
+      anchor2.href = 'http://example.com/?q=\u4E2D';
+      // GBK encoding of 中 is D6 D0, percent-encoded as %D6%D0
+      testing.expectEqual('http://example.com/?q=%D6%D0', anchor2.href);
+    });
+  }
+</script>
diff --git a/src/browser/webapi/Node.zig b/src/browser/webapi/Node.zig
index 0e7c2ffe..5871abee 100644
--- a/src/browser/webapi/Node.zig
+++ b/src/browser/webapi/Node.zig
@@ -22,6 +22,7 @@ const String = @import("../../string.zig").String;
 
 const js = @import("../js/js.zig");
 const Page = @import("../Page.zig");
+const URL = @import("../URL.zig");
 const reflect = @import("../reflect.zig");
 
 const EventTarget = @import("EventTarget.zig");
@@ -511,6 +512,18 @@ pub fn ownerPage(self: *const Node, default: *Page) *Page {
     return doc._page orelse default;
 }
 
+pub const ResolveURLOpts = struct {
+    allocator: ?Allocator = null,
+};
+
+// Resolve a URL relative to this node's owning document.
+// Uses the document's charset for query string encoding (with NCR fallback for unmappable chars).
+pub fn resolveURL(self: *const Node, url: anytype, page: *Page, opts: ResolveURLOpts) ![:0]const u8 {
+    const owner_page = self.ownerPage(page);
+    const allocator = opts.allocator orelse page.call_arena;
+    return URL.resolve(allocator, owner_page.base(), url, .{ .encoding = owner_page.charset });
+}
+
 pub fn isSameDocumentAs(self: *const Node, other: *const Node, page: *const Page) bool {
     // Get the root document for each node
     const self_doc = if (self._type == .document) self._type.document else self.ownerDocument(page);
diff --git a/src/browser/webapi/element/html/Anchor.zig b/src/browser/webapi/element/html/Anchor.zig
index 33c8bded..e4207e84 100644
--- a/src/browser/webapi/element/html/Anchor.zig
+++ b/src/browser/webapi/element/html/Anchor.zig
@@ -39,12 +39,11 @@ pub fn asNode(self: *Anchor) *Node {
 }
 
 pub fn getHref(self: *Anchor, page: *Page) ![]const u8 {
-    const element = self.asElement();
-    const href = element.getAttributeSafe(comptime .wrap("href")) orelse return "";
+    const href = self.asElement().getAttributeSafe(comptime .wrap("href")) orelse return "";
     if (href.len == 0) {
         return "";
     }
-    return URL.resolve(page.call_arena, page.base(), href, .{ .encode = true });
+    return self.asNode().resolveURL(href, page, .{});
 }
 
 pub fn setHref(self: *Anchor, value: []const u8, page: *Page) !void {
@@ -203,7 +202,7 @@ fn getResolvedHref(self: *Anchor, page: *Page) !?[:0]const u8 {
     if (href.len == 0) {
         return null;
     }
-    return try URL.resolve(page.call_arena, page.base(), href, .{});
+    return try self.asNode().resolveURL(href, page, .{});
 }
 
 pub const JsApi = struct {
diff --git a/src/browser/webapi/element/html/Form.zig b/src/browser/webapi/element/html/Form.zig
index e8857e48..6628306b 100644
--- a/src/browser/webapi/element/html/Form.zig
+++ b/src/browser/webapi/element/html/Form.zig
@@ -97,7 +97,7 @@ pub fn getAction(self: *Form, page: *Page) ![]const u8 {
     if (action.len == 0) {
         return page.url;
     }
-    return URL.resolve(page.call_arena, page.base(), action, .{ .encode = true });
+    return element.asNode().resolveURL(action, page, .{});
 }
 
 pub fn setAction(self: *Form, value: []const u8, page: *Page) !void {
diff --git a/src/browser/webapi/element/html/IFrame.zig b/src/browser/webapi/element/html/IFrame.zig
index e596f4ac..3b276dcd 100644
--- a/src/browser/webapi/element/html/IFrame.zig
+++ b/src/browser/webapi/element/html/IFrame.zig
@@ -48,9 +48,9 @@ pub fn getContentDocument(self: *const IFrame) ?*Document {
     return window._document;
 }
 
-pub fn getSrc(self: *const IFrame, page: *Page) ![:0]const u8 {
+pub fn getSrc(self: *IFrame, page: *Page) ![:0]const u8 {
     if (self._src.len == 0) return "";
-    return try URL.resolve(page.call_arena, page.base(), self._src, .{ .encode = true });
+    return self.asNode().resolveURL(self._src, page, .{});
 }
 
 pub fn setSrc(self: *IFrame, src: []const u8, page: *Page) !void {
diff --git a/src/browser/webapi/element/html/Image.zig b/src/browser/webapi/element/html/Image.zig
index b6731144..e3b57cd9 100644
--- a/src/browser/webapi/element/html/Image.zig
+++ b/src/browser/webapi/element/html/Image.zig
@@ -40,9 +40,7 @@ pub fn getSrc(self: *const Image, page: *Page) ![]const u8 {
     if (src.len == 0) {
         return "";
     }
-
-    // Always resolve the src against the page URL
-    return URL.resolve(page.call_arena, page.base(), src, .{ .encode = true });
+    return element.asConstNode().resolveURL(src, page, .{});
 }
 
 pub fn setSrc(self: *Image, value: []const u8, page: *Page) !void {
diff --git a/src/browser/webapi/element/html/Link.zig b/src/browser/webapi/element/html/Link.zig
index ed3839f2..5b6ce0c6 100644
--- a/src/browser/webapi/element/html/Link.zig
+++ b/src/browser/webapi/element/html/Link.zig
@@ -44,9 +44,7 @@ pub fn getHref(self: *Link, page: *Page) ![]const u8 {
     if (href.len == 0) {
         return "";
     }
-
-    // Always resolve the href against the page URL
-    return URL.resolve(page.call_arena, page.base(), href, .{ .encode = true });
+    return element.asNode().resolveURL(href, page, .{});
 }
 
 pub fn setHref(self: *Link, value: []const u8, page: *Page) !void {
diff --git a/src/browser/webapi/element/html/Media.zig b/src/browser/webapi/element/html/Media.zig
index 71013e71..6d62013f 100644
--- a/src/browser/webapi/element/html/Media.zig
+++ b/src/browser/webapi/element/html/Media.zig
@@ -235,8 +235,7 @@ pub fn getSrc(self: *const Media, page: *Page) ![]const u8 {
     if (src.len == 0) {
         return "";
     }
-    const URL = @import("../../URL.zig");
-    return URL.resolve(page.call_arena, page.base(), src, .{ .encode = true });
+    return element.asConstNode().resolveURL(src, page, .{});
 }
 
 pub fn setSrc(self: *Media, value: []const u8, page: *Page) !void {
diff --git a/src/browser/webapi/element/html/Script.zig b/src/browser/webapi/element/html/Script.zig
index d5e83b4f..77b6b7ef 100644
--- a/src/browser/webapi/element/html/Script.zig
+++ b/src/browser/webapi/element/html/Script.zig
@@ -45,9 +45,9 @@ pub fn asNode(self: *Script) *Node {
     return self.asElement().asNode();
 }
 
-pub fn getSrc(self: *const Script, page: *Page) ![]const u8 {
+pub fn getSrc(self: *Script, page: *Page) ![]const u8 {
     if (self._src.len == 0) return "";
-    return try URL.resolve(page.call_arena, page.base(), self._src, .{ .encode = true });
+    return self.asNode().resolveURL(self._src, page, .{});
 }
 
 pub fn setSrc(self: *Script, src: []const u8, page: *Page) !void {
diff --git a/src/browser/webapi/element/html/Video.zig b/src/browser/webapi/element/html/Video.zig
index 63ccda4a..8fabb3ae 100644
--- a/src/browser/webapi/element/html/Video.zig
+++ b/src/browser/webapi/element/html/Video.zig
@@ -57,9 +57,7 @@ pub fn getPoster(self: *const Video, page: *Page) ![]const u8 {
     if (poster.len == 0) {
         return "";
     }
-
-    const URL = @import("../../URL.zig");
-    return URL.resolve(page.call_arena, page.base(), poster, .{ .encode = true });
+    return element.asConstNode().resolveURL(poster, page, .{});
 }
 
 pub fn setPoster(self: *Video, value: []const u8, page: *Page) !void {
diff --git a/src/browser/webapi/net/WebSocket.zig b/src/browser/webapi/net/WebSocket.zig
index 1244a61e..c5228627 100644
--- a/src/browser/webapi/net/WebSocket.zig
+++ b/src/browser/webapi/net/WebSocket.zig
@@ -108,7 +108,7 @@ pub fn init(url: []const u8, protocols_: ?[]const u8, page: *Page) !*WebSocket {
     const arena = try page.getArena(.{ .debug = "WebSocket" });
     errdefer page.releaseArena(arena);
 
-    const resolved_url = try URL.resolve(arena, page.base(), url, .{ .always_dupe = true, .encode = true });
+    const resolved_url = try URL.resolve(arena, page.base(), url, .{ .always_dupe = true, .encoding = page.charset });
 
     const http_client = page._session.browser.http_client;
     const conn = http_client.network.newConnection() orelse {
diff --git a/src/browser/webapi/net/XMLHttpRequest.zig b/src/browser/webapi/net/XMLHttpRequest.zig
index 62e05a17..8a56d370 100644
--- a/src/browser/webapi/net/XMLHttpRequest.zig
+++ b/src/browser/webapi/net/XMLHttpRequest.zig
@@ -210,7 +210,7 @@ pub fn open(self: *XMLHttpRequest, method_: []const u8, url: [:0]const u8) !void
 
     const page = self._page;
     self._method = try parseMethod(method_);
-    self._url = try URL.resolve(self._arena, page.base(), url, .{ .always_dupe = true, .encode = true });
+    self._url = try URL.resolve(self._arena, page.base(), url, .{ .always_dupe = true, .encoding = page.charset });
     try self.stateChanged(.opened, page);
 }
 
diff --git a/src/cdp/domains/page.zig b/src/cdp/domains/page.zig
index 267cada8..beb86c6b 100644
--- a/src/cdp/domains/page.zig
+++ b/src/cdp/domains/page.zig
@@ -286,7 +286,7 @@ fn navigate(cmd: *CDP.Command) !void {
         page = try session.replacePage();
     }
 
-    const encoded_url = try URL.ensureEncoded(page.call_arena, params.url);
+    const encoded_url = try URL.ensureEncoded(page.call_arena, params.url, "UTF-8");
     try page.navigate(encoded_url, .{
         .reason = .address_bar,
         .cdp_id = cmd.input.id,
diff --git a/src/cdp/domains/target.zig b/src/cdp/domains/target.zig
index bce7e00d..822659f7 100644
--- a/src/cdp/domains/target.zig
+++ b/src/cdp/domains/target.zig
@@ -220,7 +220,7 @@ fn createTarget(cmd: *CDP.Command) !void {
     }
 
     if (!std.mem.eql(u8, "about:blank", params.url)) {
-        const encoded_url = try URL.ensureEncoded(page.call_arena, params.url);
+        const encoded_url = try URL.ensureEncoded(page.call_arena, params.url, "UTF-8");
         try page.navigate(
             encoded_url,
             .{ .reason = .address_bar, .kind = .{ .push = null } },
diff --git a/src/html5ever/lib.rs b/src/html5ever/lib.rs
index c684c039..9d14e784 100644
--- a/src/html5ever/lib.rs
+++ b/src/html5ever/lib.rs
@@ -334,6 +334,120 @@ pub extern "C" fn encoding_decoder_free(decoder_ptr: *mut c_void) {
     }
 }
 
+// === Encoding API (UTF-8 to legacy encoding with NCR fallback) ===
+
+/// Result of `encoding_encode_with_ncr`
+#[repr(C)]
+pub struct EncodeResult {
+    /// 0 = success, 1 = failure (null handle/output, invalid UTF-8 input, or output buffer too small)
+    pub status: u8,
+    /// Number of input bytes consumed
+    pub bytes_read: usize,
+    /// Number of bytes written to output buffer
+    pub bytes_written: usize,
+}
+
+/// Encode UTF-8 `input` into the encoding behind `handle`, writing the bytes to `output`.
+///
+/// Unencodable characters become HTML decimal numeric character references
+/// (`&#codepoint;`, produced by encoding_rs's `encode_from_utf8`) for URL query strings.
+/// `status` is 0 on success and 1 when `handle` or `output` is null, `input` is not
+/// valid UTF-8, or `output_capacity` is too small. A null `input` is treated as empty.
+#[no_mangle]
+pub extern "C" fn encoding_encode_with_ncr(
+    handle: *const c_void,
+    input: *const c_uchar,
+    input_len: usize,
+    output: *mut c_uchar,
+    output_capacity: usize,
+) -> EncodeResult {
+    if handle.is_null() || output.is_null() {
+        return EncodeResult {
+            status: 1,
+            bytes_read: 0,
+            bytes_written: 0,
+        };
+    }
+
+    let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) };
+
+    let input_str = if input.is_null() || input_len == 0 {
+        ""
+    } else {
+        let bytes = unsafe { std::slice::from_raw_parts(input, input_len) };
+        match std::str::from_utf8(bytes) {
+            Ok(s) => s,
+            Err(_) => {
+                return EncodeResult {
+                    status: 1,
+                    bytes_read: 0,
+                    bytes_written: 0,
+                };
+            }
+        }
+    };
+    let output_slice = unsafe { std::slice::from_raw_parts_mut(output, output_capacity) };
+
+    // For UTF-8 encoding, just copy directly (no NCR needed)
+    if encoding == encoding_rs::UTF_8 {
+        // Size the copy from the validated slice, not `input_len`: they differ when `input`
+        // is null, and `copy_from_slice` panics on a length mismatch (fatal across FFI).
+        let src = input_str.as_bytes();
+        if src.len() > output_capacity {
+            return EncodeResult {
+                bytes_read: 0,
+                bytes_written: 0,
+                status: 1,
+            };
+        }
+        output_slice[..src.len()].copy_from_slice(src);
+        return EncodeResult {
+            bytes_read: src.len(),
+            bytes_written: src.len(),
+            status: 0,
+        };
+    }
+
+    let mut encoder = encoding.new_encoder();
+    let (result, bytes_read, bytes_written, _had_unmappables) =
+        encoder.encode_from_utf8(input_str, output_slice, true);
+    match result {
+        encoding_rs::CoderResult::InputEmpty => EncodeResult {
+            bytes_read,
+            bytes_written,
+            status: 0,
+        },
+        encoding_rs::CoderResult::OutputFull => EncodeResult {
+            bytes_read,
+            bytes_written,
+            status: 1,
+        },
+    }
+}
+
+/// Upper bound on the bytes `encoding_encode_with_ncr` may write for `input_len` input bytes.
+///
+/// Returns 0 if `handle` is null or the bound overflows `usize`.
+/// encoding_rs's estimate only holds when nothing is unmappable, so room for NCRs is
+/// added: each input byte starts at most one character and the longest NCR
+/// (`&#1114111;`) is 10 bytes. Doubling the estimate was not enough: U+07FF is 2 bytes
+/// of UTF-8 but 7 bytes as `&#2047;`, so NCR-heavy input ended in OutputFull.
+#[no_mangle]
+pub extern "C" fn encoding_max_encode_buffer_length(
+    handle: *const c_void,
+    input_len: usize,
+) -> usize {
+    if handle.is_null() {
+        return 0;
+    }
+    let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) };
+    let encoder = encoding.new_encoder();
+    encoder
+        .max_buffer_length_from_utf8_if_no_unmappables(input_len)
+        .and_then(|mappable| mappable.checked_add(input_len.checked_mul(10)?))
+        .unwrap_or(0)
+}
+
 #[no_mangle]
 pub extern "C" fn html5ever_parse_fragment(
     html: *mut c_uchar,
diff --git a/src/lightpanda.zig b/src/lightpanda.zig
index 4d6c23fb..b0356e93 100644
--- a/src/lightpanda.zig
+++ b/src/lightpanda.zig
@@ -107,7 +107,7 @@ pub fn fetch(app: *App, url: [:0]const u8, opts: FetchOpts) !void {
     //     }
     // }
 
-    const encoded_url = try URL.ensureEncoded(page.call_arena, url);
+    const encoded_url = try URL.ensureEncoded(page.call_arena, url, "UTF-8");
     _ = try page.navigate(encoded_url, .{
         .reason = .address_bar,
         .kind = .{ .push = null },

From a5bf1f07afdb6b1d2bcff15702022d0a297144e4 Mon Sep 17 00:00:00 2001
From: Karl Seguin <k@openmymind.io>
Date: Fri, 10 Apr 2026 15:09:32 +0800
Subject: [PATCH 4/5] chore: trigger CI


From 7c6624014683e845708db81b45e77b2930e6cb56 Mon Sep 17 00:00:00 2001
From: Karl Seguin <k@openmymind.io>
Date: Fri, 10 Apr 2026 15:41:38 +0800
Subject: [PATCH 5/5] chore: trigger CI