From 828715b751b19190f36c13f5e9fdcb7dee308730 Mon Sep 17 00:00:00 2001 From: Karl Seguin Date: Fri, 10 Apr 2026 10:36:29 +0800 Subject: [PATCH 1/5] Improve TextDecoder to support all necessary encoding types Uses the newly added encoding_rs to implement TextDecoder for all encoding. Claude wrote 100% of the Rust binding. Improves various WPT tests, e.g. /encoding/api-basics.any.html. --- src/browser/parser/html5ever.zig | 62 +++++++ src/browser/tests/encoding/text_decoder.html | 73 +++++++- src/browser/tests/encoding/text_encoder.html | 3 + src/browser/tests/testing.js | 8 +- src/browser/webapi/encoding/TextDecoder.zig | 149 ++++++++++++--- src/browser/webapi/encoding/TextEncoder.zig | 19 +- src/html5ever/lib.rs | 183 +++++++++++++++++++ 7 files changed, 458 insertions(+), 39 deletions(-) diff --git a/src/browser/parser/html5ever.zig b/src/browser/parser/html5ever.zig index f6f81583..cb673789 100644 --- a/src/browser/parser/html5ever.zig +++ b/src/browser/parser/html5ever.zig @@ -216,3 +216,65 @@ pub extern "c" fn xml5ever_parse_document( appendBeforeSiblingCallback: *const fn (ctx: *anyopaque, sibling_ref: *anyopaque, NodeOrText) callconv(.c) void, appendBasedOnParentNodeCallback: *const fn (ctx: *anyopaque, element_ref: *anyopaque, prev_element_ref: *anyopaque, NodeOrText) callconv(.c) void, ) void; + +// General encoding api +pub const EncodingInfo = extern struct { + found: u8, + handle: ?*anyopaque, + name_len: usize, + name_ptr: [*]const u8, + + pub fn isValid(self: *const EncodingInfo) bool { + return self.found != 0; + } + + pub fn name(self: *const EncodingInfo) []const u8 { + if (self.name_len == 0) { + return ""; + } + return self.name_ptr[0..self.name_len]; + } +}; + +pub const DecodeResult = extern struct { + had_errors: u8, + bytes_read: usize, + bytes_written: usize, + + pub fn hadErrors(self: *const DecodeResult) bool { + return self.had_errors != 0; + } +}; + +pub extern "c" fn encoding_for_label( + label: [*]const u8, + label_len: usize, +) 
EncodingInfo; + +pub extern "c" fn encoding_max_utf8_buffer_length( + handle: *anyopaque, + input_len: usize, +) usize; + +pub extern "c" fn encoding_decode( + handle: *anyopaque, + input: ?[*]const u8, + input_len: usize, + output: [*]u8, + output_len: usize, + is_last: u8, +) DecodeResult; + +// Streaming decoder API +pub extern "c" fn encoding_decoder_new(handle: *anyopaque) ?*anyopaque; + +pub extern "c" fn encoding_decoder_decode( + decoder: *anyopaque, + input: ?[*]const u8, + input_len: usize, + output: [*]u8, + output_len: usize, + is_last: u8, +) DecodeResult; + +pub extern "c" fn encoding_decoder_free(decoder: *anyopaque) void; diff --git a/src/browser/tests/encoding/text_decoder.html b/src/browser/tests/encoding/text_decoder.html index 2b01852e..6314c924 100644 --- a/src/browser/tests/encoding/text_decoder.html +++ b/src/browser/tests/encoding/text_decoder.html @@ -11,7 +11,6 @@ testing.expectEqual('', d1.decode()); testing.expectEqual('香料', d1.decode(new Uint8Array([233, 166, 153, 230, 150, 153]))); testing.expectEqual('香料', d1.decode(new Uint8Array([0xEF, 0xBB, 0xBF, 233, 166, 153, 230, 150, 153]))); - testing.expectEqual('�4', d1.decode(new Uint8Array([249, 52]))); { const buffer = new ArrayBuffer(6); @@ -38,7 +37,7 @@ } let d2 = new TextDecoder('utf8', {fatal: true}) - testing.expectError('Error: InvalidUtf8', () => { + testing.expectError('TypeError', () => { let data = new Uint8Array([241, 241, 159, 172]); d2.decode(data); }); @@ -46,8 +45,8 @@ + + + + + + diff --git a/src/browser/tests/encoding/text_encoder.html b/src/browser/tests/encoding/text_encoder.html index 540f60c1..99fd1959 100644 --- a/src/browser/tests/encoding/text_encoder.html +++ b/src/browser/tests/encoding/text_encoder.html @@ -5,6 +5,9 @@ diff --git a/src/browser/tests/testing.js b/src/browser/tests/testing.js index 037d15cf..12d0f761 100644 --- a/src/browser/tests/testing.js +++ b/src/browser/tests/testing.js @@ -37,7 +37,13 @@ function expectError(expected, fn) { withError((err) 
=> { - expectEqual(true, err.toString().includes(expected)); + if (!err.toString().includes(expected)) { + console.error(`Expecte error to contains: ${expected}, was: ${err.toString()}`); + expectEqual(true, false); + } else { + // to record a successful case + expectTrue(true); + } }, fn); } diff --git a/src/browser/webapi/encoding/TextDecoder.zig b/src/browser/webapi/encoding/TextDecoder.zig index c117df09..1467aa86 100644 --- a/src/browser/webapi/encoding/TextDecoder.zig +++ b/src/browser/webapi/encoding/TextDecoder.zig @@ -1,4 +1,4 @@ -// Copyright (C) 2023-2025 Lightpanda (Selecy SAS) +// Copyright (C) 2023-2026 Lightpanda (Selecy SAS) // // Francis Bouvier // Pierre Tachoire @@ -19,6 +19,7 @@ const std = @import("std"); const lp = @import("lightpanda"); const js = @import("../../js/js.zig"); +const html5ever = @import("../../parser/html5ever.zig"); const Page = @import("../../Page.zig"); const Session = @import("../../Session.zig"); @@ -30,13 +31,11 @@ _rc: lp.RC(u8) = .{}, _fatal: bool, _arena: Allocator, _ignore_bom: bool, -_stream: std.ArrayList(u8), - -const Label = enum { - utf8, - @"utf-8", - @"unicode-1-1-utf-8", -}; +_bom_seen: bool, +_decoder: ?*anyopaque, // Persistent streaming decoder +_encoding_handle: *anyopaque, +_encoding_name: []const u8, +_lowercase_name: []const u8, // Cached lowercase version of encoding name const InitOpts = struct { fatal: bool = false, @@ -44,8 +43,17 @@ const InitOpts = struct { }; pub fn init(label_: ?[]const u8, opts_: ?InitOpts, page: *Page) !*TextDecoder { - if (label_) |label| { - _ = std.meta.stringToEnum(Label, label) orelse return error.RangeError; + const label = label_ orelse "utf-8"; + + const info = html5ever.encoding_for_label(label.ptr, label.len); + if (!info.isValid()) { + return error.RangeError; + } + + // Check for "replacement" encoding - it's not usable for decoding per spec + const enc_name = info.name(); + if (std.mem.eql(u8, enc_name, "replacement")) { + return error.RangeError; } const arena = 
try page.getArena(.{ .debug = "TextDecoder" }); @@ -55,14 +63,21 @@ pub fn init(label_: ?[]const u8, opts_: ?InitOpts, page: *Page) !*TextDecoder { const self = try arena.create(TextDecoder); self.* = .{ ._arena = arena, - ._stream = .empty, ._fatal = opts.fatal, ._ignore_bom = opts.ignoreBOM, + ._encoding_handle = info.handle.?, + ._decoder = null, + ._bom_seen = false, + ._lowercase_name = "", // Will be lazily allocated + ._encoding_name = enc_name, // Points to static Rust memory }; return self; } pub fn deinit(self: *TextDecoder, session: *Session) void { + if (self._decoder) |decoder| { + html5ever.encoding_decoder_free(decoder); + } session.releaseArena(self._arena); } @@ -82,34 +97,110 @@ pub fn getFatal(self: *const TextDecoder) bool { return self._fatal; } +pub fn getEncoding(self: *TextDecoder) ![]const u8 { + // Spec requires lowercase encoding name + // Allocate buffer for lowercase name on first access + if (self._lowercase_name.len > 0) { + return self._lowercase_name; + } + self._lowercase_name = try std.ascii.allocLowerString(self._arena, self._encoding_name); + return self._lowercase_name; +} + const DecodeOpts = struct { stream: bool = false, }; + pub fn decode(self: *TextDecoder, input_: ?[]const u8, opts_: ?DecodeOpts) ![]const u8 { - var input = input_ orelse return ""; const opts: DecodeOpts = opts_ orelse .{}; + const input = input_ orelse ""; - if (self._stream.items.len > 0) { - try self._stream.appendSlice(self._arena, input); - input = self._stream.items; - } - - if (self._fatal and !std.unicode.utf8ValidateSlice(input)) { - if (opts.stream) { - if (self._stream.items.len == 0) { - try self._stream.appendSlice(self._arena, input); - } - return ""; + // For non-streaming calls, we don't need a persistent decoder + if (!opts.stream) { + // Reset decoder state if we had one + if (self._decoder) |decoder| { + html5ever.encoding_decoder_free(decoder); + self._decoder = null; + } + } else if (self._decoder == null) { + self._decoder = 
html5ever.encoding_decoder_new(self._encoding_handle); + if (self._decoder == null) { + return error.OutOfMemory; } - return error.InvalidUtf8; } - self._stream.clearRetainingCapacity(); - if (self._ignore_bom == false and std.mem.startsWith(u8, input, &.{ 0xEF, 0xBB, 0xBF })) { - return input[3..]; + return self._decode(input, self._decoder); +} + +fn _decode(self: *TextDecoder, input: []const u8, streaming_decoder: ?*anyopaque) ![]const u8 { + if (input.len == 0) { + return ""; } - return input; + // Calculate max output size + const max_out = html5ever.encoding_max_utf8_buffer_length( + self._encoding_handle, + input.len, + ); + + if (max_out == 0) { + return ""; + } + + // Allocate output buffer + const output = try self._arena.alloc(u8, max_out); + + // Decode using either streaming or one-shot decoder + const result = if (streaming_decoder) |decoder| + html5ever.encoding_decoder_decode( + decoder, + input.ptr, + input.len, + output.ptr, + output.len, + 0, // is_last = false for streaming + ) + else + html5ever.encoding_decode( + self._encoding_handle, + input.ptr, + input.len, + output.ptr, + output.len, + 1, // is_last = true for one-shot + ); + + // Handle errors in fatal mode + if (self._fatal and result.hadErrors()) { + if (streaming_decoder != null) { + // Reset decoder on error + if (self._decoder) |decoder| { + html5ever.encoding_decoder_free(decoder); + self._decoder = null; + } + } + self._bom_seen = false; + return error.TypeError; + } + + var decoded: []const u8 = output[0..result.bytes_written]; + + // Handle BOM stripping + if (!self._bom_seen and !self._ignore_bom) { + decoded = stripBom(decoded); + self._bom_seen = true; + } + + return decoded; +} + +fn stripBom(data: []const u8) []const u8 { + // UTF-8 BOM in decoded output appears as U+FEFF (EF BB BF in UTF-8) + const bom = "\u{FEFF}"; + if (std.mem.startsWith(u8, data, bom)) { + return data[bom.len..]; + } + return data; } pub const JsApi = struct { @@ -123,7 +214,7 @@ pub const JsApi = 
struct { pub const constructor = bridge.constructor(TextDecoder.init, .{}); pub const decode = bridge.function(TextDecoder.decode, .{}); - pub const encoding = bridge.property("utf-8", .{ .template = false }); + pub const encoding = bridge.accessor(TextDecoder.getEncoding, null, .{}); pub const fatal = bridge.accessor(TextDecoder.getFatal, null, .{}); pub const ignoreBOM = bridge.accessor(TextDecoder.getIgnoreBOM, null, .{}); }; diff --git a/src/browser/webapi/encoding/TextEncoder.zig b/src/browser/webapi/encoding/TextEncoder.zig index a6bff48e..112d2e32 100644 --- a/src/browser/webapi/encoding/TextEncoder.zig +++ b/src/browser/webapi/encoding/TextEncoder.zig @@ -1,4 +1,4 @@ -// Copyright (C) 2023-2025 Lightpanda (Selecy SAS) +// Copyright (C) 2023-2026 Lightpanda (Selecy SAS) // // Francis Bouvier // Pierre Tachoire @@ -26,12 +26,23 @@ pub fn init() TextEncoder { return .{}; } -pub fn encode(_: *const TextEncoder, v: []const u8) !js.TypedArray(u8) { - if (!std.unicode.utf8ValidateSlice(v)) { +pub fn encode(_: *const TextEncoder, v_: ?js.Value) !js.TypedArray(u8) { + const v = v_ orelse return .{ .values = "" }; + + if (v.isUndefined()) { + return .{ .values = "" }; + } + + if (v.isNull()) { + return .{ .values = "null" }; + } + + const str = try v.toStringSlice(); + if (!std.unicode.utf8ValidateSlice(str)) { return error.InvalidUtf8; } - return .{ .values = v }; + return .{ .values = str }; } pub const JsApi = struct { diff --git a/src/html5ever/lib.rs b/src/html5ever/lib.rs index 6fab9763..c684c039 100644 --- a/src/html5ever/lib.rs +++ b/src/html5ever/lib.rs @@ -151,6 +151,189 @@ pub extern "C" fn html5ever_parse_document_with_encoding( .one(StrTendril::from(decoded.as_ref())); } +// === Encoding API for TextDecoder === + +/// Result of encoding label lookup +#[repr(C)] +pub struct EncodingInfo { + /// 0 = not found, 1 = found + pub found: u8, + /// Opaque handle to the encoding (actually &'static Encoding) + pub handle: *const c_void, + /// Length of canonical 
name + pub name_len: usize, + /// Pointer to canonical encoding name (static, lowercase) + pub name_ptr: *const c_uchar, +} + +/// Look up an encoding by its label (case-insensitive, whitespace-trimmed) +#[no_mangle] +pub extern "C" fn encoding_for_label( + label: *const c_uchar, + label_len: usize, +) -> EncodingInfo { + if label.is_null() || label_len == 0 { + return EncodingInfo { + found: 0, + name_len: 0, + handle: std::ptr::null(), + name_ptr: std::ptr::null(), + }; + } + + let label_bytes = unsafe { std::slice::from_raw_parts(label, label_len) }; + + match Encoding::for_label(label_bytes) { + Some(encoding) => { + let name = encoding.name(); + EncodingInfo { + found: 1, + name_len: name.len(), + name_ptr: name.as_ptr(), + handle: encoding as *const _ as *const c_void, + } + } + None => EncodingInfo { + found: 0, + name_len: 0, + name_ptr: std::ptr::null(), + handle: std::ptr::null(), + }, + } +} + +/// Calculate maximum UTF-8 buffer size needed for decoding +#[no_mangle] +pub extern "C" fn encoding_max_utf8_buffer_length( + handle: *const c_void, + input_len: usize, +) -> usize { + if handle.is_null() { + return 0; + } + let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) }; + let decoder = encoding.new_decoder(); + decoder.max_utf8_buffer_length(input_len).unwrap_or(0) +} + +/// Result of decoding operation +#[repr(C)] +pub struct DecodeResult { + /// 0 = no errors, 1 = had malformed sequences (replaced with U+FFFD) + pub had_errors: u8, + /// Number of input bytes consumed + pub bytes_read: usize, + /// Number of UTF-8 bytes written to output buffer + pub bytes_written: usize, +} + +/// Decode bytes from source encoding to UTF-8 +/// For streaming, set is_last=0; for final/complete decode, set is_last=1 +#[no_mangle] +pub extern "C" fn encoding_decode( + handle: *const c_void, + input: *const c_uchar, + input_len: usize, + output: *mut c_uchar, + output_len: usize, + is_last: u8, +) -> DecodeResult { + if handle.is_null() || 
output.is_null() { + return DecodeResult { + had_errors: 1, + bytes_read: 0, + bytes_written: 0, + }; + } + + let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) }; + let input_bytes = if input.is_null() || input_len == 0 { + &[] + } else { + unsafe { std::slice::from_raw_parts(input, input_len) } + }; + let output_slice = unsafe { std::slice::from_raw_parts_mut(output, output_len) }; + + let mut decoder = encoding.new_decoder(); + let last = is_last != 0; + + let (result, bytes_read, bytes_written, had_errors) = + decoder.decode_to_utf8(input_bytes, output_slice, last); + + // If output buffer was too small, we still report what we could process + let _ = result; // CoderResult::InputEmpty or CoderResult::OutputFull + + DecodeResult { + had_errors: if had_errors { 1 } else { 0 }, + bytes_read, + bytes_written, + } +} + +// === Streaming Decoder API === + +use encoding_rs::Decoder; + +/// Create a streaming decoder that maintains state across calls +#[no_mangle] +pub extern "C" fn encoding_decoder_new(handle: *const c_void) -> *mut c_void { + if handle.is_null() { + return std::ptr::null_mut(); + } + let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) }; + let decoder = Box::new(encoding.new_decoder()); + Box::into_raw(decoder) as *mut c_void +} + +/// Decode using a streaming decoder (maintains state for incomplete sequences) +#[no_mangle] +pub extern "C" fn encoding_decoder_decode( + decoder_ptr: *mut c_void, + input: *const c_uchar, + input_len: usize, + output: *mut c_uchar, + output_len: usize, + is_last: u8, +) -> DecodeResult { + if decoder_ptr.is_null() || output.is_null() { + return DecodeResult { + had_errors: 1, + bytes_read: 0, + bytes_written: 0, + }; + } + + let decoder: &mut Decoder = unsafe { &mut *(decoder_ptr as *mut Decoder) }; + let input_bytes = if input.is_null() || input_len == 0 { + &[] + } else { + unsafe { std::slice::from_raw_parts(input, input_len) } + }; + let output_slice = unsafe { 
std::slice::from_raw_parts_mut(output, output_len) }; + + let last = is_last != 0; + let (result, bytes_read, bytes_written, had_errors) = + decoder.decode_to_utf8(input_bytes, output_slice, last); + + let _ = result; + + DecodeResult { + had_errors: if had_errors { 1 } else { 0 }, + bytes_read, + bytes_written, + } +} + +/// Free a streaming decoder +#[no_mangle] +pub extern "C" fn encoding_decoder_free(decoder_ptr: *mut c_void) { + if !decoder_ptr.is_null() { + unsafe { + drop(Box::from_raw(decoder_ptr as *mut Decoder)); + } + } +} + #[no_mangle] pub extern "C" fn html5ever_parse_fragment( html: *mut c_uchar, From f7c1710c2354d19a1625b2ae969cc36bb9f578a4 Mon Sep 17 00:00:00 2001 From: Karl Seguin Date: Fri, 10 Apr 2026 12:02:06 +0800 Subject: [PATCH 2/5] Expose correct charset document.characterSet, document.charset and document.inputEncoding now exposes the correct charset. --- src/browser/Page.zig | 41 +++++++++++------------- src/browser/tests/document/document.html | 4 +++ src/browser/tests/page/encoding.html | 4 +++ src/browser/webapi/Document.zig | 11 +++++-- 4 files changed, 34 insertions(+), 26 deletions(-) diff --git a/src/browser/Page.zig b/src/browser/Page.zig index f12b606b..7c66faff 100644 --- a/src/browser/Page.zig +++ b/src/browser/Page.zig @@ -207,6 +207,9 @@ base_url: ?[:0]const u8 = null, // referer header cache. referer_header: ?[:0]const u8 = null, +// Document charset (canonical name from encoding_rs, static lifetime) +charset: []const u8 = "UTF-8", + // Arbitrary buffer. Need to temporarily lowercase a value? Use this. No lifetime // guarantee - it's valid until someone else uses it. 
buf: [BUF_SIZE]u8 = undefined, @@ -962,9 +965,13 @@ fn pageDataCallback(response: HttpClient.Response, data: []const u8) !void { switch (mime.content_type) { .text_html => { - self._parse_state = .{ .html = .{ - .mime = mime, - } }; + // Normalize and store the charset using encoding_rs canonical names + const charset_str = mime.charsetString(); + const info = h5e.encoding_for_label(charset_str.ptr, charset_str.len); + if (info.isValid()) { + self.charset = info.name(); + } + self._parse_state = .{ .html = .empty }; }, .application_json, .text_javascript, .text_css, .text_plain => { var arr: std.ArrayList(u8) = .empty; @@ -979,7 +986,7 @@ fn pageDataCallback(response: HttpClient.Response, data: []const u8) !void { } switch (self._parse_state) { - .html => |*html| try html.buf.appendSlice(self.arena, data), + .html => |*html| try html.appendSlice(self.arena, data), .text => |*buf| { // we have to escape the data... var v = data; @@ -1028,12 +1035,13 @@ fn pageDoneCallback(ctx: *anyopaque) !void { var parser = Parser.init(parse_arena, self.document.asNode(), self); switch (self._parse_state) { - .html => |*html_state| { - const raw_html = html_state.buf.items; - if (html_state.needsEncodingConversion()) { - parser.parseWithEncoding(raw_html, html_state.mime.charsetString()); - } else { + .html => |*html_buf| { + const raw_html = html_buf.items; + + if (std.mem.eql(u8, self.charset, "UTF-8")) { parser.parse(raw_html); + } else { + parser.parseWithEncoding(raw_html, self.charset); } self._script_manager.staticScriptsDone(); self._parse_state = .complete; @@ -3164,21 +3172,11 @@ const ParseState = union(enum) { pre, complete, err: anyerror, - html: Html, + html: std.ArrayList(u8), text: std.ArrayList(u8), image: std.ArrayList(u8), raw: std.ArrayList(u8), raw_done: []const u8, - - const Html = struct { - mime: Mime, - buf: std.ArrayList(u8) = .empty, - - fn needsEncodingConversion(self: *const Html) bool { - const charset = self.mime.charsetString(); - return 
!std.ascii.eqlIgnoreCase(charset, "utf-8") and !std.ascii.eqlIgnoreCase(charset, "utf8"); - } - }; }; const LoadState = enum { @@ -3628,9 +3626,6 @@ fn asUint(comptime string: anytype) std.meta.Int( const testing = @import("../testing.zig"); test "WebApi: Page" { - const filter: testing.LogFilter = .init(&.{ .http, .js }); - defer filter.deinit(); - try testing.htmlRunner("page", .{}); } diff --git a/src/browser/tests/document/document.html b/src/browser/tests/document/document.html index 74d8ff30..ede2b507 100644 --- a/src/browser/tests/document/document.html +++ b/src/browser/tests/document/document.html @@ -18,6 +18,10 @@ testing.expectEqual("visible", document.visibilityState); testing.expectEqual(false, document.prerendering); testing.expectEqual(undefined, Document.prerendering); + // characterSet should return canonical encoding name + testing.expectEqual("UTF-8", document.characterSet); + testing.expectEqual("UTF-8", document.charset); + testing.expectEqual("UTF-8", document.inputEncoding); diff --git a/src/browser/webapi/Document.zig b/src/browser/webapi/Document.zig index cf15c49b..6b48e4c3 100644 --- a/src/browser/webapi/Document.zig +++ b/src/browser/webapi/Document.zig @@ -1068,10 +1068,15 @@ pub const JsApi = struct { pub const hasFocus = bridge.function(Document.hasFocus, .{}); pub const prerendering = bridge.property(false, .{ .template = false }); - pub const characterSet = bridge.property("UTF-8", .{ .template = false }); - pub const charset = bridge.property("UTF-8", .{ .template = false }); - pub const inputEncoding = bridge.property("UTF-8", .{ .template = false }); + pub const characterSet = bridge.accessor(getCharacterSet, null, .{}); + pub const charset = bridge.accessor(getCharacterSet, null, .{}); + pub const inputEncoding = bridge.accessor(getCharacterSet, null, .{}); pub const compatMode = bridge.property("CSS1Compat", .{ .template = false }); + + fn getCharacterSet(self: *const Document) []const u8 { + const doc_page = self._page orelse 
return "UTF-8"; + return doc_page.charset; + } pub const referrer = bridge.property("", .{ .template = false }); }; From 05229fdc536645f9f21e2f40d5d2dfa2c5ed46e3 Mon Sep 17 00:00:00 2001 From: Karl Seguin Date: Fri, 10 Apr 2026 14:41:19 +0800 Subject: [PATCH 3/5] Use the document's charset to determine if/how to encode querystring Whenever we resolve a URL, say from `anchor.href`, we should consider the document's charset when encoding the querystring. This probably isn't the most important feature, but it makes tens of thousands of WPT cases pass, e.g. /encoding/legacy-mb-tchinese/big5/big5-encode-href-errors-han.html?3001-4000 and /encoding/legacy-mb-japanese/euc-jp/eucjp-encode-href-errors-han.html?17001-18000 DOM elements previously called `URL.resolveURL(...)`. They now call `self.asNode().resolveURL(...)`, where `Node#resolveURL` will provide the document's charset. --- src/SemanticTree.zig | 2 +- src/browser/Page.zig | 4 +- src/browser/URL.zig | 91 ++++++++++++---- src/browser/interactive.zig | 2 +- src/browser/markdown.zig | 6 +- src/browser/parser/html5ever.zig | 24 +++++ src/browser/structured_data.zig | 2 +- src/browser/tests/page/encoding.html | 29 ++++++ src/browser/webapi/Node.zig | 13 +++ src/browser/webapi/element/html/Anchor.zig | 7 +- src/browser/webapi/element/html/Form.zig | 2 +- src/browser/webapi/element/html/IFrame.zig | 4 +- src/browser/webapi/element/html/Image.zig | 4 +- src/browser/webapi/element/html/Link.zig | 4 +- src/browser/webapi/element/html/Media.zig | 3 +- src/browser/webapi/element/html/Script.zig | 4 +- src/browser/webapi/element/html/Video.zig | 4 +- src/browser/webapi/net/WebSocket.zig | 2 +- src/browser/webapi/net/XMLHttpRequest.zig | 2 +- src/cdp/domains/page.zig | 2 +- src/cdp/domains/target.zig | 2 +- src/html5ever/lib.rs | 114 +++++++++++++++++++++ src/lightpanda.zig | 2 +- 23 files changed, 276 insertions(+), 53 deletions(-) diff --git a/src/SemanticTree.zig b/src/SemanticTree.zig index 9bca520e..5b3f0ae5 100644 --- 
a/src/SemanticTree.zig +++ b/src/SemanticTree.zig @@ -671,7 +671,7 @@ pub fn getNodeDetails( if (el.getAttributeSafe(comptime .wrap("href"))) |h| { const URL = lp.URL; - href = URL.resolve(arena, page.base(), h, .{ .encode = true }) catch h; + href = URL.resolve(arena, page.base(), h, .{ .encoding = page.charset }) catch h; } if (el.is(Element.Html.Input)) |input| { diff --git a/src/browser/Page.zig b/src/browser/Page.zig index 7c66faff..1c3d39f0 100644 --- a/src/browser/Page.zig +++ b/src/browser/Page.zig @@ -661,7 +661,7 @@ fn scheduleNavigationWithArena(originator: *Page, arena: Allocator, request_url: arena, page_base, request_url, - .{ .always_dupe = true, .encode = true }, + .{ .always_dupe = true, .encoding = originator.charset }, ); break :blk .{ u, false }; }; @@ -1196,7 +1196,7 @@ pub fn iframeAddedCallback(self: *Page, iframe: *IFrame) !void { self.call_arena, // ok to use, page.navigate dupes this self.base(), src, - .{ .encode = true }, + .{ .encoding = self.charset }, ); }; diff --git a/src/browser/URL.zig b/src/browser/URL.zig index 6f8cbebd..532f11a1 100644 --- a/src/browser/URL.zig +++ b/src/browser/URL.zig @@ -19,16 +19,19 @@ const std = @import("std"); const Allocator = std.mem.Allocator; -const ResolveOpts = struct { - encode: bool = false, +pub const ResolveOpts = struct { + /// null = don't encode, "UTF-8" = standard percent encoding, + /// other charset = encode query string using that charset with NCR fallback + encoding: ?[]const u8 = null, always_dupe: bool = false, }; // path is anytype, so that it can be used with both []const u8 and [:0]const u8 -pub fn resolve(allocator: Allocator, base: [:0]const u8, source_path: anytype, comptime opts: ResolveOpts) ![:0]const u8 { +pub fn resolve(allocator: Allocator, base: [:0]const u8, source_path: anytype, opts: ResolveOpts) ![:0]const u8 { const PT = @TypeOf(source_path); - var path: [:0]const u8 = if (comptime !isNullTerminated(PT) or opts.always_dupe) try allocator.dupeZ(u8, source_path) else 
source_path; + const needs_dupe = comptime !isNullTerminated(PT); + var path: [:0]const u8 = if (needs_dupe or opts.always_dupe) try allocator.dupeZ(u8, source_path) else source_path; if (base.len == 0) { return processResolved(allocator, path, opts); @@ -186,14 +189,12 @@ pub fn resolve(allocator: Allocator, base: [:0]const u8, source_path: anytype, c return processResolved(allocator, out[0..out_i :0], opts); } -fn processResolved(allocator: Allocator, url: [:0]const u8, comptime opts: ResolveOpts) ![:0]const u8 { - if (!comptime opts.encode) { - return url; - } - return ensureEncoded(allocator, url); +fn processResolved(allocator: Allocator, url: [:0]const u8, opts: ResolveOpts) ![:0]const u8 { + const encoding = opts.encoding orelse return url; + return ensureEncoded(allocator, url, encoding); } -pub fn ensureEncoded(allocator: Allocator, url: [:0]const u8) ![:0]const u8 { +pub fn ensureEncoded(allocator: Allocator, url: [:0]const u8, encoding: []const u8) ![:0]const u8 { const scheme_end = std.mem.indexOf(u8, url, "://"); const authority_start = if (scheme_end) |end| end + 3 else 0; const path_start = std.mem.indexOfScalarPos(u8, url, authority_start, '/') orelse return url; @@ -205,18 +206,18 @@ pub fn ensureEncoded(allocator: Allocator, url: [:0]const u8) ![:0]const u8 { const query_end = if (query_start) |_| (fragment_start orelse url.len) else path_end; const path_to_encode = url[path_start..path_end]; + // Path is always UTF-8 percent encoded per URL spec const encoded_path = try percentEncodeSegment(allocator, path_to_encode, .path); + // Query string uses document encoding const encoded_query = if (query_start) |qs| blk: { const query_to_encode = url[qs + 1 .. 
query_end]; - const encoded = try percentEncodeSegment(allocator, query_to_encode, .query); - break :blk encoded; + break :blk try encodeQueryString(allocator, query_to_encode, encoding); } else null; const encoded_fragment = if (fragment_start) |fs| blk: { const fragment_to_encode = url[fs + 1 ..]; - const encoded = try percentEncodeSegment(allocator, fragment_to_encode, .query); - break :blk encoded; + break :blk try percentEncodeSegment(allocator, fragment_to_encode, .query); } else null; if (encoded_path.ptr == path_to_encode.ptr and @@ -242,7 +243,7 @@ pub fn ensureEncoded(allocator: Allocator, url: [:0]const u8) ![:0]const u8 { return buf.items[0 .. buf.items.len - 1 :0]; } -const EncodeSet = enum { path, query, userinfo, fragment }; +const EncodeSet = enum { path, query, query_legacy, userinfo, fragment }; fn percentEncodeSegment(allocator: Allocator, segment: []const u8, comptime encode_set: EncodeSet) ![]const u8 { // Check if encoding is needed @@ -285,17 +286,65 @@ fn percentEncodeSegment(allocator: Allocator, segment: []const u8, comptime enco return buf.items; } +const h5e = @import("parser/html5ever.zig"); + +/// Encode a query string using the specified encoding. +/// For UTF-8, this is standard percent encoding. +/// For legacy encodings, unmappable characters are replaced with NCRs (&#codepoint;). 
+fn encodeQueryString(allocator: Allocator, query: []const u8, encoding: []const u8) ![]const u8 { + // For UTF-8, use standard percent encoding + if (std.mem.eql(u8, encoding, "UTF-8")) { + return percentEncodeSegment(allocator, query, .query); + } + + // For legacy encodings, first encode to the target charset with NCR fallback + const enc_info = h5e.encoding_for_label(encoding.ptr, encoding.len); + if (!enc_info.isValid()) { + // Unknown encoding, fall back to UTF-8 + return percentEncodeSegment(allocator, query, .query); + } + + // Calculate max buffer size for encoded output + const max_encoded_len = h5e.encoding_max_encode_buffer_length(enc_info.handle.?, query.len); + if (max_encoded_len == 0) { + return percentEncodeSegment(allocator, query, .query); + } + + const encode_buf = try allocator.alloc(u8, max_encoded_len); + defer allocator.free(encode_buf); + + // Encode UTF-8 to legacy encoding with NCR fallback + const result = h5e.encoding_encode_with_ncr( + enc_info.handle.?, + query.ptr, + query.len, + encode_buf.ptr, + encode_buf.len, + ); + + if (!result.isSuccess()) { + // Encoding failed, fall back to UTF-8 + return percentEncodeSegment(allocator, query, .query); + } + + // Now percent-encode the result using query_legacy to preserve NCRs + const encoded_bytes = encode_buf[0..result.bytes_written]; + return percentEncodeSegment(allocator, encoded_bytes, .query_legacy); +} + fn shouldPercentEncode(c: u8, comptime encode_set: EncodeSet) bool { return switch (c) { // Unreserved characters (RFC 3986) 'A'...'Z', 'a'...'z', '0'...'9', '-', '.', '_', '~' => false, - // sub-delims allowed in path/query but some must be encoded in userinfo - '!', '$', '&', '\'', '(', ')', '*', '+', ',' => false, - ';', '=' => encode_set == .userinfo, + // sub-delims allowed in path/query but some must be encoded in userinfo/query_legacy + '!', '$', '\'', '(', ')', '*', '+', ',' => false, + // '&' and ';' must be encoded for legacy encoding (to preserve NCRs like &#nnnnn;) + 
'&', ';' => encode_set == .userinfo or encode_set == .query_legacy, + '=' => encode_set == .userinfo, // Separators: userinfo must encode these '/', ':', '@' => encode_set == .userinfo, // '?' is allowed in queries only - '?' => encode_set != .query, + '?' => encode_set != .query and encode_set != .query_legacy, // '#' is allowed in fragments only '#' => encode_set != .fragment, // Everything else needs encoding (including space) @@ -1130,7 +1179,7 @@ test "URL: ensureEncoded" { }; for (cases) |case| { - const result = try ensureEncoded(testing.arena_allocator, case.url); + const result = try ensureEncoded(testing.arena_allocator, case.url, "UTF-8"); try testing.expectString(case.expected, result); } } @@ -1296,7 +1345,7 @@ test "URL: resolve with encoding" { }; for (cases) |case| { - const result = try resolve(testing.arena_allocator, case.base, case.path, .{ .encode = true }); + const result = try resolve(testing.arena_allocator, case.base, case.path, .{ .encoding = "UTF-8" }); try testing.expectString(case.expected, result); } } diff --git a/src/browser/interactive.zig b/src/browser/interactive.zig index a0b4528a..225633c7 100644 --- a/src/browser/interactive.zig +++ b/src/browser/interactive.zig @@ -182,7 +182,7 @@ pub fn collectInteractiveElements( .id = el.getAttributeSafe(comptime .wrap("id")), .class = el.getAttributeSafe(comptime .wrap("class")), .href = if (el.getAttributeSafe(comptime .wrap("href"))) |href| - URL.resolve(arena, page.base(), href, .{ .encode = true }) catch href + URL.resolve(arena, page.base(), href, .{ .encoding = page.charset }) catch href else null, .input_type = getInputType(el), diff --git a/src/browser/markdown.zig b/src/browser/markdown.zig index 5a83dfdc..437dbee6 100644 --- a/src/browser/markdown.zig +++ b/src/browser/markdown.zig @@ -278,7 +278,8 @@ const Context = struct { } try self.writer.writeAll("]("); if (el.getAttributeSafe(comptime .wrap("src"))) |src| { - const absolute_src = URL.resolve(self.page.call_arena, 
self.page.base(), src, .{ .encode = true }) catch src; + const page = self.page; + const absolute_src = URL.resolve(page.call_arena, page.base(), src, .{ .encoding = page.charset }) catch src; try self.writer.writeAll(absolute_src); } try self.writer.writeAll(")"); @@ -286,13 +287,14 @@ const Context = struct { return; }, .anchor => { + const page = self.page; const info = analyzeContent(el.asNode()); const label = getAnchorLabel(el); const href_raw = el.getAttributeSafe(comptime .wrap("href")); if (!info.has_visible and label == null and href_raw == null) return; - const href = if (href_raw) |h| URL.resolve(self.page.call_arena, self.page.base(), h, .{ .encode = true }) catch h else null; + const href = if (href_raw) |h| URL.resolve(page.call_arena, page.base(), h, .{ .encoding = page.charset }) catch h else null; if (info.has_block) { try self.renderChildren(el.asNode()); diff --git a/src/browser/parser/html5ever.zig b/src/browser/parser/html5ever.zig index cb673789..829ac429 100644 --- a/src/browser/parser/html5ever.zig +++ b/src/browser/parser/html5ever.zig @@ -278,3 +278,27 @@ pub extern "c" fn encoding_decoder_decode( ) DecodeResult; pub extern "c" fn encoding_decoder_free(decoder: *anyopaque) void; + +// Encoding API (UTF-8 to legacy encoding with NCR fallback) +pub const EncodeResult = extern struct { + status: u8, + bytes_read: usize, + bytes_written: usize, + + pub fn isSuccess(self: *const EncodeResult) bool { + return self.status == 0; + } +}; + +pub extern "c" fn encoding_encode_with_ncr( + handle: *anyopaque, + input: ?[*]const u8, + input_len: usize, + output: [*]u8, + output_capacity: usize, +) EncodeResult; + +pub extern "c" fn encoding_max_encode_buffer_length( + handle: *anyopaque, + input_len: usize, +) usize; diff --git a/src/browser/structured_data.zig b/src/browser/structured_data.zig index 9b6e7fbe..cad1d9d8 100644 --- a/src/browser/structured_data.zig +++ b/src/browser/structured_data.zig @@ -288,7 +288,7 @@ fn collectLink( ) !void { const 
rel = el.getAttributeSafe(comptime .wrap("rel")) orelse return; const raw_href = el.getAttributeSafe(comptime .wrap("href")) orelse return; - const href = URL.resolve(arena, page.base(), raw_href, .{ .encode = true }) catch raw_href; + const href = URL.resolve(arena, page.base(), raw_href, .{ .encoding = page.charset }) catch raw_href; if (std.ascii.eqlIgnoreCase(rel, "alternate")) { try alternate.append(arena, .{ diff --git a/src/browser/tests/page/encoding.html b/src/browser/tests/page/encoding.html index 19e0134f..b740a465 100644 --- a/src/browser/tests/page/encoding.html +++ b/src/browser/tests/page/encoding.html @@ -77,3 +77,32 @@ }); } + + diff --git a/src/browser/webapi/Node.zig b/src/browser/webapi/Node.zig index 0e7c2ffe..5871abee 100644 --- a/src/browser/webapi/Node.zig +++ b/src/browser/webapi/Node.zig @@ -22,6 +22,7 @@ const String = @import("../../string.zig").String; const js = @import("../js/js.zig"); const Page = @import("../Page.zig"); +const URL = @import("../URL.zig"); const reflect = @import("../reflect.zig"); const EventTarget = @import("EventTarget.zig"); @@ -511,6 +512,18 @@ pub fn ownerPage(self: *const Node, default: *Page) *Page { return doc._page orelse default; } +pub const ResolveURLOpts = struct { + allocator: ?Allocator = null, +}; + +// Resolve a URL relative to this node's owning document. +// Uses the document's charset for query string encoding (with NCR fallback for unmappable chars). 
+pub fn resolveURL(self: *const Node, url: anytype, page: *Page, opts: ResolveURLOpts) ![:0]const u8 { + const owner_page = self.ownerPage(page); + const allocator = opts.allocator orelse page.call_arena; + return URL.resolve(allocator, owner_page.base(), url, .{ .encoding = owner_page.charset }); +} + pub fn isSameDocumentAs(self: *const Node, other: *const Node, page: *const Page) bool { // Get the root document for each node const self_doc = if (self._type == .document) self._type.document else self.ownerDocument(page); diff --git a/src/browser/webapi/element/html/Anchor.zig b/src/browser/webapi/element/html/Anchor.zig index 33c8bded..e4207e84 100644 --- a/src/browser/webapi/element/html/Anchor.zig +++ b/src/browser/webapi/element/html/Anchor.zig @@ -39,12 +39,11 @@ pub fn asNode(self: *Anchor) *Node { } pub fn getHref(self: *Anchor, page: *Page) ![]const u8 { - const element = self.asElement(); - const href = element.getAttributeSafe(comptime .wrap("href")) orelse return ""; + const href = self.asElement().getAttributeSafe(comptime .wrap("href")) orelse return ""; if (href.len == 0) { return ""; } - return URL.resolve(page.call_arena, page.base(), href, .{ .encode = true }); + return self.asNode().resolveURL(href, page, .{}); } pub fn setHref(self: *Anchor, value: []const u8, page: *Page) !void { @@ -203,7 +202,7 @@ fn getResolvedHref(self: *Anchor, page: *Page) !?[:0]const u8 { if (href.len == 0) { return null; } - return try URL.resolve(page.call_arena, page.base(), href, .{}); + return try self.asNode().resolveURL(href, page, .{}); } pub const JsApi = struct { diff --git a/src/browser/webapi/element/html/Form.zig b/src/browser/webapi/element/html/Form.zig index e8857e48..6628306b 100644 --- a/src/browser/webapi/element/html/Form.zig +++ b/src/browser/webapi/element/html/Form.zig @@ -97,7 +97,7 @@ pub fn getAction(self: *Form, page: *Page) ![]const u8 { if (action.len == 0) { return page.url; } - return URL.resolve(page.call_arena, page.base(), action, .{ 
.encode = true }); + return element.asNode().resolveURL(action, page, .{}); } pub fn setAction(self: *Form, value: []const u8, page: *Page) !void { diff --git a/src/browser/webapi/element/html/IFrame.zig b/src/browser/webapi/element/html/IFrame.zig index e596f4ac..3b276dcd 100644 --- a/src/browser/webapi/element/html/IFrame.zig +++ b/src/browser/webapi/element/html/IFrame.zig @@ -48,9 +48,9 @@ pub fn getContentDocument(self: *const IFrame) ?*Document { return window._document; } -pub fn getSrc(self: *const IFrame, page: *Page) ![:0]const u8 { +pub fn getSrc(self: *IFrame, page: *Page) ![:0]const u8 { if (self._src.len == 0) return ""; - return try URL.resolve(page.call_arena, page.base(), self._src, .{ .encode = true }); + return self.asNode().resolveURL(self._src, page, .{}); } pub fn setSrc(self: *IFrame, src: []const u8, page: *Page) !void { diff --git a/src/browser/webapi/element/html/Image.zig b/src/browser/webapi/element/html/Image.zig index b6731144..e3b57cd9 100644 --- a/src/browser/webapi/element/html/Image.zig +++ b/src/browser/webapi/element/html/Image.zig @@ -40,9 +40,7 @@ pub fn getSrc(self: *const Image, page: *Page) ![]const u8 { if (src.len == 0) { return ""; } - - // Always resolve the src against the page URL - return URL.resolve(page.call_arena, page.base(), src, .{ .encode = true }); + return element.asConstNode().resolveURL(src, page, .{}); } pub fn setSrc(self: *Image, value: []const u8, page: *Page) !void { diff --git a/src/browser/webapi/element/html/Link.zig b/src/browser/webapi/element/html/Link.zig index ed3839f2..5b6ce0c6 100644 --- a/src/browser/webapi/element/html/Link.zig +++ b/src/browser/webapi/element/html/Link.zig @@ -44,9 +44,7 @@ pub fn getHref(self: *Link, page: *Page) ![]const u8 { if (href.len == 0) { return ""; } - - // Always resolve the href against the page URL - return URL.resolve(page.call_arena, page.base(), href, .{ .encode = true }); + return element.asNode().resolveURL(href, page, .{}); } pub fn setHref(self: *Link, 
value: []const u8, page: *Page) !void { diff --git a/src/browser/webapi/element/html/Media.zig b/src/browser/webapi/element/html/Media.zig index 71013e71..6d62013f 100644 --- a/src/browser/webapi/element/html/Media.zig +++ b/src/browser/webapi/element/html/Media.zig @@ -235,8 +235,7 @@ pub fn getSrc(self: *const Media, page: *Page) ![]const u8 { if (src.len == 0) { return ""; } - const URL = @import("../../URL.zig"); - return URL.resolve(page.call_arena, page.base(), src, .{ .encode = true }); + return element.asConstNode().resolveURL(src, page, .{}); } pub fn setSrc(self: *Media, value: []const u8, page: *Page) !void { diff --git a/src/browser/webapi/element/html/Script.zig b/src/browser/webapi/element/html/Script.zig index d5e83b4f..77b6b7ef 100644 --- a/src/browser/webapi/element/html/Script.zig +++ b/src/browser/webapi/element/html/Script.zig @@ -45,9 +45,9 @@ pub fn asNode(self: *Script) *Node { return self.asElement().asNode(); } -pub fn getSrc(self: *const Script, page: *Page) ![]const u8 { +pub fn getSrc(self: *Script, page: *Page) ![]const u8 { if (self._src.len == 0) return ""; - return try URL.resolve(page.call_arena, page.base(), self._src, .{ .encode = true }); + return self.asNode().resolveURL(self._src, page, .{}); } pub fn setSrc(self: *Script, src: []const u8, page: *Page) !void { diff --git a/src/browser/webapi/element/html/Video.zig b/src/browser/webapi/element/html/Video.zig index 63ccda4a..8fabb3ae 100644 --- a/src/browser/webapi/element/html/Video.zig +++ b/src/browser/webapi/element/html/Video.zig @@ -57,9 +57,7 @@ pub fn getPoster(self: *const Video, page: *Page) ![]const u8 { if (poster.len == 0) { return ""; } - - const URL = @import("../../URL.zig"); - return URL.resolve(page.call_arena, page.base(), poster, .{ .encode = true }); + return element.asConstNode().resolveURL(poster, page, .{}); } pub fn setPoster(self: *Video, value: []const u8, page: *Page) !void { diff --git a/src/browser/webapi/net/WebSocket.zig 
b/src/browser/webapi/net/WebSocket.zig index 1244a61e..c5228627 100644 --- a/src/browser/webapi/net/WebSocket.zig +++ b/src/browser/webapi/net/WebSocket.zig @@ -108,7 +108,7 @@ pub fn init(url: []const u8, protocols_: ?[]const u8, page: *Page) !*WebSocket { const arena = try page.getArena(.{ .debug = "WebSocket" }); errdefer page.releaseArena(arena); - const resolved_url = try URL.resolve(arena, page.base(), url, .{ .always_dupe = true, .encode = true }); + const resolved_url = try URL.resolve(arena, page.base(), url, .{ .always_dupe = true, .encoding = page.charset }); const http_client = page._session.browser.http_client; const conn = http_client.network.newConnection() orelse { diff --git a/src/browser/webapi/net/XMLHttpRequest.zig b/src/browser/webapi/net/XMLHttpRequest.zig index 62e05a17..8a56d370 100644 --- a/src/browser/webapi/net/XMLHttpRequest.zig +++ b/src/browser/webapi/net/XMLHttpRequest.zig @@ -210,7 +210,7 @@ pub fn open(self: *XMLHttpRequest, method_: []const u8, url: [:0]const u8) !void const page = self._page; self._method = try parseMethod(method_); - self._url = try URL.resolve(self._arena, page.base(), url, .{ .always_dupe = true, .encode = true }); + self._url = try URL.resolve(self._arena, page.base(), url, .{ .always_dupe = true, .encoding = page.charset }); try self.stateChanged(.opened, page); } diff --git a/src/cdp/domains/page.zig b/src/cdp/domains/page.zig index 267cada8..beb86c6b 100644 --- a/src/cdp/domains/page.zig +++ b/src/cdp/domains/page.zig @@ -286,7 +286,7 @@ fn navigate(cmd: *CDP.Command) !void { page = try session.replacePage(); } - const encoded_url = try URL.ensureEncoded(page.call_arena, params.url); + const encoded_url = try URL.ensureEncoded(page.call_arena, params.url, "UTF-8"); try page.navigate(encoded_url, .{ .reason = .address_bar, .cdp_id = cmd.input.id, diff --git a/src/cdp/domains/target.zig b/src/cdp/domains/target.zig index bce7e00d..822659f7 100644 --- a/src/cdp/domains/target.zig +++ 
b/src/cdp/domains/target.zig @@ -220,7 +220,7 @@ fn createTarget(cmd: *CDP.Command) !void { } if (!std.mem.eql(u8, "about:blank", params.url)) { - const encoded_url = try URL.ensureEncoded(page.call_arena, params.url); + const encoded_url = try URL.ensureEncoded(page.call_arena, params.url, "UTF-8"); try page.navigate( encoded_url, .{ .reason = .address_bar, .kind = .{ .push = null } }, diff --git a/src/html5ever/lib.rs b/src/html5ever/lib.rs index c684c039..9d14e784 100644 --- a/src/html5ever/lib.rs +++ b/src/html5ever/lib.rs @@ -334,6 +334,120 @@ pub extern "C" fn encoding_decoder_free(decoder_ptr: *mut c_void) { } } +// === Encoding API (UTF-8 to legacy encoding with NCR fallback) === + +/// Result of encoding operation +#[repr(C)] +pub struct EncodeResult { + /// 0 = success, 1 = output buffer too small + pub status: u8, + /// Number of input bytes consumed + pub bytes_read: usize, + /// Number of bytes written to output buffer + pub bytes_written: usize, +} + +/// Encode UTF-8 to a legacy encoding, replacing unencodable characters with +/// HTML decimal numeric character references (&#codepoint;). +/// +/// This is used for URL query string encoding per WHATWG URL spec. +/// encoding_rs's encode_from_utf8 already produces NCRs for unmappable chars. 
+#[no_mangle]
+pub extern "C" fn encoding_encode_with_ncr(
+    handle: *const c_void,
+    input: *const c_uchar,
+    input_len: usize,
+    output: *mut c_uchar,
+    output_capacity: usize,
+) -> EncodeResult {
+    // Status 1 with zero counts is also used for unusable arguments: a null
+    // handle, a null output buffer, or input that is not valid UTF-8.
+    if handle.is_null() || output.is_null() {
+        return EncodeResult {
+            status: 1,
+            bytes_read: 0,
+            bytes_written: 0,
+        };
+    }
+
+    let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) };
+
+    // A null pointer or a zero length both mean "empty input".
+    let input_str = if input.is_null() || input_len == 0 {
+        ""
+    } else {
+        let bytes = unsafe { std::slice::from_raw_parts(input, input_len) };
+        match std::str::from_utf8(bytes) {
+            Ok(s) => s,
+            Err(_) => {
+                return EncodeResult {
+                    status: 1,
+                    bytes_read: 0,
+                    bytes_written: 0,
+                };
+            }
+        }
+    };
+
+    let output_slice = unsafe { std::slice::from_raw_parts_mut(output, output_capacity) };
+
+    // For UTF-8 encoding, just copy directly (no NCR needed).
+    // All lengths are taken from input_str rather than input_len: a null input
+    // with a non-zero input_len was mapped to "" above, and copy_from_slice
+    // panics when source and destination lengths differ, which must not happen
+    // inside an extern "C" function.
+    if encoding == encoding_rs::UTF_8 {
+        let src = input_str.as_bytes();
+        if src.len() > output_capacity {
+            return EncodeResult {
+                bytes_read: 0,
+                bytes_written: 0,
+                status: 1,
+            };
+        }
+        output_slice[..src.len()].copy_from_slice(src);
+        return EncodeResult {
+            bytes_read: src.len(),
+            bytes_written: src.len(),
+            status: 0,
+        };
+    }
+
+    let mut encoder = encoding.new_encoder();
+
+    // encode_from_utf8 automatically produces NCRs for unmappable characters.
+    // `last = true`: the whole input is supplied in this single call.
+    let (result, bytes_read, bytes_written, _had_unmappables) =
+        encoder.encode_from_utf8(input_str, output_slice, true);
+
+    match result {
+        encoding_rs::CoderResult::InputEmpty => EncodeResult {
+            bytes_read,
+            bytes_written,
+            status: 0,
+        },
+        // The output buffer filled up before all input was consumed.
+        encoding_rs::CoderResult::OutputFull => EncodeResult {
+            bytes_read,
+            bytes_written,
+            status: 1,
+        },
+    }
+}
+
+/// Calculate the output buffer size to allocate for encoding with NCR fallback.
+/// A single NCR is at most 10 bytes (&#1114111;). Returns 0 when no size can be given.
+#[no_mangle]
+pub extern "C" fn encoding_max_encode_buffer_length(
+    handle: *const c_void,
+    input_len: usize,
+) -> usize {
+    if handle.is_null() {
+        return 0;
+    }
+    let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) };
+    let encoder = encoding.new_encoder();
+    // NCR allowance: at most one reference per input byte, each at most 10 bytes
+    // ("&#1114111;"). Saturating, so a huge input_len cannot overflow or panic
+    // inside this extern "C" function.
+    let ncr_allowance = input_len.saturating_mul(10);
+    // The encoder's bound only covers input without unmappable characters, so the
+    // allowance is added on top; None (usize overflow) is reported as 0.
+    encoder
+        .max_buffer_length_from_utf8_if_no_unmappables(input_len)
+        .map_or(0, |len| len.saturating_add(ncr_allowance))
+}
+
 #[no_mangle]
 pub extern "C" fn html5ever_parse_fragment(
     html: *mut c_uchar,
diff --git a/src/lightpanda.zig b/src/lightpanda.zig
index 4d6c23fb..b0356e93 100644
--- a/src/lightpanda.zig
+++ b/src/lightpanda.zig
@@ -107,7 +107,7 @@ pub fn fetch(app: *App, url: [:0]const u8, opts: FetchOpts) !void {
     //     }
     // }
 
-    const encoded_url = try URL.ensureEncoded(page.call_arena, url);
+    const encoded_url = try URL.ensureEncoded(page.call_arena, url, "UTF-8");
     _ = try page.navigate(encoded_url, .{
         .reason = .address_bar,
         .kind = .{ .push = null },

From a5bf1f07afdb6b1d2bcff15702022d0a297144e4 Mon Sep 17 00:00:00 2001
From: Karl Seguin
Date: Fri, 10 Apr 2026 15:09:32 +0800
Subject: [PATCH 4/5] chore: trigger CI

From 7c6624014683e845708db81b45e77b2930e6cb56 Mon Sep 17 00:00:00 2001
From: Karl Seguin
Date: Fri, 10 Apr 2026 15:41:38 +0800
Subject: [PATCH 5/5] chore: trigger CI