From 763927c352c6d7ee1224c6faed34b1f282364408 Mon Sep 17 00:00:00 2001 From: Karl Seguin Date: Wed, 8 Apr 2026 13:12:02 +0800 Subject: [PATCH] Use encoding_rs on non-UTF-8 html to convert to utf-8 Using our existing MIME type detection, this uses encoding_rs to convert non- UTF-8 content to UTF-8, which can then be passed to html5ever. Issue: https://github.com/lightpanda-io/browser/issues/2089 --- src/TestHTTPServer.zig | 4 + src/browser/Page.zig | 31 ++++++-- src/browser/parser/Parser.zig | 26 +++++++ src/browser/parser/html5ever.zig | 24 ++++++ src/browser/tests/page/encoding.html | 75 +++++++++++++++++++ .../page/encoding/content_type.GB2312.html | 4 + src/browser/tests/page/encoding/gbk.html | 4 + src/browser/tests/page/encoding/latin1.html | 4 + .../tests/page/encoding/no_charset.html | 4 + .../tests/page/encoding/shift_jis.html | 4 + src/html5ever/Cargo.lock | 10 +++ src/html5ever/Cargo.toml | 1 + src/html5ever/lib.rs | 66 ++++++++++++++++ 13 files changed, 251 insertions(+), 6 deletions(-) create mode 100644 src/browser/tests/page/encoding.html create mode 100644 src/browser/tests/page/encoding/content_type.GB2312.html create mode 100644 src/browser/tests/page/encoding/gbk.html create mode 100644 src/browser/tests/page/encoding/latin1.html create mode 100644 src/browser/tests/page/encoding/no_charset.html create mode 100644 src/browser/tests/page/encoding/shift_jis.html diff --git a/src/TestHTTPServer.zig b/src/TestHTTPServer.zig index 21d9fa78..44736db1 100644 --- a/src/TestHTTPServer.zig +++ b/src/TestHTTPServer.zig @@ -131,6 +131,10 @@ fn getContentType(file_path: []const u8) []const u8 { return "application/json"; } + if (std.mem.endsWith(u8, file_path, ".GB2312.html")) { + return "text/html; charset=GB2312"; + } + if (std.mem.endsWith(u8, file_path, ".html")) { return "text/html"; } diff --git a/src/browser/Page.zig b/src/browser/Page.zig index 16a05806..7c60a573 100644 --- a/src/browser/Page.zig +++ b/src/browser/Page.zig @@ -37,6 +37,7 @@ const 
ScriptManager = @import("ScriptManager.zig"); const StyleManager = @import("StyleManager.zig"); const Parser = @import("parser/Parser.zig"); +const h5e = @import("parser/html5ever.zig"); const URL = @import("URL.zig"); const Blob = @import("webapi/Blob.zig"); @@ -960,7 +961,11 @@ fn pageDataCallback(response: HttpClient.Response, data: []const u8) !void { } switch (mime.content_type) { - .text_html => self._parse_state = .{ .html = .{} }, + .text_html => { + self._parse_state = .{ .html = .{ + .mime = mime, + } }; + }, .application_json, .text_javascript, .text_css, .text_plain => { var arr: std.ArrayList(u8) = .empty; try arr.appendSlice(self.arena, "
");
@@ -974,7 +979,7 @@ fn pageDataCallback(response: HttpClient.Response, data: []const u8) !void {
     }
 
     switch (self._parse_state) {
-        .html => |*buf| try buf.appendSlice(self.arena, data),
+        .html => |*html| try html.buf.appendSlice(self.arena, data),
         .text => |*buf| {
             // we have to escape the data...
             var v = data;
@@ -1023,8 +1028,13 @@ fn pageDoneCallback(ctx: *anyopaque) !void {
     var parser = Parser.init(parse_arena, self.document.asNode(), self);
 
     switch (self._parse_state) {
-        .html => |buf| {
-            parser.parse(buf.items);
+        .html => |*html_state| {
+            const raw_html = html_state.buf.items;
+            if (html_state.needsEncodingConversion()) {
+                parser.parseWithEncoding(raw_html, html_state.mime.charsetString());
+            } else {
+                parser.parse(raw_html);
+            }
             self._script_manager.staticScriptsDone();
             self._parse_state = .complete;
         },
@@ -1091,7 +1101,6 @@ fn pageErrorCallback(ctx: *anyopaque, err: anyerror) void {
         return;
     };
 }
-
 pub fn isGoingAway(self: *const Page) bool {
     if (self._queued_navigation != null) {
         return true;
@@ -3155,11 +3164,21 @@ const ParseState = union(enum) {
     pre,
     complete,
     err: anyerror,
-    html: std.ArrayList(u8),
+    html: Html,
     text: std.ArrayList(u8),
     image: std.ArrayList(u8),
     raw: std.ArrayList(u8),
     raw_done: []const u8,
+
+    const Html = struct {
+        mime: Mime,
+        buf: std.ArrayList(u8) = .empty,
+        /// Returns false only when the MIME charset string is "utf-8" or "utf8" (ASCII case-insensitive).
+        fn needsEncodingConversion(self: *const Html) bool {
+            const label = self.mime.charsetString();
+            return !(std.ascii.eqlIgnoreCase(label, "utf-8") or std.ascii.eqlIgnoreCase(label, "utf8"));
+        }
+    };
 };
 
 const LoadState = enum {
diff --git a/src/browser/parser/Parser.zig b/src/browser/parser/Parser.zig
index f259bdd5..0c06cbcc 100644
--- a/src/browser/parser/Parser.zig
+++ b/src/browser/parser/Parser.zig
@@ -103,6 +103,32 @@ pub fn parse(self: *Parser, html: []const u8) void {
     );
 }
 
+/// Parses `html` through html5ever's encoding-aware entry point, which converts from `charset` to UTF-8 before parsing; both slices are passed as pointer + length and `&self.container` as the document.
+pub fn parseWithEncoding(self: *Parser, html: []const u8, charset: []const u8) void {
+    h5e.html5ever_parse_document_with_encoding(
+        html.ptr,
+        html.len,
+        charset.ptr,
+        charset.len,
+        &self.container,
+        self,
+        createElementCallback,
+        getDataCallback,
+        appendCallback,
+        parseErrorCallback,
+        popCallback,
+        createCommentCallback,
+        createProcessingInstruction,
+        appendDoctypeToDocument,
+        addAttrsIfMissingCallback,
+        getTemplateContentsCallback,
+        removeFromParentCallback,
+        reparentChildrenCallback,
+        appendBeforeSiblingCallback,
+        appendBasedOnParentNodeCallback,
+    );
+}
+
 pub fn parseXML(self: *Parser, xml: []const u8) void {
     h5e.xml5ever_parse_document(
         xml.ptr,
diff --git a/src/browser/parser/html5ever.zig b/src/browser/parser/html5ever.zig
index 8ee873e2..f6f81583 100644
--- a/src/browser/parser/html5ever.zig
+++ b/src/browser/parser/html5ever.zig
@@ -39,6 +39,30 @@ pub extern "c" fn html5ever_parse_document(
     appendBasedOnParentNodeCallback: *const fn (ctx: *anyopaque, element_ref: *anyopaque, prev_element_ref: *anyopaque, NodeOrText) callconv(.c) void,
 ) void;
 
+/// C-ABI binding for the encoding-aware document parser: takes the raw bytes (`html`/`len`) and an encoding label (`charset`/`charset_len`), converts from that charset to UTF-8, then parses using the supplied callbacks.
+pub extern "c" fn html5ever_parse_document_with_encoding(
+    html: [*c]const u8,
+    len: usize,
+    charset: [*c]const u8,
+    charset_len: usize,
+    doc: *anyopaque,
+    ctx: *anyopaque,
+    createElementCallback: *const fn (ctx: *anyopaque, data: *anyopaque, QualName, AttributeIterator) callconv(.c) ?*anyopaque,
+    elemNameCallback: *const fn (node_ref: *anyopaque) callconv(.c) *anyopaque,
+    appendCallback: *const fn (ctx: *anyopaque, parent_ref: *anyopaque, NodeOrText) callconv(.c) void,
+    parseErrorCallback: *const fn (ctx: *anyopaque, StringSlice) callconv(.c) void,
+    popCallback: *const fn (ctx: *anyopaque, node_ref: *anyopaque) callconv(.c) void,
+    createCommentCallback: *const fn (ctx: *anyopaque, StringSlice) callconv(.c) ?*anyopaque,
+    createProcessingInstruction: *const fn (ctx: *anyopaque, StringSlice, StringSlice) callconv(.c) ?*anyopaque,
+    appendDoctypeToDocument: *const fn (ctx: *anyopaque, StringSlice, StringSlice, StringSlice) callconv(.c) void,
+    addAttrsIfMissingCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque, AttributeIterator) callconv(.c) void,
+    getTemplateContentsCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque) callconv(.c) ?*anyopaque,
+    removeFromParentCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque) callconv(.c) void,
+    reparentChildrenCallback: *const fn (ctx: *anyopaque, node_ref: *anyopaque, new_parent_ref: *anyopaque) callconv(.c) void,
+    appendBeforeSiblingCallback: *const fn (ctx: *anyopaque, sibling_ref: *anyopaque, NodeOrText) callconv(.c) void,
+    appendBasedOnParentNodeCallback: *const fn (ctx: *anyopaque, element_ref: *anyopaque, prev_element_ref: *anyopaque, NodeOrText) callconv(.c) void,
+) void;
+
 pub extern "c" fn html5ever_parse_fragment(
     html: [*c]const u8,
     len: usize,
diff --git a/src/browser/tests/page/encoding.html b/src/browser/tests/page/encoding.html
new file mode 100644
index 00000000..af532b82
--- /dev/null
+++ b/src/browser/tests/page/encoding.html
@@ -0,0 +1,75 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/browser/tests/page/encoding/content_type.GB2312.html b/src/browser/tests/page/encoding/content_type.GB2312.html
new file mode 100644
index 00000000..818566a8
--- /dev/null
+++ b/src/browser/tests/page/encoding/content_type.GB2312.html
@@ -0,0 +1,4 @@
+
+
+
ÖÐÎÄ
+ diff --git a/src/browser/tests/page/encoding/gbk.html b/src/browser/tests/page/encoding/gbk.html new file mode 100644 index 00000000..68a6106a --- /dev/null +++ b/src/browser/tests/page/encoding/gbk.html @@ -0,0 +1,4 @@ + + +
ÖÐÎÄ
+ diff --git a/src/browser/tests/page/encoding/latin1.html b/src/browser/tests/page/encoding/latin1.html new file mode 100644 index 00000000..be7d46ee --- /dev/null +++ b/src/browser/tests/page/encoding/latin1.html @@ -0,0 +1,4 @@ + + +
Café
+ diff --git a/src/browser/tests/page/encoding/no_charset.html b/src/browser/tests/page/encoding/no_charset.html new file mode 100644 index 00000000..818566a8 --- /dev/null +++ b/src/browser/tests/page/encoding/no_charset.html @@ -0,0 +1,4 @@ + + +
ÖÐÎÄ
+ diff --git a/src/browser/tests/page/encoding/shift_jis.html b/src/browser/tests/page/encoding/shift_jis.html new file mode 100644 index 00000000..2984bb6c --- /dev/null +++ b/src/browser/tests/page/encoding/shift_jis.html @@ -0,0 +1,4 @@ + + +
“ú–{Œê
+ diff --git a/src/html5ever/Cargo.lock b/src/html5ever/Cargo.lock index d94a7fd7..5faadb8e 100644 --- a/src/html5ever/Cargo.lock +++ b/src/html5ever/Cargo.lock @@ -30,6 +30,15 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + [[package]] name = "find-msvc-tools" version = "0.1.2" @@ -67,6 +76,7 @@ checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" name = "litefetch-html5ever" version = "0.1.0" dependencies = [ + "encoding_rs", "html5ever", "string_cache 0.9.0", "tikv-jemalloc-ctl", diff --git a/src/html5ever/Cargo.toml b/src/html5ever/Cargo.toml index b4004404..b82ac73b 100644 --- a/src/html5ever/Cargo.toml +++ b/src/html5ever/Cargo.toml @@ -15,6 +15,7 @@ typed-arena = "2.0.2" tikv-jemallocator = {version = "0.6.0", features = ["stats"]} tikv-jemalloc-ctl = {version = "0.6.0", features = ["stats"]} xml5ever = "0.35.0" +encoding_rs = "0.8" [profile.release] lto = true diff --git a/src/html5ever/lib.rs b/src/html5ever/lib.rs index 29d62539..6fab9763 100644 --- a/src/html5ever/lib.rs +++ b/src/html5ever/lib.rs @@ -27,6 +27,7 @@ use std::cell::Cell; use std::os::raw::{c_uchar, c_void}; use types::*; +use encoding_rs::Encoding; use html5ever::interface::tree_builder::QuirksMode; use html5ever::tendril::{StrTendril, TendrilSink}; use html5ever::{ns, parse_document, parse_fragment, LocalName, ParseOpts, Parser, QualName}; @@ -85,6 +86,71 @@ pub extern "C" fn html5ever_parse_document( .one(bytes); } +/// Parse an HTML document with encoding conversion. +/// If charset is provided, converts from that encoding to UTF-8 before parsing. 
+/// Uses Cow internally so no allocation if content is already valid UTF-8. +#[no_mangle] +pub extern "C" fn html5ever_parse_document_with_encoding( + html: *mut c_uchar, + len: usize, + charset: *const c_uchar, + charset_len: usize, + document: Ref, + ctx: Ref, + create_element_callback: CreateElementCallback, + get_data_callback: GetDataCallback, + append_callback: AppendCallback, + parse_error_callback: ParseErrorCallback, + pop_callback: PopCallback, + create_comment_callback: CreateCommentCallback, + create_processing_instruction: CreateProcessingInstruction, + append_doctype_to_document: AppendDoctypeToDocumentCallback, + add_attrs_if_missing_callback: AddAttrsIfMissingCallback, + get_template_contents_callback: GetTemplateContentsCallback, + remove_from_parent_callback: RemoveFromParentCallback, + reparent_children_callback: ReparentChildrenCallback, + append_before_sibling_callback: AppendBeforeSiblingCallback, + append_based_on_parent_node_callback: AppendBasedOnParentNodeCallback, +) -> () { + if html.is_null() || len == 0 { + return (); + } + + let input = unsafe { std::slice::from_raw_parts(html, len) }; + let charset_bytes = unsafe { std::slice::from_raw_parts(charset, charset_len) }; + + // Decode to UTF-8. Returns Cow - no allocation if already valid UTF-8. 
+ let encoding = Encoding::for_label(charset_bytes).unwrap_or(encoding_rs::UTF_8); + let (decoded, _, _) = encoding.decode(input); + + let arena = typed_arena::Arena::new(); + + let sink = sink::Sink { + ctx: ctx, + arena: &arena, + document: document, + quirks_mode: Cell::new(QuirksMode::NoQuirks), + pop_callback: pop_callback, + append_callback: append_callback, + get_data_callback: get_data_callback, + parse_error_callback: parse_error_callback, + create_element_callback: create_element_callback, + create_comment_callback: create_comment_callback, + create_processing_instruction: create_processing_instruction, + append_doctype_to_document: append_doctype_to_document, + add_attrs_if_missing_callback: add_attrs_if_missing_callback, + get_template_contents_callback: get_template_contents_callback, + remove_from_parent_callback: remove_from_parent_callback, + reparent_children_callback: reparent_children_callback, + append_before_sibling_callback: append_before_sibling_callback, + append_based_on_parent_node_callback: append_based_on_parent_node_callback, + }; + + // Parse directly from decoded string + parse_document(sink, Default::default()) + .one(StrTendril::from(decoded.as_ref())); +} + #[no_mangle] pub extern "C" fn html5ever_parse_fragment( html: *mut c_uchar,