Merge pull request #2102 from lightpanda-io/non-utf8-encoding

Use encoding_rs to convert non-UTF-8 HTML to UTF-8 before parsing
This commit is contained in:
Karl Seguin
2026-04-09 07:25:48 +08:00
committed by GitHub
13 changed files with 251 additions and 6 deletions

View File

@@ -131,6 +131,10 @@ fn getContentType(file_path: []const u8) []const u8 {
return "application/json";
}
if (std.mem.endsWith(u8, file_path, ".GB2312.html")) {
return "text/html; charset=GB2312";
}
if (std.mem.endsWith(u8, file_path, ".html")) {
return "text/html";
}

View File

@@ -37,6 +37,7 @@ const ScriptManager = @import("ScriptManager.zig");
const StyleManager = @import("StyleManager.zig");
const Parser = @import("parser/Parser.zig");
const h5e = @import("parser/html5ever.zig");
const URL = @import("URL.zig");
const Blob = @import("webapi/Blob.zig");
@@ -960,7 +961,11 @@ fn pageDataCallback(response: HttpClient.Response, data: []const u8) !void {
}
switch (mime.content_type) {
.text_html => self._parse_state = .{ .html = .{} },
.text_html => {
self._parse_state = .{ .html = .{
.mime = mime,
} };
},
.application_json, .text_javascript, .text_css, .text_plain => {
var arr: std.ArrayList(u8) = .empty;
try arr.appendSlice(self.arena, "<html><head><meta charset=\"utf-8\"></head><body><pre>");
@@ -974,7 +979,7 @@ fn pageDataCallback(response: HttpClient.Response, data: []const u8) !void {
}
switch (self._parse_state) {
.html => |*buf| try buf.appendSlice(self.arena, data),
.html => |*html| try html.buf.appendSlice(self.arena, data),
.text => |*buf| {
// we have to escape the data...
var v = data;
@@ -1023,8 +1028,13 @@ fn pageDoneCallback(ctx: *anyopaque) !void {
var parser = Parser.init(parse_arena, self.document.asNode(), self);
switch (self._parse_state) {
.html => |buf| {
parser.parse(buf.items);
.html => |*html_state| {
const raw_html = html_state.buf.items;
if (html_state.needsEncodingConversion()) {
parser.parseWithEncoding(raw_html, html_state.mime.charsetString());
} else {
parser.parse(raw_html);
}
self._script_manager.staticScriptsDone();
self._parse_state = .complete;
},
@@ -1091,7 +1101,6 @@ fn pageErrorCallback(ctx: *anyopaque, err: anyerror) void {
return;
};
}
pub fn isGoingAway(self: *const Page) bool {
if (self._queued_navigation != null) {
return true;
@@ -3155,11 +3164,21 @@ const ParseState = union(enum) {
pre,
complete,
err: anyerror,
html: std.ArrayList(u8),
html: Html,
text: std.ArrayList(u8),
image: std.ArrayList(u8),
raw: std.ArrayList(u8),
raw_done: []const u8,
/// Parse state for an HTML response whose body is still being buffered.
const Html = struct {
    /// MIME information recorded when the state is created; it supplies the
    /// charset label used to decide how the buffered bytes are parsed.
    mime: Mime,
    /// Body bytes appended as data arrives.
    buf: std.ArrayList(u8) = .empty,

    /// Returns true unless the charset label is "utf-8" or "utf8"
    /// (compared ASCII case-insensitively).
    fn needsEncodingConversion(self: *const Html) bool {
        const label = self.mime.charsetString();
        for ([_][]const u8{ "utf-8", "utf8" }) |utf8_label| {
            if (std.ascii.eqlIgnoreCase(label, utf8_label)) return false;
        }
        return true;
    }
};
};
const LoadState = enum {

View File

@@ -103,6 +103,32 @@ pub fn parse(self: *Parser, html: []const u8) void {
);
}
/// Parses `html` as a complete document, converting it from `charset` to
/// UTF-8 first.
///
/// `charset` is the encoding label; it is forwarded, together with the raw
/// bytes, as pointer + length pairs to
/// `html5ever_parse_document_with_encoding`, which performs the conversion
/// and drives the tree-building callbacks registered below.
pub fn parseWithEncoding(self: *Parser, html: []const u8, charset: []const u8) void {
    const document = &self.container;
    const raw_ptr = html.ptr;
    const raw_len = html.len;
    const label_ptr = charset.ptr;
    const label_len = charset.len;

    h5e.html5ever_parse_document_with_encoding(
        // undecoded input and its encoding label
        raw_ptr,
        raw_len,
        label_ptr,
        label_len,
        // document node container and callback context
        document,
        self,
        // tree-builder callbacks, in the order the extern declaration expects
        createElementCallback,
        getDataCallback,
        appendCallback,
        parseErrorCallback,
        popCallback,
        createCommentCallback,
        createProcessingInstruction,
        appendDoctypeToDocument,
        addAttrsIfMissingCallback,
        getTemplateContentsCallback,
        removeFromParentCallback,
        reparentChildrenCallback,
        appendBeforeSiblingCallback,
        appendBasedOnParentNodeCallback,
    );
}
pub fn parseXML(self: *Parser, xml: []const u8) void {
h5e.xml5ever_parse_document(
xml.ptr,

View File

@@ -39,6 +39,30 @@ pub extern "c" fn html5ever_parse_document(
appendBasedOnParentNodeCallback: *const fn (ctx: *anyopaque, element_ref: *anyopaque, prev_element_ref: *anyopaque, NodeOrText) callconv(.c) void,
) void;
/// Parse HTML document with encoding conversion. Converts from charset to UTF-8 before parsing.
///
/// `html`/`len` are the pointer and byte length of the undecoded document.
/// `charset`/`charset_len` are the pointer and byte length of the encoding
/// label; the length is passed explicitly, so the label is presumably not
/// required to be NUL-terminated (confirm against the Rust implementation).
/// `doc` and `ctx` are opaque references handed back through the callbacks.
/// The remaining parameters are the tree-building callbacks; their order must
/// match the parameter order of the exported Rust function of the same name.
pub extern "c" fn html5ever_parse_document_with_encoding(
// Undecoded input bytes.
html: [*c]const u8,
len: usize,
// Encoding label bytes.
charset: [*c]const u8,
charset_len: usize,
// Opaque document reference and callback context.
doc: *anyopaque,
ctx: *anyopaque,
// Tree-building callbacks.
createElementCallback: *const fn (ctx: *anyopaque, data: *anyopaque, QualName, AttributeIterator) callconv(.c) ?*anyopaque,
elemNameCallback: *const fn (node_ref: *anyopaque) callconv(.c) *anyopaque,
appendCallback: *const fn (ctx: *anyopaque, parent_ref: *anyopaque, NodeOrText) callconv(.c) void,
parseErrorCallback: *const fn (ctx: *anyopaque, StringSlice) callconv(.c) void,
popCallback: *const fn (ctx: *anyopaque, node_ref: *anyopaque) callconv(.c) void,
createCommentCallback: *const fn (ctx: *anyopaque, StringSlice) callconv(.c) ?*anyopaque,
createProcessingInstruction: *const fn (ctx: *anyopaque, StringSlice, StringSlice) callconv(.c) ?*anyopaque,
appendDoctypeToDocument: *const fn (ctx: *anyopaque, StringSlice, StringSlice, StringSlice) callconv(.c) void,
addAttrsIfMissingCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque, AttributeIterator) callconv(.c) void,
getTemplateContentsCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque) callconv(.c) ?*anyopaque,
removeFromParentCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque) callconv(.c) void,
reparentChildrenCallback: *const fn (ctx: *anyopaque, node_ref: *anyopaque, new_parent_ref: *anyopaque) callconv(.c) void,
appendBeforeSiblingCallback: *const fn (ctx: *anyopaque, sibling_ref: *anyopaque, NodeOrText) callconv(.c) void,
appendBasedOnParentNodeCallback: *const fn (ctx: *anyopaque, element_ref: *anyopaque, prev_element_ref: *anyopaque, NodeOrText) callconv(.c) void,
) void;
pub extern "c" fn html5ever_parse_fragment(
html: [*c]const u8,
len: usize,

View File

@@ -0,0 +1,75 @@
<!DOCTYPE html>
<!--
  Encoding conversion tests. Each script below creates an iframe, points it at
  a fixture document, and checks the text content of the fixture's #test
  element from inside a testing.onload callback.
-->
<body></body>
<script src="../testing.js"></script>
<!-- Charset declared in the fixture via <meta charset="gbk">. -->
<script id="gbk_encoding">
{
const iframe = document.createElement('iframe');
document.body.appendChild(iframe);
iframe.src = 'encoding/gbk.html';
testing.onload(() => {
// GBK-encoded "中文" should be decoded to UTF-8
testing.expectEqual('中文', iframe.contentDocument.getElementById('test').textContent);
});
}
</script>
<!-- Charset declared in the fixture via <meta charset="shift_jis">. -->
<script id="shift_jis_encoding">
{
const iframe = document.createElement('iframe');
document.body.appendChild(iframe);
iframe.src = 'encoding/shift_jis.html';
testing.onload(() => {
// Shift_JIS-encoded "日本語" should be decoded to UTF-8
testing.expectEqual('日本語', iframe.contentDocument.getElementById('test').textContent);
});
}
</script>
<!-- Charset declared in the fixture via <meta charset="iso-8859-1">. -->
<script id="latin1_encoding">
{
const iframe = document.createElement('iframe');
document.body.appendChild(iframe);
iframe.src = 'encoding/latin1.html';
testing.onload(() => {
// ISO-8859-1-encoded "Café" should be decoded to UTF-8
testing.expectEqual('Café', iframe.contentDocument.getElementById('test').textContent);
});
}
</script>
<!-- Charset supplied only by the HTTP Content-Type header. -->
<script id="content_type_header_charset">
{
// Test charset from Content-Type HTTP header (no meta charset in file)
// TestHTTPServer returns "text/html; charset=GB2312" for *.GB2312.html files
const iframe = document.createElement('iframe');
document.body.appendChild(iframe);
// Loaded through testing.BASE_URL rather than a relative path, unlike the other cases.
iframe.src = testing.BASE_URL + 'page/encoding/content_type.GB2312.html';
testing.onload(() => {
// GB2312-encoded "中文" should be decoded to UTF-8 via Content-Type header charset
testing.expectEqual('中文', iframe.contentDocument.getElementById('test').textContent);
});
}
</script>
<!-- No charset declared anywhere: documents the fallback behaviour. -->
<script id="no_charset_fallback">
{
// Test file with non-UTF-8 bytes but NO charset declaration anywhere.
// Without charset info, the bytes are parsed as UTF-8, producing replacement characters.
// This documents the "broken" behavior for files without proper encoding declaration.
const iframe = document.createElement('iframe');
document.body.appendChild(iframe);
iframe.src = 'encoding/no_charset.html';
testing.onload(() => {
// The GBK bytes D6 D0 CE C4 are invalid UTF-8, each becomes U+FFFD
const text = iframe.contentDocument.getElementById('test').textContent;
// Should contain replacement characters (the exact count depends on how invalid bytes are handled)
// Only the presence of U+FFFD is asserted, not how many there are.
testing.expectTrue(text.includes('\uFFFD'));
});
}
</script>

View File

@@ -0,0 +1,4 @@
<html>
<head></head>
<body><div id="test"><EFBFBD><EFBFBD><EFBFBD><EFBFBD></div></body>
</html>

View File

@@ -0,0 +1,4 @@
<html>
<head><meta charset="gbk"></head>
<body><div id="test"><EFBFBD><EFBFBD><EFBFBD><EFBFBD></div></body>
</html>

View File

@@ -0,0 +1,4 @@
<html>
<head><meta charset="iso-8859-1"></head>
<body><div id="test">Caf<EFBFBD></div></body>
</html>

View File

@@ -0,0 +1,4 @@
<html>
<head></head>
<body><div id="test"><EFBFBD><EFBFBD><EFBFBD><EFBFBD></div></body>
</html>

View File

@@ -0,0 +1,4 @@
<html>
<head><meta charset="shift_jis"></head>
<body><div id="test"><EFBFBD><EFBFBD><EFBFBD>{<7B><></div></body>
</html>

View File

@@ -30,6 +30,15 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "encoding_rs"
version = "0.8.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
dependencies = [
"cfg-if",
]
[[package]]
name = "find-msvc-tools"
version = "0.1.2"
@@ -67,6 +76,7 @@ checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa"
name = "litefetch-html5ever"
version = "0.1.0"
dependencies = [
"encoding_rs",
"html5ever",
"string_cache 0.9.0",
"tikv-jemalloc-ctl",

View File

@@ -15,6 +15,7 @@ typed-arena = "2.0.2"
tikv-jemallocator = {version = "0.6.0", features = ["stats"]}
tikv-jemalloc-ctl = {version = "0.6.0", features = ["stats"]}
xml5ever = "0.35.0"
encoding_rs = "0.8"
[profile.release]
lto = true

View File

@@ -27,6 +27,7 @@ use std::cell::Cell;
use std::os::raw::{c_uchar, c_void};
use types::*;
use encoding_rs::Encoding;
use html5ever::interface::tree_builder::QuirksMode;
use html5ever::tendril::{StrTendril, TendrilSink};
use html5ever::{ns, parse_document, parse_fragment, LocalName, ParseOpts, Parser, QualName};
@@ -85,6 +86,71 @@ pub extern "C" fn html5ever_parse_document(
.one(bytes);
}
/// Parse an HTML document after converting it to UTF-8.
///
/// `html`/`len` describe the raw, still-encoded document bytes and
/// `charset`/`charset_len` the encoding label (for example `gbk` or
/// `shift_jis`). The label is resolved with `Encoding::for_label`; a null,
/// empty or unrecognised label falls back to UTF-8.
///
/// Nothing is parsed when `html` is null or `len` is zero.
///
/// `Encoding::decode` yields a `Cow<str>`, which is then copied into a
/// `StrTendril` for the parser, so the input buffer is not retained after this
/// function returns.
#[no_mangle]
pub extern "C" fn html5ever_parse_document_with_encoding(
    html: *mut c_uchar,
    len: usize,
    charset: *const c_uchar,
    charset_len: usize,
    document: Ref,
    ctx: Ref,
    create_element_callback: CreateElementCallback,
    get_data_callback: GetDataCallback,
    append_callback: AppendCallback,
    parse_error_callback: ParseErrorCallback,
    pop_callback: PopCallback,
    create_comment_callback: CreateCommentCallback,
    create_processing_instruction: CreateProcessingInstruction,
    append_doctype_to_document: AppendDoctypeToDocumentCallback,
    add_attrs_if_missing_callback: AddAttrsIfMissingCallback,
    get_template_contents_callback: GetTemplateContentsCallback,
    remove_from_parent_callback: RemoveFromParentCallback,
    reparent_children_callback: ReparentChildrenCallback,
    append_before_sibling_callback: AppendBeforeSiblingCallback,
    append_based_on_parent_node_callback: AppendBasedOnParentNodeCallback,
) -> () {
    if html.is_null() || len == 0 {
        return;
    }

    // SAFETY: `html` is non-null (checked above); the caller is responsible
    // for it pointing at `len` readable bytes.
    let input = unsafe { std::slice::from_raw_parts(html, len) };

    // `slice::from_raw_parts` requires a non-null pointer even for an empty
    // slice, so a missing label is handled without building a slice at all.
    let encoding = if charset.is_null() || charset_len == 0 {
        encoding_rs::UTF_8
    } else {
        // SAFETY: `charset` is non-null (checked above); the caller is
        // responsible for it pointing at `charset_len` readable bytes.
        let label = unsafe { std::slice::from_raw_parts(charset, charset_len) };
        Encoding::for_label(label).unwrap_or(encoding_rs::UTF_8)
    };

    // The second and third tuple members (encoding actually used, and whether
    // malformed sequences were seen) are deliberately ignored.
    let (decoded, _, _) = encoding.decode(input);

    let arena = typed_arena::Arena::new();

    let sink = sink::Sink {
        ctx,
        arena: &arena,
        document,
        quirks_mode: Cell::new(QuirksMode::NoQuirks),
        pop_callback,
        append_callback,
        get_data_callback,
        parse_error_callback,
        create_element_callback,
        create_comment_callback,
        create_processing_instruction,
        append_doctype_to_document,
        add_attrs_if_missing_callback,
        get_template_contents_callback,
        remove_from_parent_callback,
        reparent_children_callback,
        append_before_sibling_callback,
        append_based_on_parent_node_callback,
    };

    // Feed the decoded UTF-8 text to the parser in one chunk.
    parse_document(sink, Default::default()).one(StrTendril::from(decoded.as_ref()));
}
#[no_mangle]
pub extern "C" fn html5ever_parse_fragment(
html: *mut c_uchar,