mirror of
https://github.com/lightpanda-io/browser.git
synced 2026-06-11 17:46:32 -04:00
Merge pull request #2102 from lightpanda-io/non-utf8-encoding
Use encoding_rs to convert non-UTF-8 HTML to UTF-8
This commit is contained in:
@@ -131,6 +131,10 @@ fn getContentType(file_path: []const u8) []const u8 {
|
||||
return "application/json";
|
||||
}
|
||||
|
||||
if (std.mem.endsWith(u8, file_path, ".GB2312.html")) {
|
||||
return "text/html; charset=GB2312";
|
||||
}
|
||||
|
||||
if (std.mem.endsWith(u8, file_path, ".html")) {
|
||||
return "text/html";
|
||||
}
|
||||
|
||||
@@ -37,6 +37,7 @@ const ScriptManager = @import("ScriptManager.zig");
|
||||
const StyleManager = @import("StyleManager.zig");
|
||||
|
||||
const Parser = @import("parser/Parser.zig");
|
||||
const h5e = @import("parser/html5ever.zig");
|
||||
|
||||
const URL = @import("URL.zig");
|
||||
const Blob = @import("webapi/Blob.zig");
|
||||
@@ -960,7 +961,11 @@ fn pageDataCallback(response: HttpClient.Response, data: []const u8) !void {
|
||||
}
|
||||
|
||||
switch (mime.content_type) {
|
||||
.text_html => self._parse_state = .{ .html = .{} },
|
||||
.text_html => {
|
||||
self._parse_state = .{ .html = .{
|
||||
.mime = mime,
|
||||
} };
|
||||
},
|
||||
.application_json, .text_javascript, .text_css, .text_plain => {
|
||||
var arr: std.ArrayList(u8) = .empty;
|
||||
try arr.appendSlice(self.arena, "<html><head><meta charset=\"utf-8\"></head><body><pre>");
|
||||
@@ -974,7 +979,7 @@ fn pageDataCallback(response: HttpClient.Response, data: []const u8) !void {
|
||||
}
|
||||
|
||||
switch (self._parse_state) {
|
||||
.html => |*buf| try buf.appendSlice(self.arena, data),
|
||||
.html => |*html| try html.buf.appendSlice(self.arena, data),
|
||||
.text => |*buf| {
|
||||
// we have to escape the data...
|
||||
var v = data;
|
||||
@@ -1023,8 +1028,13 @@ fn pageDoneCallback(ctx: *anyopaque) !void {
|
||||
var parser = Parser.init(parse_arena, self.document.asNode(), self);
|
||||
|
||||
switch (self._parse_state) {
|
||||
.html => |buf| {
|
||||
parser.parse(buf.items);
|
||||
.html => |*html_state| {
|
||||
const raw_html = html_state.buf.items;
|
||||
if (html_state.needsEncodingConversion()) {
|
||||
parser.parseWithEncoding(raw_html, html_state.mime.charsetString());
|
||||
} else {
|
||||
parser.parse(raw_html);
|
||||
}
|
||||
self._script_manager.staticScriptsDone();
|
||||
self._parse_state = .complete;
|
||||
},
|
||||
@@ -1091,7 +1101,6 @@ fn pageErrorCallback(ctx: *anyopaque, err: anyerror) void {
|
||||
return;
|
||||
};
|
||||
}
|
||||
|
||||
pub fn isGoingAway(self: *const Page) bool {
|
||||
if (self._queued_navigation != null) {
|
||||
return true;
|
||||
@@ -3155,11 +3164,21 @@ const ParseState = union(enum) {
|
||||
pre,
|
||||
complete,
|
||||
err: anyerror,
|
||||
html: std.ArrayList(u8),
|
||||
html: Html,
|
||||
text: std.ArrayList(u8),
|
||||
image: std.ArrayList(u8),
|
||||
raw: std.ArrayList(u8),
|
||||
raw_done: []const u8,
|
||||
|
||||
/// State kept while the bytes of an HTML response are being collected.
const Html = struct {
    /// The MIME value stored when the HTML parse state is created; its
    /// charset label decides whether the body must be transcoded.
    mime: Mime,
    /// Raw response bytes accumulated so far.
    buf: std.ArrayList(u8) = .empty,

    /// Returns true unless the charset label is "utf-8" or "utf8"
    /// (compared ASCII case-insensitively).
    fn needsEncodingConversion(self: *const Html) bool {
        const label = self.mime.charsetString();
        const is_utf8 = std.ascii.eqlIgnoreCase(label, "utf-8") or
            std.ascii.eqlIgnoreCase(label, "utf8");
        return !is_utf8;
    }
};
|
||||
};
|
||||
|
||||
const LoadState = enum {
|
||||
|
||||
@@ -103,6 +103,32 @@ pub fn parse(self: *Parser, html: []const u8) void {
|
||||
);
|
||||
}
|
||||
|
||||
/// Parses `html` as a full document after converting it from the encoding
/// named by `charset` to UTF-8.
///
/// Both slices are passed to the html5ever binding as pointer/length pairs,
/// followed by this parser's container, the parser itself as the callback
/// context, and the tree-building callbacks.
pub fn parseWithEncoding(self: *Parser, html: []const u8, charset: []const u8) void {
    const source_ptr = html.ptr;
    const source_len = html.len;
    const label_ptr = charset.ptr;
    const label_len = charset.len;

    h5e.html5ever_parse_document_with_encoding(
        source_ptr,
        source_len,
        label_ptr,
        label_len,
        &self.container,
        self,
        createElementCallback,
        getDataCallback,
        appendCallback,
        parseErrorCallback,
        popCallback,
        createCommentCallback,
        createProcessingInstruction,
        appendDoctypeToDocument,
        addAttrsIfMissingCallback,
        getTemplateContentsCallback,
        removeFromParentCallback,
        reparentChildrenCallback,
        appendBeforeSiblingCallback,
        appendBasedOnParentNodeCallback,
    );
}
|
||||
|
||||
pub fn parseXML(self: *Parser, xml: []const u8) void {
|
||||
h5e.xml5ever_parse_document(
|
||||
xml.ptr,
|
||||
|
||||
@@ -39,6 +39,30 @@ pub extern "c" fn html5ever_parse_document(
|
||||
appendBasedOnParentNodeCallback: *const fn (ctx: *anyopaque, element_ref: *anyopaque, prev_element_ref: *anyopaque, NodeOrText) callconv(.c) void,
|
||||
) void;
|
||||
|
||||
/// Parses a full HTML document whose bytes are in a non-UTF-8 encoding.
///
/// `html`/`len` describe the raw document bytes and `charset`/`charset_len`
/// describe the encoding label. The Rust side decodes the bytes to UTF-8
/// before parsing; a label it does not recognise falls back to UTF-8.
/// `doc` is the document reference and `ctx` is passed back to the callbacks
/// that take a `ctx` argument.
///
/// NOTE(review): the second callback parameter is named `elemNameCallback`
/// here, while the Rust definition names the same positional argument
/// `get_data_callback` and Parser.zig passes `getDataCallback` — confirm the
/// name is intentional.
|
||||
pub extern "c" fn html5ever_parse_document_with_encoding(
|
||||
html: [*c]const u8,
|
||||
len: usize,
|
||||
charset: [*c]const u8,
|
||||
charset_len: usize,
|
||||
doc: *anyopaque,
|
||||
ctx: *anyopaque,
|
||||
createElementCallback: *const fn (ctx: *anyopaque, data: *anyopaque, QualName, AttributeIterator) callconv(.c) ?*anyopaque,
|
||||
elemNameCallback: *const fn (node_ref: *anyopaque) callconv(.c) *anyopaque,
|
||||
appendCallback: *const fn (ctx: *anyopaque, parent_ref: *anyopaque, NodeOrText) callconv(.c) void,
|
||||
parseErrorCallback: *const fn (ctx: *anyopaque, StringSlice) callconv(.c) void,
|
||||
popCallback: *const fn (ctx: *anyopaque, node_ref: *anyopaque) callconv(.c) void,
|
||||
createCommentCallback: *const fn (ctx: *anyopaque, StringSlice) callconv(.c) ?*anyopaque,
|
||||
createProcessingInstruction: *const fn (ctx: *anyopaque, StringSlice, StringSlice) callconv(.c) ?*anyopaque,
|
||||
appendDoctypeToDocument: *const fn (ctx: *anyopaque, StringSlice, StringSlice, StringSlice) callconv(.c) void,
|
||||
addAttrsIfMissingCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque, AttributeIterator) callconv(.c) void,
|
||||
getTemplateContentsCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque) callconv(.c) ?*anyopaque,
|
||||
removeFromParentCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque) callconv(.c) void,
|
||||
reparentChildrenCallback: *const fn (ctx: *anyopaque, node_ref: *anyopaque, new_parent_ref: *anyopaque) callconv(.c) void,
|
||||
appendBeforeSiblingCallback: *const fn (ctx: *anyopaque, sibling_ref: *anyopaque, NodeOrText) callconv(.c) void,
|
||||
appendBasedOnParentNodeCallback: *const fn (ctx: *anyopaque, element_ref: *anyopaque, prev_element_ref: *anyopaque, NodeOrText) callconv(.c) void,
|
||||
) void;
|
||||
|
||||
pub extern "c" fn html5ever_parse_fragment(
|
||||
html: [*c]const u8,
|
||||
len: usize,
|
||||
|
||||
75
src/browser/tests/page/encoding.html
Normal file
75
src/browser/tests/page/encoding.html
Normal file
@@ -0,0 +1,75 @@
|
||||
<!DOCTYPE html>
|
||||
<body></body>
|
||||
<script src="../testing.js"></script>
|
||||
|
||||
<script id="gbk_encoding">
|
||||
{
|
||||
const iframe = document.createElement('iframe');
|
||||
document.body.appendChild(iframe);
|
||||
iframe.src = 'encoding/gbk.html';
|
||||
|
||||
testing.onload(() => {
|
||||
// GBK-encoded "中文" should be decoded to UTF-8
|
||||
testing.expectEqual('中文', iframe.contentDocument.getElementById('test').textContent);
|
||||
});
|
||||
}
|
||||
</script>
|
||||
|
||||
<script id="shift_jis_encoding">
|
||||
{
|
||||
const iframe = document.createElement('iframe');
|
||||
document.body.appendChild(iframe);
|
||||
iframe.src = 'encoding/shift_jis.html';
|
||||
|
||||
testing.onload(() => {
|
||||
// Shift_JIS-encoded "日本語" should be decoded to UTF-8
|
||||
testing.expectEqual('日本語', iframe.contentDocument.getElementById('test').textContent);
|
||||
});
|
||||
}
|
||||
</script>
|
||||
|
||||
<script id="latin1_encoding">
|
||||
{
|
||||
const iframe = document.createElement('iframe');
|
||||
document.body.appendChild(iframe);
|
||||
iframe.src = 'encoding/latin1.html';
|
||||
|
||||
testing.onload(() => {
|
||||
// ISO-8859-1-encoded "Café" should be decoded to UTF-8
|
||||
testing.expectEqual('Café', iframe.contentDocument.getElementById('test').textContent);
|
||||
});
|
||||
}
|
||||
</script>
|
||||
|
||||
<script id="content_type_header_charset">
|
||||
{
|
||||
// Test charset from Content-Type HTTP header (no meta charset in file)
|
||||
// TestHTTPServer returns "text/html; charset=GB2312" for *.GB2312.html files
|
||||
const iframe = document.createElement('iframe');
|
||||
document.body.appendChild(iframe);
|
||||
iframe.src = testing.BASE_URL + 'page/encoding/content_type.GB2312.html';
|
||||
|
||||
testing.onload(() => {
|
||||
// GB2312-encoded "中文" should be decoded to UTF-8 via Content-Type header charset
|
||||
testing.expectEqual('中文', iframe.contentDocument.getElementById('test').textContent);
|
||||
});
|
||||
}
|
||||
</script>
|
||||
|
||||
<script id="no_charset_fallback">
|
||||
{
|
||||
// Test file with non-UTF-8 bytes but NO charset declaration anywhere.
|
||||
// Without charset info, the bytes are parsed as UTF-8, producing replacement characters.
|
||||
// This documents the "broken" behavior for files without proper encoding declaration.
|
||||
const iframe = document.createElement('iframe');
|
||||
document.body.appendChild(iframe);
|
||||
iframe.src = 'encoding/no_charset.html';
|
||||
|
||||
testing.onload(() => {
|
||||
// The GBK bytes D6 D0 CE C4 are invalid UTF-8, each becomes U+FFFD
|
||||
const text = iframe.contentDocument.getElementById('test').textContent;
|
||||
// Should contain replacement characters (the exact count depends on how invalid bytes are handled)
|
||||
testing.expectTrue(text.includes('\uFFFD'));
|
||||
});
|
||||
}
|
||||
</script>
|
||||
4
src/browser/tests/page/encoding/content_type.GB2312.html
Normal file
4
src/browser/tests/page/encoding/content_type.GB2312.html
Normal file
@@ -0,0 +1,4 @@
|
||||
<html>
|
||||
<head></head>
|
||||
<body><div id="test"><EFBFBD><EFBFBD><EFBFBD><EFBFBD></div></body>
|
||||
</html>
|
||||
4
src/browser/tests/page/encoding/gbk.html
Normal file
4
src/browser/tests/page/encoding/gbk.html
Normal file
@@ -0,0 +1,4 @@
|
||||
<html>
|
||||
<head><meta charset="gbk"></head>
|
||||
<body><div id="test"><EFBFBD><EFBFBD><EFBFBD><EFBFBD></div></body>
|
||||
</html>
|
||||
4
src/browser/tests/page/encoding/latin1.html
Normal file
4
src/browser/tests/page/encoding/latin1.html
Normal file
@@ -0,0 +1,4 @@
|
||||
<html>
|
||||
<head><meta charset="iso-8859-1"></head>
|
||||
<body><div id="test">Caf<EFBFBD></div></body>
|
||||
</html>
|
||||
4
src/browser/tests/page/encoding/no_charset.html
Normal file
4
src/browser/tests/page/encoding/no_charset.html
Normal file
@@ -0,0 +1,4 @@
|
||||
<html>
|
||||
<head></head>
|
||||
<body><div id="test"><EFBFBD><EFBFBD><EFBFBD><EFBFBD></div></body>
|
||||
</html>
|
||||
4
src/browser/tests/page/encoding/shift_jis.html
Normal file
4
src/browser/tests/page/encoding/shift_jis.html
Normal file
@@ -0,0 +1,4 @@
|
||||
<html>
|
||||
<head><meta charset="shift_jis"></head>
|
||||
<body><div id="test"><EFBFBD><EFBFBD><EFBFBD>{<7B><></div></body>
|
||||
</html>
|
||||
10
src/html5ever/Cargo.lock
generated
10
src/html5ever/Cargo.lock
generated
@@ -30,6 +30,15 @@ version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "encoding_rs"
|
||||
version = "0.8.35"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "find-msvc-tools"
|
||||
version = "0.1.2"
|
||||
@@ -67,6 +76,7 @@ checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa"
|
||||
name = "litefetch-html5ever"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"encoding_rs",
|
||||
"html5ever",
|
||||
"string_cache 0.9.0",
|
||||
"tikv-jemalloc-ctl",
|
||||
|
||||
@@ -15,6 +15,7 @@ typed-arena = "2.0.2"
|
||||
tikv-jemallocator = {version = "0.6.0", features = ["stats"]}
|
||||
tikv-jemalloc-ctl = {version = "0.6.0", features = ["stats"]}
|
||||
xml5ever = "0.35.0"
|
||||
encoding_rs = "0.8"
|
||||
|
||||
[profile.release]
|
||||
lto = true
|
||||
|
||||
@@ -27,6 +27,7 @@ use std::cell::Cell;
|
||||
use std::os::raw::{c_uchar, c_void};
|
||||
use types::*;
|
||||
|
||||
use encoding_rs::Encoding;
|
||||
use html5ever::interface::tree_builder::QuirksMode;
|
||||
use html5ever::tendril::{StrTendril, TendrilSink};
|
||||
use html5ever::{ns, parse_document, parse_fragment, LocalName, ParseOpts, Parser, QualName};
|
||||
@@ -85,6 +86,71 @@ pub extern "C" fn html5ever_parse_document(
|
||||
.one(bytes);
|
||||
}
|
||||
|
||||
/// Parse an HTML document with encoding conversion.
|
||||
/// If charset is provided, converts from that encoding to UTF-8 before parsing.
|
||||
/// Uses Cow<str> internally so no allocation if content is already valid UTF-8.
|
||||
#[no_mangle]
|
||||
pub extern "C" fn html5ever_parse_document_with_encoding(
|
||||
html: *mut c_uchar,
|
||||
len: usize,
|
||||
charset: *const c_uchar,
|
||||
charset_len: usize,
|
||||
document: Ref,
|
||||
ctx: Ref,
|
||||
create_element_callback: CreateElementCallback,
|
||||
get_data_callback: GetDataCallback,
|
||||
append_callback: AppendCallback,
|
||||
parse_error_callback: ParseErrorCallback,
|
||||
pop_callback: PopCallback,
|
||||
create_comment_callback: CreateCommentCallback,
|
||||
create_processing_instruction: CreateProcessingInstruction,
|
||||
append_doctype_to_document: AppendDoctypeToDocumentCallback,
|
||||
add_attrs_if_missing_callback: AddAttrsIfMissingCallback,
|
||||
get_template_contents_callback: GetTemplateContentsCallback,
|
||||
remove_from_parent_callback: RemoveFromParentCallback,
|
||||
reparent_children_callback: ReparentChildrenCallback,
|
||||
append_before_sibling_callback: AppendBeforeSiblingCallback,
|
||||
append_based_on_parent_node_callback: AppendBasedOnParentNodeCallback,
|
||||
) -> () {
|
||||
if html.is_null() || len == 0 {
|
||||
return ();
|
||||
}
|
||||
|
||||
let input = unsafe { std::slice::from_raw_parts(html, len) };
|
||||
let charset_bytes = unsafe { std::slice::from_raw_parts(charset, charset_len) };
|
||||
|
||||
// Decode to UTF-8. Returns Cow<str> - no allocation if already valid UTF-8.
|
||||
let encoding = Encoding::for_label(charset_bytes).unwrap_or(encoding_rs::UTF_8);
|
||||
let (decoded, _, _) = encoding.decode(input);
|
||||
|
||||
let arena = typed_arena::Arena::new();
|
||||
|
||||
let sink = sink::Sink {
|
||||
ctx: ctx,
|
||||
arena: &arena,
|
||||
document: document,
|
||||
quirks_mode: Cell::new(QuirksMode::NoQuirks),
|
||||
pop_callback: pop_callback,
|
||||
append_callback: append_callback,
|
||||
get_data_callback: get_data_callback,
|
||||
parse_error_callback: parse_error_callback,
|
||||
create_element_callback: create_element_callback,
|
||||
create_comment_callback: create_comment_callback,
|
||||
create_processing_instruction: create_processing_instruction,
|
||||
append_doctype_to_document: append_doctype_to_document,
|
||||
add_attrs_if_missing_callback: add_attrs_if_missing_callback,
|
||||
get_template_contents_callback: get_template_contents_callback,
|
||||
remove_from_parent_callback: remove_from_parent_callback,
|
||||
reparent_children_callback: reparent_children_callback,
|
||||
append_before_sibling_callback: append_before_sibling_callback,
|
||||
append_based_on_parent_node_callback: append_based_on_parent_node_callback,
|
||||
};
|
||||
|
||||
// Parse directly from decoded string
|
||||
parse_document(sink, Default::default())
|
||||
.one(StrTendril::from(decoded.as_ref()));
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn html5ever_parse_fragment(
|
||||
html: *mut c_uchar,
|
||||
|
||||
Reference in New Issue
Block a user