diff --git a/src/TestHTTPServer.zig b/src/TestHTTPServer.zig index 21d9fa78..44736db1 100644 --- a/src/TestHTTPServer.zig +++ b/src/TestHTTPServer.zig @@ -131,6 +131,10 @@ fn getContentType(file_path: []const u8) []const u8 { return "application/json"; } + if (std.mem.endsWith(u8, file_path, ".GB2312.html")) { + return "text/html; charset=GB2312"; + } + if (std.mem.endsWith(u8, file_path, ".html")) { return "text/html"; } diff --git a/src/browser/Page.zig b/src/browser/Page.zig index 16a05806..7c60a573 100644 --- a/src/browser/Page.zig +++ b/src/browser/Page.zig @@ -37,6 +37,7 @@ const ScriptManager = @import("ScriptManager.zig"); const StyleManager = @import("StyleManager.zig"); const Parser = @import("parser/Parser.zig"); +const h5e = @import("parser/html5ever.zig"); const URL = @import("URL.zig"); const Blob = @import("webapi/Blob.zig"); @@ -960,7 +961,11 @@ fn pageDataCallback(response: HttpClient.Response, data: []const u8) !void { } switch (mime.content_type) { - .text_html => self._parse_state = .{ .html = .{} }, + .text_html => { + self._parse_state = .{ .html = .{ + .mime = mime, + } }; + }, .application_json, .text_javascript, .text_css, .text_plain => { var arr: std.ArrayList(u8) = .empty; try arr.appendSlice(self.arena, "
");
@@ -974,7 +979,7 @@ fn pageDataCallback(response: HttpClient.Response, data: []const u8) !void {
}
switch (self._parse_state) {
- .html => |*buf| try buf.appendSlice(self.arena, data),
+ .html => |*html| try html.buf.appendSlice(self.arena, data),
.text => |*buf| {
// we have to escape the data...
var v = data;
@@ -1023,8 +1028,13 @@ fn pageDoneCallback(ctx: *anyopaque) !void {
var parser = Parser.init(parse_arena, self.document.asNode(), self);
switch (self._parse_state) {
- .html => |buf| {
- parser.parse(buf.items);
+ .html => |*html_state| {
+ const raw_html = html_state.buf.items;
+ if (html_state.needsEncodingConversion()) {
+ parser.parseWithEncoding(raw_html, html_state.mime.charsetString());
+ } else {
+ parser.parse(raw_html);
+ }
self._script_manager.staticScriptsDone();
self._parse_state = .complete;
},
@@ -1091,7 +1101,6 @@ fn pageErrorCallback(ctx: *anyopaque, err: anyerror) void {
return;
};
}
-
pub fn isGoingAway(self: *const Page) bool {
if (self._queued_navigation != null) {
return true;
@@ -3155,11 +3164,21 @@ const ParseState = union(enum) {
pre,
complete,
err: anyerror,
- html: std.ArrayList(u8),
+ html: Html,
text: std.ArrayList(u8),
image: std.ArrayList(u8),
raw: std.ArrayList(u8),
raw_done: []const u8,
+
+    /// Parse state for a `text/html` response: the response's MIME value plus
+    /// the raw body bytes collected so far.
+    const Html = struct {
+        // MIME value recorded when the response was classified as HTML; its
+        // charset label decides how the body is parsed.
+        mime: Mime,
+        // Raw, still-undecoded body bytes appended chunk by chunk.
+        buf: std.ArrayList(u8) = .empty,
+
+        /// Returns false when the MIME charset label is "utf-8" or "utf8"
+        /// (compared ASCII case-insensitively) and true for any other label.
+        fn needsEncodingConversion(self: *const Html) bool {
+            const label = self.mime.charsetString();
+            if (std.ascii.eqlIgnoreCase(label, "utf-8")) return false;
+            if (std.ascii.eqlIgnoreCase(label, "utf8")) return false;
+            return true;
+        }
+    };
};
const LoadState = enum {
diff --git a/src/browser/parser/Parser.zig b/src/browser/parser/Parser.zig
index f259bdd5..0c06cbcc 100644
--- a/src/browser/parser/Parser.zig
+++ b/src/browser/parser/Parser.zig
@@ -103,6 +103,32 @@ pub fn parse(self: *Parser, html: []const u8) void {
);
}
+/// Parses a complete HTML document whose bytes are encoded as `charset`.
+/// Both the document bytes and the charset label are handed to the html5ever
+/// binding as pointer/length pairs, along with this parser's tree-building
+/// callbacks; the binding converts the input to UTF-8 before parsing it.
+pub fn parseWithEncoding(self: *Parser, html: []const u8, charset: []const u8) void {
+    // Root container the binding receives as its `doc` argument.
+    const document_ref = &self.container;
+
+    h5e.html5ever_parse_document_with_encoding(
+        html.ptr,
+        html.len,
+        charset.ptr,
+        charset.len,
+        document_ref,
+        self,
+        createElementCallback,
+        getDataCallback,
+        appendCallback,
+        parseErrorCallback,
+        popCallback,
+        createCommentCallback,
+        createProcessingInstruction,
+        appendDoctypeToDocument,
+        addAttrsIfMissingCallback,
+        getTemplateContentsCallback,
+        removeFromParentCallback,
+        reparentChildrenCallback,
+        appendBeforeSiblingCallback,
+        appendBasedOnParentNodeCallback,
+    );
+}
+
pub fn parseXML(self: *Parser, xml: []const u8) void {
h5e.xml5ever_parse_document(
xml.ptr,
diff --git a/src/browser/parser/html5ever.zig b/src/browser/parser/html5ever.zig
index 8ee873e2..f6f81583 100644
--- a/src/browser/parser/html5ever.zig
+++ b/src/browser/parser/html5ever.zig
@@ -39,6 +39,30 @@ pub extern "c" fn html5ever_parse_document(
appendBasedOnParentNodeCallback: *const fn (ctx: *anyopaque, element_ref: *anyopaque, prev_element_ref: *anyopaque, NodeOrText) callconv(.c) void,
) void;
+/// Binding for the Rust function `html5ever_parse_document_with_encoding`,
+/// which converts the document from `charset` to UTF-8 before parsing it.
+///
+/// `html`/`len` and `charset`/`charset_len` are pointer + length pairs, so
+/// neither buffer has to be NUL-terminated. `doc` and `ctx` are opaque
+/// references handed back to the callbacks. The callbacks are listed in the
+/// same order as in the Rust signature; `elemNameCallback` occupies the slot
+/// the Rust side names `get_data_callback`.
+pub extern "c" fn html5ever_parse_document_with_encoding(
+ html: [*c]const u8,
+ len: usize,
+ charset: [*c]const u8,
+ charset_len: usize,
+ doc: *anyopaque,
+ ctx: *anyopaque,
+ createElementCallback: *const fn (ctx: *anyopaque, data: *anyopaque, QualName, AttributeIterator) callconv(.c) ?*anyopaque,
+ elemNameCallback: *const fn (node_ref: *anyopaque) callconv(.c) *anyopaque,
+ appendCallback: *const fn (ctx: *anyopaque, parent_ref: *anyopaque, NodeOrText) callconv(.c) void,
+ parseErrorCallback: *const fn (ctx: *anyopaque, StringSlice) callconv(.c) void,
+ popCallback: *const fn (ctx: *anyopaque, node_ref: *anyopaque) callconv(.c) void,
+ createCommentCallback: *const fn (ctx: *anyopaque, StringSlice) callconv(.c) ?*anyopaque,
+ createProcessingInstruction: *const fn (ctx: *anyopaque, StringSlice, StringSlice) callconv(.c) ?*anyopaque,
+ appendDoctypeToDocument: *const fn (ctx: *anyopaque, StringSlice, StringSlice, StringSlice) callconv(.c) void,
+ addAttrsIfMissingCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque, AttributeIterator) callconv(.c) void,
+ getTemplateContentsCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque) callconv(.c) ?*anyopaque,
+ removeFromParentCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque) callconv(.c) void,
+ reparentChildrenCallback: *const fn (ctx: *anyopaque, node_ref: *anyopaque, new_parent_ref: *anyopaque) callconv(.c) void,
+ appendBeforeSiblingCallback: *const fn (ctx: *anyopaque, sibling_ref: *anyopaque, NodeOrText) callconv(.c) void,
+ appendBasedOnParentNodeCallback: *const fn (ctx: *anyopaque, element_ref: *anyopaque, prev_element_ref: *anyopaque, NodeOrText) callconv(.c) void,
+) void;
+
pub extern "c" fn html5ever_parse_fragment(
html: [*c]const u8,
len: usize,
diff --git a/src/browser/tests/page/encoding.html b/src/browser/tests/page/encoding.html
new file mode 100644
index 00000000..af532b82
--- /dev/null
+++ b/src/browser/tests/page/encoding.html
@@ -0,0 +1,75 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/browser/tests/page/encoding/content_type.GB2312.html b/src/browser/tests/page/encoding/content_type.GB2312.html
new file mode 100644
index 00000000..818566a8
--- /dev/null
+++ b/src/browser/tests/page/encoding/content_type.GB2312.html
@@ -0,0 +1,4 @@
+
+
+ÖÐÎÄ
+
diff --git a/src/browser/tests/page/encoding/gbk.html b/src/browser/tests/page/encoding/gbk.html
new file mode 100644
index 00000000..68a6106a
--- /dev/null
+++ b/src/browser/tests/page/encoding/gbk.html
@@ -0,0 +1,4 @@
+
+
+ÖÐÎÄ
+
diff --git a/src/browser/tests/page/encoding/latin1.html b/src/browser/tests/page/encoding/latin1.html
new file mode 100644
index 00000000..be7d46ee
--- /dev/null
+++ b/src/browser/tests/page/encoding/latin1.html
@@ -0,0 +1,4 @@
+
+
+Café
+
diff --git a/src/browser/tests/page/encoding/no_charset.html b/src/browser/tests/page/encoding/no_charset.html
new file mode 100644
index 00000000..818566a8
--- /dev/null
+++ b/src/browser/tests/page/encoding/no_charset.html
@@ -0,0 +1,4 @@
+
+
+ÖÐÎÄ
+
diff --git a/src/browser/tests/page/encoding/shift_jis.html b/src/browser/tests/page/encoding/shift_jis.html
new file mode 100644
index 00000000..2984bb6c
--- /dev/null
+++ b/src/browser/tests/page/encoding/shift_jis.html
@@ -0,0 +1,4 @@
+
+
+“ú–{Œê
+
diff --git a/src/html5ever/Cargo.lock b/src/html5ever/Cargo.lock
index d94a7fd7..5faadb8e 100644
--- a/src/html5ever/Cargo.lock
+++ b/src/html5ever/Cargo.lock
@@ -30,6 +30,15 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+[[package]]
+name = "encoding_rs"
+version = "0.8.35"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
+dependencies = [
+ "cfg-if",
+]
+
[[package]]
name = "find-msvc-tools"
version = "0.1.2"
@@ -67,6 +76,7 @@ checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa"
name = "litefetch-html5ever"
version = "0.1.0"
dependencies = [
+ "encoding_rs",
"html5ever",
"string_cache 0.9.0",
"tikv-jemalloc-ctl",
diff --git a/src/html5ever/Cargo.toml b/src/html5ever/Cargo.toml
index b4004404..b82ac73b 100644
--- a/src/html5ever/Cargo.toml
+++ b/src/html5ever/Cargo.toml
@@ -15,6 +15,7 @@ typed-arena = "2.0.2"
tikv-jemallocator = {version = "0.6.0", features = ["stats"]}
tikv-jemalloc-ctl = {version = "0.6.0", features = ["stats"]}
xml5ever = "0.35.0"
+encoding_rs = "0.8"
[profile.release]
lto = true
diff --git a/src/html5ever/lib.rs b/src/html5ever/lib.rs
index 29d62539..6fab9763 100644
--- a/src/html5ever/lib.rs
+++ b/src/html5ever/lib.rs
@@ -27,6 +27,7 @@ use std::cell::Cell;
use std::os::raw::{c_uchar, c_void};
use types::*;
+use encoding_rs::Encoding;
use html5ever::interface::tree_builder::QuirksMode;
use html5ever::tendril::{StrTendril, TendrilSink};
use html5ever::{ns, parse_document, parse_fragment, LocalName, ParseOpts, Parser, QualName};
@@ -85,6 +86,71 @@ pub extern "C" fn html5ever_parse_document(
.one(bytes);
}
+/// Parse an HTML document whose bytes are in the encoding labelled by `charset`.
+///
+/// `html`/`len` is the raw document and `charset`/`charset_len` is the encoding
+/// label (for example `GB2312`); neither buffer needs to be NUL-terminated.
+/// The input is decoded to UTF-8 with `encoding_rs` before it is handed to
+/// html5ever:
+/// - a null, empty, or unrecognised label falls back to UTF-8;
+/// - `Encoding::decode` performs BOM sniffing, so a BOM in the input takes
+///   precedence over the label;
+/// - malformed sequences are replaced with U+FFFD rather than aborting.
+///
+/// Nothing is parsed when `html` is null or `len` is 0.
+#[no_mangle]
+pub extern "C" fn html5ever_parse_document_with_encoding(
+    html: *mut c_uchar,
+    len: usize,
+    charset: *const c_uchar,
+    charset_len: usize,
+    document: Ref,
+    ctx: Ref,
+    create_element_callback: CreateElementCallback,
+    get_data_callback: GetDataCallback,
+    append_callback: AppendCallback,
+    parse_error_callback: ParseErrorCallback,
+    pop_callback: PopCallback,
+    create_comment_callback: CreateCommentCallback,
+    create_processing_instruction: CreateProcessingInstruction,
+    append_doctype_to_document: AppendDoctypeToDocumentCallback,
+    add_attrs_if_missing_callback: AddAttrsIfMissingCallback,
+    get_template_contents_callback: GetTemplateContentsCallback,
+    remove_from_parent_callback: RemoveFromParentCallback,
+    reparent_children_callback: ReparentChildrenCallback,
+    append_before_sibling_callback: AppendBeforeSiblingCallback,
+    append_based_on_parent_node_callback: AppendBasedOnParentNodeCallback,
+) -> () {
+    if html.is_null() || len == 0 {
+        return ();
+    }
+
+    let input = unsafe { std::slice::from_raw_parts(html, len) };
+
+    // `slice::from_raw_parts` requires a non-null pointer even for a
+    // zero-length slice, so a missing charset must never reach it.
+    let charset_bytes: &[u8] = if charset.is_null() || charset_len == 0 {
+        &[]
+    } else {
+        unsafe { std::slice::from_raw_parts(charset, charset_len) }
+    };
+
+    // An empty or unknown label yields `None` and falls back to UTF-8.
+    let encoding = Encoding::for_label(charset_bytes).unwrap_or(encoding_rs::UTF_8);
+    // `decode` returns a `Cow` and borrows the input when no conversion is
+    // needed; the `StrTendril` built below still takes its own copy.
+    let (decoded, _, _) = encoding.decode(input);
+
+    let arena = typed_arena::Arena::new();
+
+    let sink = sink::Sink {
+        ctx: ctx,
+        arena: &arena,
+        document: document,
+        quirks_mode: Cell::new(QuirksMode::NoQuirks),
+        pop_callback: pop_callback,
+        append_callback: append_callback,
+        get_data_callback: get_data_callback,
+        parse_error_callback: parse_error_callback,
+        create_element_callback: create_element_callback,
+        create_comment_callback: create_comment_callback,
+        create_processing_instruction: create_processing_instruction,
+        append_doctype_to_document: append_doctype_to_document,
+        add_attrs_if_missing_callback: add_attrs_if_missing_callback,
+        get_template_contents_callback: get_template_contents_callback,
+        remove_from_parent_callback: remove_from_parent_callback,
+        reparent_children_callback: reparent_children_callback,
+        append_before_sibling_callback: append_before_sibling_callback,
+        append_based_on_parent_node_callback: append_based_on_parent_node_callback,
+    };
+
+    // Feed the decoded UTF-8 text to the parser in a single chunk.
+    parse_document(sink, Default::default())
+        .one(StrTendril::from(decoded.as_ref()));
+}
+
#[no_mangle]
pub extern "C" fn html5ever_parse_fragment(
html: *mut c_uchar,