From 1a034126350acd6b4e8dc15276f06a1feae941da Mon Sep 17 00:00:00 2001 From: Pierre Tachoire Date: Thu, 11 Jan 2024 16:50:21 +0100 Subject: [PATCH 1/4] netsurf: add a test case for html parsing bug https://github.com/lightpanda-io/libdom/issues/4 --- src/run_tests.zig | 9 +++++++++ tests/html/bug-html-parsing-4.html | 6 ++++++ 2 files changed, 15 insertions(+) create mode 100644 tests/html/bug-html-parsing-4.html diff --git a/src/run_tests.zig b/src/run_tests.zig index a270deb1..05c9354d 100644 --- a/src/run_tests.zig +++ b/src/run_tests.zig @@ -113,3 +113,12 @@ test "DocumentHTMLParseFromStr" { doc = try parser.documentHTMLParseFromStr(str); parser.documentHTMLClose(doc) catch {}; } + +// https://github.com/lightpanda-io/libdom/issues/4 +test "bug document html parsing #4" { + const file = try std.fs.cwd().openFile("tests/html/bug-html-parsing-4.html", .{}); + defer file.close(); + + doc = try parser.documentHTMLParse(file.reader()); + parser.documentHTMLClose(doc) catch {}; +} diff --git a/tests/html/bug-html-parsing-4.html b/tests/html/bug-html-parsing-4.html new file mode 100644 index 00000000..391ac0c7 --- /dev/null +++ b/tests/html/bug-html-parsing-4.html @@ -0,0 +1,6 @@ + + + + + From 028cd2331f4637f11c76799efe93aa3f39fea2cd Mon Sep 17 00:00:00 2001 From: Pierre Tachoire Date: Fri, 12 Jan 2024 11:58:49 +0100 Subject: [PATCH 2/4] netsurf: pass encoding to the parser --- src/main.zig | 2 +- src/main_shell.zig | 2 +- src/netsurf.zig | 6 ++++-- src/run_tests.zig | 4 ++-- src/wpt/run.zig | 2 +- 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/main.zig b/src/main.zig index a8b024d2..9341072e 100644 --- a/src/main.zig +++ b/src/main.zig @@ -62,7 +62,7 @@ pub fn main() !void { const file = try std.fs.cwd().openFile("test.html", .{}); defer file.close(); - doc = try parser.documentHTMLParse(file.reader()); + doc = try parser.documentHTMLParse(file.reader(), "UTF-8"); defer parser.documentHTMLClose(doc) catch |err| { std.debug.print("documentHTMLClose 
error: {s}\n", .{@errorName(err)}); }; diff --git a/src/main_shell.zig b/src/main_shell.zig index ea9276fa..35d1fb39 100644 --- a/src/main_shell.zig +++ b/src/main_shell.zig @@ -43,7 +43,7 @@ pub fn main() !void { const file = try std.fs.cwd().openFile("test.html", .{}); defer file.close(); - doc = try parser.documentHTMLParse(file.reader()); + doc = try parser.documentHTMLParse(file.reader(), "UTF-8"); defer parser.documentHTMLClose(doc) catch |err| { std.debug.print("documentHTMLClose error: {s}\n", .{@errorName(err)}); }; diff --git a/src/netsurf.zig b/src/netsurf.zig index 5633515b..bd639e0a 100644 --- a/src/netsurf.zig +++ b/src/netsurf.zig @@ -1411,10 +1411,10 @@ fn parserErr(err: HubbubErr) ParserError!void { // The caller is responsible for closing the document. pub fn documentHTMLParseFromStr(str: []const u8) !*DocumentHTML { var fbs = std.io.fixedBufferStream(str); - return try documentHTMLParse(fbs.reader()); + return try documentHTMLParse(fbs.reader(), "UTF-8"); } -pub fn documentHTMLParse(reader: anytype) !*DocumentHTML { +pub fn documentHTMLParse(reader: anytype, enc: ?[:0]const u8) !*DocumentHTML { var parser: ?*c.dom_hubbub_parser = undefined; var doc: ?*c.dom_document = undefined; var err: c.hubbub_error = undefined; @@ -1429,6 +1429,8 @@ pub fn documentHTMLParse(reader: anytype) !*DocumentHTML { .daf = null, }; + if (enc) |e| params.enc = e; + err = c.dom_hubbub_parser_create(&params, &parser, &doc); try parserErr(err); defer c.dom_hubbub_parser_destroy(parser); diff --git a/src/run_tests.zig b/src/run_tests.zig index 05c9354d..8369ec61 100644 --- a/src/run_tests.zig +++ b/src/run_tests.zig @@ -43,7 +43,7 @@ fn testExecFn( const file = try std.fs.cwd().openFile("test.html", .{}); defer file.close(); - doc = try parser.documentHTMLParse(file.reader()); + doc = try parser.documentHTMLParse(file.reader(), "UTF-8"); defer parser.documentHTMLClose(doc) catch |err| { std.debug.print("documentHTMLClose error: {s}\n", .{@errorName(err)}); }; @@ -119,6 +119,6 
@@ test "bug document html parsing #4" { const file = try std.fs.cwd().openFile("tests/html/bug-html-parsing-4.html", .{}); defer file.close(); - doc = try parser.documentHTMLParse(file.reader()); + doc = try parser.documentHTMLParse(file.reader(), null); parser.documentHTMLClose(doc) catch {}; } diff --git a/src/wpt/run.zig b/src/wpt/run.zig index 3d7ca437..65d5ee67 100644 --- a/src/wpt/run.zig +++ b/src/wpt/run.zig @@ -21,7 +21,7 @@ pub fn run(arena: *std.heap.ArenaAllocator, comptime dir: []const u8, f: []const const file = try std.fs.cwd().openFile(f, .{}); defer file.close(); - const html_doc = try parser.documentHTMLParse(file.reader()); + const html_doc = try parser.documentHTMLParse(file.reader(), "UTF-8"); const doc = parser.documentHTMLToDocument(html_doc); const dirname = fspath.dirname(f[dir.len..]) orelse unreachable; From 0f246607072a1a1d5f631132281fd1c5070727a2 Mon Sep 17 00:00:00 2001 From: Pierre Tachoire Date: Fri, 12 Jan 2024 15:19:31 +0100 Subject: [PATCH 3/4] netsurf: fix parser error values --- src/netsurf.zig | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/netsurf.zig b/src/netsurf.zig index bd639e0a..b24c5d26 100644 --- a/src/netsurf.zig +++ b/src/netsurf.zig @@ -1380,6 +1380,8 @@ const ParserError = error{ EncodingChange, Paused, NoMemory, + Dom, + Hubbub, BadParameter, BadEncoding, Invalid, @@ -1392,17 +1394,20 @@ const HubbubErr = c.hubbub_error; fn parserErr(err: HubbubErr) ParserError!void { return switch (err) { - c.HUBBUB_OK => {}, - c.HUBBUB_REPROCESS => ParserError.Reprocess, - c.HUBBUB_ENCODINGCHANGE => ParserError.EncodingChange, - c.HUBBUB_PAUSED => ParserError.Paused, - c.HUBBUB_NOMEM => ParserError.NoMemory, - c.HUBBUB_BADPARM => ParserError.BadParameter, - c.HUBBUB_BADENCODING => ParserError.BadEncoding, - c.HUBBUB_INVALID => ParserError.Invalid, - c.HUBBUB_FILENOTFOUND => ParserError.FileNotFound, - c.HUBBUB_NEEDDATA => ParserError.NeedData, - c.HUBBUB_UNKNOWN => 
ParserError.Unknown, + c.DOM_HUBBUB_OK => {}, + c.DOM_HUBBUB_NOMEM => ParserError.NoMemory, + c.DOM_HUBBUB_BADPARM => ParserError.BadParameter, + c.DOM_HUBBUB_DOM => ParserError.Dom, + c.DOM_HUBBUB_HUBBUB_ERR => ParserError.Hubbub, + c.DOM_HUBBUB_HUBBUB_ERR_PAUSED => ParserError.Paused, + c.DOM_HUBBUB_HUBBUB_ERR_ENCODINGCHANGE => ParserError.EncodingChange, + c.DOM_HUBBUB_HUBBUB_ERR_NOMEM => ParserError.NoMemory, + c.DOM_HUBBUB_HUBBUB_ERR_BADPARM => ParserError.BadParameter, + c.DOM_HUBBUB_HUBBUB_ERR_INVALID => ParserError.Invalid, + c.DOM_HUBBUB_HUBBUB_ERR_FILENOTFOUND => ParserError.FileNotFound, + c.DOM_HUBBUB_HUBBUB_ERR_NEEDDATA => ParserError.NeedData, + c.DOM_HUBBUB_HUBBUB_ERR_BADENCODING => ParserError.BadEncoding, + c.DOM_HUBBUB_HUBBUB_ERR_UNKNOWN => ParserError.Unknown, else => unreachable, }; } From 2981703b7ff30140346bfc76078f492914ca41bf Mon Sep 17 00:00:00 2001 From: Pierre Tachoire Date: Fri, 12 Jan 2024 15:26:29 +0100 Subject: [PATCH 4/4] netsurf: add TODO on encoding change error code --- src/netsurf.zig | 7 +++++++ src/run_tests.zig | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/netsurf.zig b/src/netsurf.zig index b24c5d26..dca13979 100644 --- a/src/netsurf.zig +++ b/src/netsurf.zig @@ -1445,6 +1445,13 @@ pub fn documentHTMLParse(reader: anytype, enc: ?[:0]const u8) !*DocumentHTML { while (ln > 0) { ln = try reader.read(&buffer); err = c.dom_hubbub_parser_parse_chunk(parser, &buffer, ln); + // TODO handle encoding change error return. + // When the HTML contains a META tag with a different encoding than the + // original one, a c.DOM_HUBBUB_HUBBUB_ERR_ENCODINGCHANGE error is + // returned. + // In this case, we must restart the parsing with the new detected + // encoding. The detected encoding is stored in the document and we can + // get it with documentGetInputEncoding(). 
try parserErr(err); } diff --git a/src/run_tests.zig b/src/run_tests.zig index 8369ec61..1ba0f8b3 100644 --- a/src/run_tests.zig +++ b/src/run_tests.zig @@ -119,6 +119,6 @@ test "bug document html parsing #4" { const file = try std.fs.cwd().openFile("tests/html/bug-html-parsing-4.html", .{}); defer file.close(); - doc = try parser.documentHTMLParse(file.reader(), null); + doc = try parser.documentHTMLParse(file.reader(), "UTF-8"); parser.documentHTMLClose(doc) catch {}; }