diff --git a/src/main.zig b/src/main.zig
index a8b024d2..9341072e 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -62,7 +62,7 @@ pub fn main() !void {
     const file = try std.fs.cwd().openFile("test.html", .{});
     defer file.close();
 
-    doc = try parser.documentHTMLParse(file.reader());
+    doc = try parser.documentHTMLParse(file.reader(), "UTF-8");
     defer parser.documentHTMLClose(doc) catch |err| {
         std.debug.print("documentHTMLClose error: {s}\n", .{@errorName(err)});
     };
diff --git a/src/main_shell.zig b/src/main_shell.zig
index ea9276fa..35d1fb39 100644
--- a/src/main_shell.zig
+++ b/src/main_shell.zig
@@ -43,7 +43,7 @@ pub fn main() !void {
     const file = try std.fs.cwd().openFile("test.html", .{});
     defer file.close();
 
-    doc = try parser.documentHTMLParse(file.reader());
+    doc = try parser.documentHTMLParse(file.reader(), "UTF-8");
     defer parser.documentHTMLClose(doc) catch |err| {
         std.debug.print("documentHTMLClose error: {s}\n", .{@errorName(err)});
     };
diff --git a/src/netsurf.zig b/src/netsurf.zig
index 5633515b..dca13979 100644
--- a/src/netsurf.zig
+++ b/src/netsurf.zig
@@ -1380,6 +1380,8 @@ const ParserError = error{
     EncodingChange,
     Paused,
     NoMemory,
+    Dom,
+    Hubbub,
     BadParameter,
     BadEncoding,
     Invalid,
@@ -1392,17 +1394,20 @@ const HubbubErr = c.hubbub_error;
 
 fn parserErr(err: HubbubErr) ParserError!void {
     return switch (err) {
-        c.HUBBUB_OK => {},
-        c.HUBBUB_REPROCESS => ParserError.Reprocess,
-        c.HUBBUB_ENCODINGCHANGE => ParserError.EncodingChange,
-        c.HUBBUB_PAUSED => ParserError.Paused,
-        c.HUBBUB_NOMEM => ParserError.NoMemory,
-        c.HUBBUB_BADPARM => ParserError.BadParameter,
-        c.HUBBUB_BADENCODING => ParserError.BadEncoding,
-        c.HUBBUB_INVALID => ParserError.Invalid,
-        c.HUBBUB_FILENOTFOUND => ParserError.FileNotFound,
-        c.HUBBUB_NEEDDATA => ParserError.NeedData,
-        c.HUBBUB_UNKNOWN => ParserError.Unknown,
+        c.DOM_HUBBUB_OK => {},
+        c.DOM_HUBBUB_NOMEM => ParserError.NoMemory,
+        c.DOM_HUBBUB_BADPARM => ParserError.BadParameter,
+        c.DOM_HUBBUB_DOM => ParserError.Dom,
+        c.DOM_HUBBUB_HUBBUB_ERR => ParserError.Hubbub,
+        c.DOM_HUBBUB_HUBBUB_ERR_PAUSED => ParserError.Paused,
+        c.DOM_HUBBUB_HUBBUB_ERR_ENCODINGCHANGE => ParserError.EncodingChange,
+        c.DOM_HUBBUB_HUBBUB_ERR_NOMEM => ParserError.NoMemory,
+        c.DOM_HUBBUB_HUBBUB_ERR_BADPARM => ParserError.BadParameter,
+        c.DOM_HUBBUB_HUBBUB_ERR_INVALID => ParserError.Invalid,
+        c.DOM_HUBBUB_HUBBUB_ERR_FILENOTFOUND => ParserError.FileNotFound,
+        c.DOM_HUBBUB_HUBBUB_ERR_NEEDDATA => ParserError.NeedData,
+        c.DOM_HUBBUB_HUBBUB_ERR_BADENCODING => ParserError.BadEncoding,
+        c.DOM_HUBBUB_HUBBUB_ERR_UNKNOWN => ParserError.Unknown,
         else => unreachable,
     };
 }
@@ -1411,10 +1416,10 @@ fn parserErr(err: HubbubErr) ParserError!void {
 // The caller is responsible for closing the document.
 pub fn documentHTMLParseFromStr(str: []const u8) !*DocumentHTML {
     var fbs = std.io.fixedBufferStream(str);
-    return try documentHTMLParse(fbs.reader());
+    return try documentHTMLParse(fbs.reader(), "UTF-8");
 }
 
-pub fn documentHTMLParse(reader: anytype) !*DocumentHTML {
+pub fn documentHTMLParse(reader: anytype, enc: ?[:0]const u8) !*DocumentHTML {
     var parser: ?*c.dom_hubbub_parser = undefined;
     var doc: ?*c.dom_document = undefined;
     var err: c.hubbub_error = undefined;
@@ -1429,6 +1434,8 @@ pub fn documentHTMLParse(reader: anytype) !*DocumentHTML {
         .daf = null,
     };
 
+    if (enc) |e| params.enc = e;
+
     err = c.dom_hubbub_parser_create(&params, &parser, &doc);
     try parserErr(err);
     defer c.dom_hubbub_parser_destroy(parser);
@@ -1438,6 +1445,13 @@ pub fn documentHTMLParse(reader: anytype) !*DocumentHTML {
     while (ln > 0) {
         ln = try reader.read(&buffer);
         err = c.dom_hubbub_parser_parse_chunk(parser, &buffer, ln);
+        // TODO handle encoding change error return.
+        // When the HTML contains a META tag with a different encoding than the
+        // original one, a c.DOM_HUBBUB_HUBBUB_ERR_ENCODINGCHANGE error is
+        // returned.
+        // In this case, we must restart the parsing with the new detected
+        // encoding. The detected encoding is stored in the document and we can
+        // get it with documentGetInputEncoding().
         try parserErr(err);
     }
 
diff --git a/src/run_tests.zig b/src/run_tests.zig
index a270deb1..1ba0f8b3 100644
--- a/src/run_tests.zig
+++ b/src/run_tests.zig
@@ -43,7 +43,7 @@ fn testExecFn(
     const file = try std.fs.cwd().openFile("test.html", .{});
     defer file.close();
 
-    doc = try parser.documentHTMLParse(file.reader());
+    doc = try parser.documentHTMLParse(file.reader(), "UTF-8");
     defer parser.documentHTMLClose(doc) catch |err| {
         std.debug.print("documentHTMLClose error: {s}\n", .{@errorName(err)});
     };
@@ -113,3 +113,12 @@ test "DocumentHTMLParseFromStr" {
     doc = try parser.documentHTMLParseFromStr(str);
     parser.documentHTMLClose(doc) catch {};
 }
+
+// https://github.com/lightpanda-io/libdom/issues/4
+test "bug document html parsing #4" {
+    const file = try std.fs.cwd().openFile("tests/html/bug-html-parsing-4.html", .{});
+    defer file.close();
+
+    doc = try parser.documentHTMLParse(file.reader(), "UTF-8");
+    parser.documentHTMLClose(doc) catch {};
+}
diff --git a/src/wpt/run.zig b/src/wpt/run.zig
index 3d7ca437..65d5ee67 100644
--- a/src/wpt/run.zig
+++ b/src/wpt/run.zig
@@ -21,7 +21,7 @@ pub fn run(arena: *std.heap.ArenaAllocator, comptime dir: []const u8, f: []const
     const file = try std.fs.cwd().openFile(f, .{});
     defer file.close();
 
-    const html_doc = try parser.documentHTMLParse(file.reader());
+    const html_doc = try parser.documentHTMLParse(file.reader(), "UTF-8");
     const doc = parser.documentHTMLToDocument(html_doc);
 
     const dirname = fspath.dirname(f[dir.len..]) orelse unreachable;
diff --git a/tests/html/bug-html-parsing-4.html b/tests/html/bug-html-parsing-4.html
new file mode 100644
index 00000000..391ac0c7
--- /dev/null
+++ b/tests/html/bug-html-parsing-4.html
@@ -0,0 +1,6 @@
+
+
+ + +