mirror of
https://github.com/lightpanda-io/browser.git
synced 2026-06-11 17:46:32 -04:00
Merge pull request #2129 from lightpanda-io/non-utf8-querystring-encoding
Non utf8 querystring encoding
This commit is contained in:
@@ -671,7 +671,7 @@ pub fn getNodeDetails(
|
||||
|
||||
if (el.getAttributeSafe(comptime .wrap("href"))) |h| {
|
||||
const URL = lp.URL;
|
||||
href = URL.resolve(arena, page.base(), h, .{ .encode = true }) catch h;
|
||||
href = URL.resolve(arena, page.base(), h, .{ .encoding = page.charset }) catch h;
|
||||
}
|
||||
|
||||
if (el.is(Element.Html.Input)) |input| {
|
||||
|
||||
@@ -207,6 +207,9 @@ base_url: ?[:0]const u8 = null,
|
||||
// referer header cache.
|
||||
referer_header: ?[:0]const u8 = null,
|
||||
|
||||
// Document charset (canonical name from encoding_rs, static lifetime)
|
||||
charset: []const u8 = "UTF-8",
|
||||
|
||||
// Arbitrary buffer. Need to temporarily lowercase a value? Use this. No lifetime
|
||||
// guarantee - it's valid until someone else uses it.
|
||||
buf: [BUF_SIZE]u8 = undefined,
|
||||
@@ -658,7 +661,7 @@ fn scheduleNavigationWithArena(originator: *Page, arena: Allocator, request_url:
|
||||
arena,
|
||||
page_base,
|
||||
request_url,
|
||||
.{ .always_dupe = true, .encode = true },
|
||||
.{ .always_dupe = true, .encoding = originator.charset },
|
||||
);
|
||||
break :blk .{ u, false };
|
||||
};
|
||||
@@ -962,9 +965,13 @@ fn pageDataCallback(response: HttpClient.Response, data: []const u8) !void {
|
||||
|
||||
switch (mime.content_type) {
|
||||
.text_html => {
|
||||
self._parse_state = .{ .html = .{
|
||||
.mime = mime,
|
||||
} };
|
||||
// Normalize and store the charset using encoding_rs canonical names
|
||||
const charset_str = mime.charsetString();
|
||||
const info = h5e.encoding_for_label(charset_str.ptr, charset_str.len);
|
||||
if (info.isValid()) {
|
||||
self.charset = info.name();
|
||||
}
|
||||
self._parse_state = .{ .html = .empty };
|
||||
},
|
||||
.application_json, .text_javascript, .text_css, .text_plain => {
|
||||
var arr: std.ArrayList(u8) = .empty;
|
||||
@@ -979,7 +986,7 @@ fn pageDataCallback(response: HttpClient.Response, data: []const u8) !void {
|
||||
}
|
||||
|
||||
switch (self._parse_state) {
|
||||
.html => |*html| try html.buf.appendSlice(self.arena, data),
|
||||
.html => |*html| try html.appendSlice(self.arena, data),
|
||||
.text => |*buf| {
|
||||
// we have to escape the data...
|
||||
var v = data;
|
||||
@@ -1028,12 +1035,13 @@ fn pageDoneCallback(ctx: *anyopaque) !void {
|
||||
var parser = Parser.init(parse_arena, self.document.asNode(), self);
|
||||
|
||||
switch (self._parse_state) {
|
||||
.html => |*html_state| {
|
||||
const raw_html = html_state.buf.items;
|
||||
if (html_state.needsEncodingConversion()) {
|
||||
parser.parseWithEncoding(raw_html, html_state.mime.charsetString());
|
||||
} else {
|
||||
.html => |*html_buf| {
|
||||
const raw_html = html_buf.items;
|
||||
|
||||
if (std.mem.eql(u8, self.charset, "UTF-8")) {
|
||||
parser.parse(raw_html);
|
||||
} else {
|
||||
parser.parseWithEncoding(raw_html, self.charset);
|
||||
}
|
||||
self._script_manager.staticScriptsDone();
|
||||
self._parse_state = .complete;
|
||||
@@ -1188,7 +1196,7 @@ pub fn iframeAddedCallback(self: *Page, iframe: *IFrame) !void {
|
||||
self.call_arena, // ok to use, page.navigate dupes this
|
||||
self.base(),
|
||||
src,
|
||||
.{ .encode = true },
|
||||
.{ .encoding = self.charset },
|
||||
);
|
||||
};
|
||||
|
||||
@@ -3164,21 +3172,11 @@ const ParseState = union(enum) {
|
||||
pre,
|
||||
complete,
|
||||
err: anyerror,
|
||||
html: Html,
|
||||
html: std.ArrayList(u8),
|
||||
text: std.ArrayList(u8),
|
||||
image: std.ArrayList(u8),
|
||||
raw: std.ArrayList(u8),
|
||||
raw_done: []const u8,
|
||||
|
||||
const Html = struct {
|
||||
mime: Mime,
|
||||
buf: std.ArrayList(u8) = .empty,
|
||||
|
||||
fn needsEncodingConversion(self: *const Html) bool {
|
||||
const charset = self.mime.charsetString();
|
||||
return !std.ascii.eqlIgnoreCase(charset, "utf-8") and !std.ascii.eqlIgnoreCase(charset, "utf8");
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
const LoadState = enum {
|
||||
@@ -3628,9 +3626,6 @@ fn asUint(comptime string: anytype) std.meta.Int(
|
||||
|
||||
const testing = @import("../testing.zig");
|
||||
test "WebApi: Page" {
|
||||
const filter: testing.LogFilter = .init(&.{ .http, .js });
|
||||
defer filter.deinit();
|
||||
|
||||
try testing.htmlRunner("page", .{});
|
||||
}
|
||||
|
||||
|
||||
@@ -19,16 +19,19 @@
|
||||
const std = @import("std");
|
||||
const Allocator = std.mem.Allocator;
|
||||
|
||||
const ResolveOpts = struct {
|
||||
encode: bool = false,
|
||||
pub const ResolveOpts = struct {
|
||||
/// null = don't encode, "UTF-8" = standard percent encoding,
|
||||
/// other charset = encode query string using that charset with NCR fallback
|
||||
encoding: ?[]const u8 = null,
|
||||
always_dupe: bool = false,
|
||||
};
|
||||
|
||||
// path is anytype, so that it can be used with both []const u8 and [:0]const u8
|
||||
pub fn resolve(allocator: Allocator, base: [:0]const u8, source_path: anytype, comptime opts: ResolveOpts) ![:0]const u8 {
|
||||
pub fn resolve(allocator: Allocator, base: [:0]const u8, source_path: anytype, opts: ResolveOpts) ![:0]const u8 {
|
||||
const PT = @TypeOf(source_path);
|
||||
|
||||
var path: [:0]const u8 = if (comptime !isNullTerminated(PT) or opts.always_dupe) try allocator.dupeZ(u8, source_path) else source_path;
|
||||
const needs_dupe = comptime !isNullTerminated(PT);
|
||||
var path: [:0]const u8 = if (needs_dupe or opts.always_dupe) try allocator.dupeZ(u8, source_path) else source_path;
|
||||
|
||||
if (base.len == 0) {
|
||||
return processResolved(allocator, path, opts);
|
||||
@@ -186,14 +189,12 @@ pub fn resolve(allocator: Allocator, base: [:0]const u8, source_path: anytype, c
|
||||
return processResolved(allocator, out[0..out_i :0], opts);
|
||||
}
|
||||
|
||||
fn processResolved(allocator: Allocator, url: [:0]const u8, comptime opts: ResolveOpts) ![:0]const u8 {
|
||||
if (!comptime opts.encode) {
|
||||
return url;
|
||||
}
|
||||
return ensureEncoded(allocator, url);
|
||||
fn processResolved(allocator: Allocator, url: [:0]const u8, opts: ResolveOpts) ![:0]const u8 {
|
||||
const encoding = opts.encoding orelse return url;
|
||||
return ensureEncoded(allocator, url, encoding);
|
||||
}
|
||||
|
||||
pub fn ensureEncoded(allocator: Allocator, url: [:0]const u8) ![:0]const u8 {
|
||||
pub fn ensureEncoded(allocator: Allocator, url: [:0]const u8, encoding: []const u8) ![:0]const u8 {
|
||||
const scheme_end = std.mem.indexOf(u8, url, "://");
|
||||
const authority_start = if (scheme_end) |end| end + 3 else 0;
|
||||
const path_start = std.mem.indexOfScalarPos(u8, url, authority_start, '/') orelse return url;
|
||||
@@ -205,18 +206,18 @@ pub fn ensureEncoded(allocator: Allocator, url: [:0]const u8) ![:0]const u8 {
|
||||
const query_end = if (query_start) |_| (fragment_start orelse url.len) else path_end;
|
||||
|
||||
const path_to_encode = url[path_start..path_end];
|
||||
// Path is always UTF-8 percent encoded per URL spec
|
||||
const encoded_path = try percentEncodeSegment(allocator, path_to_encode, .path);
|
||||
|
||||
// Query string uses document encoding
|
||||
const encoded_query = if (query_start) |qs| blk: {
|
||||
const query_to_encode = url[qs + 1 .. query_end];
|
||||
const encoded = try percentEncodeSegment(allocator, query_to_encode, .query);
|
||||
break :blk encoded;
|
||||
break :blk try encodeQueryString(allocator, query_to_encode, encoding);
|
||||
} else null;
|
||||
|
||||
const encoded_fragment = if (fragment_start) |fs| blk: {
|
||||
const fragment_to_encode = url[fs + 1 ..];
|
||||
const encoded = try percentEncodeSegment(allocator, fragment_to_encode, .query);
|
||||
break :blk encoded;
|
||||
break :blk try percentEncodeSegment(allocator, fragment_to_encode, .query);
|
||||
} else null;
|
||||
|
||||
if (encoded_path.ptr == path_to_encode.ptr and
|
||||
@@ -242,7 +243,7 @@ pub fn ensureEncoded(allocator: Allocator, url: [:0]const u8) ![:0]const u8 {
|
||||
return buf.items[0 .. buf.items.len - 1 :0];
|
||||
}
|
||||
|
||||
const EncodeSet = enum { path, query, userinfo, fragment };
|
||||
const EncodeSet = enum { path, query, query_legacy, userinfo, fragment };
|
||||
|
||||
fn percentEncodeSegment(allocator: Allocator, segment: []const u8, comptime encode_set: EncodeSet) ![]const u8 {
|
||||
// Check if encoding is needed
|
||||
@@ -285,17 +286,65 @@ fn percentEncodeSegment(allocator: Allocator, segment: []const u8, comptime enco
|
||||
return buf.items;
|
||||
}
|
||||
|
||||
const h5e = @import("parser/html5ever.zig");
|
||||
|
||||
/// Encode a query string using the given document encoding.
///
/// For "UTF-8" this is plain percent encoding with the `.query` set. For any
/// other label the query is first transcoded with
/// `h5e.encoding_encode_with_ncr` (unmappable characters become NCRs of the
/// form &#codepoint;), and the transcoded bytes are then percent-encoded with
/// the `.query_legacy` set so that the NCR delimiters are escaped as well.
///
/// Falls back to UTF-8 percent encoding of the original `query` when the
/// label is not recognised, no encoding handle is available, the reported
/// maximum buffer length is 0, or the transcoding call reports failure.
///
/// The returned slice never points into the temporary transcoding buffer:
/// that buffer is released before returning, so a result that would alias it
/// is copied first.
fn encodeQueryString(allocator: Allocator, query: []const u8, encoding: []const u8) ![]const u8 {
    // For UTF-8, use standard percent encoding.
    if (std.mem.eql(u8, encoding, "UTF-8")) {
        return percentEncodeSegment(allocator, query, .query);
    }

    // For legacy encodings, first encode to the target charset with NCR fallback.
    const enc_info = h5e.encoding_for_label(encoding.ptr, encoding.len);
    if (!enc_info.isValid()) {
        // Unknown encoding, fall back to UTF-8.
        return percentEncodeSegment(allocator, query, .query);
    }

    // A "found" encoding without a handle cannot be used; treat it like an
    // unknown encoding instead of unwrapping a null.
    const handle = enc_info.handle orelse return percentEncodeSegment(allocator, query, .query);

    // Calculate max buffer size for encoded output.
    const max_encoded_len = h5e.encoding_max_encode_buffer_length(handle, query.len);
    if (max_encoded_len == 0) {
        return percentEncodeSegment(allocator, query, .query);
    }

    const encode_buf = try allocator.alloc(u8, max_encoded_len);
    defer allocator.free(encode_buf);

    // Encode UTF-8 to legacy encoding with NCR fallback.
    const result = h5e.encoding_encode_with_ncr(
        handle,
        query.ptr,
        query.len,
        encode_buf.ptr,
        encode_buf.len,
    );
    if (!result.isSuccess()) {
        // Encoding failed, fall back to UTF-8.
        return percentEncodeSegment(allocator, query, .query);
    }

    // Now percent-encode the result using query_legacy to preserve NCRs.
    const encoded_bytes = encode_buf[0..result.bytes_written];
    const escaped = try percentEncodeSegment(allocator, encoded_bytes, .query_legacy);

    // When nothing needed escaping the segment encoder can hand back its
    // input unchanged. That input lives in `encode_buf`, which the `defer`
    // above frees on return, so give the caller its own copy in that case.
    if (escaped.ptr == encoded_bytes.ptr) {
        return allocator.dupe(u8, encoded_bytes);
    }
    return escaped;
}
|
||||
|
||||
fn shouldPercentEncode(c: u8, comptime encode_set: EncodeSet) bool {
|
||||
return switch (c) {
|
||||
// Unreserved characters (RFC 3986)
|
||||
'A'...'Z', 'a'...'z', '0'...'9', '-', '.', '_', '~' => false,
|
||||
// sub-delims allowed in path/query but some must be encoded in userinfo
|
||||
'!', '$', '&', '\'', '(', ')', '*', '+', ',' => false,
|
||||
';', '=' => encode_set == .userinfo,
|
||||
// sub-delims allowed in path/query but some must be encoded in userinfo/query_legacy
|
||||
'!', '$', '\'', '(', ')', '*', '+', ',' => false,
|
||||
// '&' and ';' must be encoded for legacy encoding (to preserve NCRs like &#nnnnn;)
|
||||
'&', ';' => encode_set == .userinfo or encode_set == .query_legacy,
|
||||
'=' => encode_set == .userinfo,
|
||||
// Separators: userinfo must encode these
|
||||
'/', ':', '@' => encode_set == .userinfo,
|
||||
// '?' is allowed in queries only
|
||||
'?' => encode_set != .query,
|
||||
'?' => encode_set != .query and encode_set != .query_legacy,
|
||||
// '#' is allowed in fragments only
|
||||
'#' => encode_set != .fragment,
|
||||
// Everything else needs encoding (including space)
|
||||
@@ -1130,7 +1179,7 @@ test "URL: ensureEncoded" {
|
||||
};
|
||||
|
||||
for (cases) |case| {
|
||||
const result = try ensureEncoded(testing.arena_allocator, case.url);
|
||||
const result = try ensureEncoded(testing.arena_allocator, case.url, "UTF-8");
|
||||
try testing.expectString(case.expected, result);
|
||||
}
|
||||
}
|
||||
@@ -1296,7 +1345,7 @@ test "URL: resolve with encoding" {
|
||||
};
|
||||
|
||||
for (cases) |case| {
|
||||
const result = try resolve(testing.arena_allocator, case.base, case.path, .{ .encode = true });
|
||||
const result = try resolve(testing.arena_allocator, case.base, case.path, .{ .encoding = "UTF-8" });
|
||||
try testing.expectString(case.expected, result);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -182,7 +182,7 @@ pub fn collectInteractiveElements(
|
||||
.id = el.getAttributeSafe(comptime .wrap("id")),
|
||||
.class = el.getAttributeSafe(comptime .wrap("class")),
|
||||
.href = if (el.getAttributeSafe(comptime .wrap("href"))) |href|
|
||||
URL.resolve(arena, page.base(), href, .{ .encode = true }) catch href
|
||||
URL.resolve(arena, page.base(), href, .{ .encoding = page.charset }) catch href
|
||||
else
|
||||
null,
|
||||
.input_type = getInputType(el),
|
||||
|
||||
@@ -278,7 +278,8 @@ const Context = struct {
|
||||
}
|
||||
try self.writer.writeAll("](");
|
||||
if (el.getAttributeSafe(comptime .wrap("src"))) |src| {
|
||||
const absolute_src = URL.resolve(self.page.call_arena, self.page.base(), src, .{ .encode = true }) catch src;
|
||||
const page = self.page;
|
||||
const absolute_src = URL.resolve(page.call_arena, page.base(), src, .{ .encoding = page.charset }) catch src;
|
||||
try self.writer.writeAll(absolute_src);
|
||||
}
|
||||
try self.writer.writeAll(")");
|
||||
@@ -286,13 +287,14 @@ const Context = struct {
|
||||
return;
|
||||
},
|
||||
.anchor => {
|
||||
const page = self.page;
|
||||
const info = analyzeContent(el.asNode());
|
||||
const label = getAnchorLabel(el);
|
||||
const href_raw = el.getAttributeSafe(comptime .wrap("href"));
|
||||
|
||||
if (!info.has_visible and label == null and href_raw == null) return;
|
||||
|
||||
const href = if (href_raw) |h| URL.resolve(self.page.call_arena, self.page.base(), h, .{ .encode = true }) catch h else null;
|
||||
const href = if (href_raw) |h| URL.resolve(page.call_arena, page.base(), h, .{ .encoding = page.charset }) catch h else null;
|
||||
|
||||
if (info.has_block) {
|
||||
try self.renderChildren(el.asNode());
|
||||
|
||||
@@ -216,3 +216,89 @@ pub extern "c" fn xml5ever_parse_document(
|
||||
appendBeforeSiblingCallback: *const fn (ctx: *anyopaque, sibling_ref: *anyopaque, NodeOrText) callconv(.c) void,
|
||||
appendBasedOnParentNodeCallback: *const fn (ctx: *anyopaque, element_ref: *anyopaque, prev_element_ref: *anyopaque, NodeOrText) callconv(.c) void,
|
||||
) void;
|
||||
|
||||
// General encoding api
|
||||
/// Result of an encoding lookup, as returned by the C-ABI function
/// `encoding_for_label`. Field order and types are part of the ABI.
pub const EncodingInfo = extern struct {
    // Non-zero when the lookup matched an encoding.
    found: u8,
    // Opaque encoding handle passed to the other encoding_* functions.
    handle: ?*anyopaque,
    // Length in bytes of the name at `name_ptr`.
    name_len: usize,
    name_ptr: [*]const u8,

    /// True when the lookup matched an encoding (`found` is non-zero).
    pub fn isValid(self: *const EncodingInfo) bool {
        return self.found != 0;
    }

    /// The encoding name as a slice of `name_len` bytes starting at
    /// `name_ptr`, or an empty string when `name_len` is 0 (in which case
    /// `name_ptr` is not read).
    pub fn name(self: *const EncodingInfo) []const u8 {
        const len = self.name_len;
        return if (len > 0) self.name_ptr[0..len] else "";
    }
};
|
||||
|
||||
/// Result of a decode call (`encoding_decode` / `encoding_decoder_decode`).
/// Field order and types are part of the C ABI.
pub const DecodeResult = extern struct {
    // Non-zero when the decode call reported errors.
    had_errors: u8,
    bytes_read: usize,
    bytes_written: usize,

    /// True when `had_errors` is non-zero.
    pub fn hadErrors(self: *const DecodeResult) bool {
        const flag = self.had_errors;
        return flag != 0;
    }
};
|
||||
|
||||
pub extern "c" fn encoding_for_label(
|
||||
label: [*]const u8,
|
||||
label_len: usize,
|
||||
) EncodingInfo;
|
||||
|
||||
pub extern "c" fn encoding_max_utf8_buffer_length(
|
||||
handle: *anyopaque,
|
||||
input_len: usize,
|
||||
) usize;
|
||||
|
||||
pub extern "c" fn encoding_decode(
|
||||
handle: *anyopaque,
|
||||
input: ?[*]const u8,
|
||||
input_len: usize,
|
||||
output: [*]u8,
|
||||
output_len: usize,
|
||||
is_last: u8,
|
||||
) DecodeResult;
|
||||
|
||||
// Streaming decoder API
|
||||
pub extern "c" fn encoding_decoder_new(handle: *anyopaque) ?*anyopaque;
|
||||
|
||||
pub extern "c" fn encoding_decoder_decode(
|
||||
decoder: *anyopaque,
|
||||
input: ?[*]const u8,
|
||||
input_len: usize,
|
||||
output: [*]u8,
|
||||
output_len: usize,
|
||||
is_last: u8,
|
||||
) DecodeResult;
|
||||
|
||||
pub extern "c" fn encoding_decoder_free(decoder: *anyopaque) void;
|
||||
|
||||
// Encoding API (UTF-8 to legacy encoding with NCR fallback)
|
||||
/// Result of `encoding_encode_with_ncr`. Field order and types are part of
/// the C ABI.
pub const EncodeResult = extern struct {
    // 0 means success; any other value is treated as failure by isSuccess.
    status: u8,
    bytes_read: usize,
    bytes_written: usize,

    /// True when `status` is 0.
    pub fn isSuccess(self: *const EncodeResult) bool {
        const code = self.status;
        return code == 0;
    }
};
|
||||
|
||||
pub extern "c" fn encoding_encode_with_ncr(
|
||||
handle: *anyopaque,
|
||||
input: ?[*]const u8,
|
||||
input_len: usize,
|
||||
output: [*]u8,
|
||||
output_capacity: usize,
|
||||
) EncodeResult;
|
||||
|
||||
pub extern "c" fn encoding_max_encode_buffer_length(
|
||||
handle: *anyopaque,
|
||||
input_len: usize,
|
||||
) usize;
|
||||
|
||||
@@ -288,7 +288,7 @@ fn collectLink(
|
||||
) !void {
|
||||
const rel = el.getAttributeSafe(comptime .wrap("rel")) orelse return;
|
||||
const raw_href = el.getAttributeSafe(comptime .wrap("href")) orelse return;
|
||||
const href = URL.resolve(arena, page.base(), raw_href, .{ .encode = true }) catch raw_href;
|
||||
const href = URL.resolve(arena, page.base(), raw_href, .{ .encoding = page.charset }) catch raw_href;
|
||||
|
||||
if (std.ascii.eqlIgnoreCase(rel, "alternate")) {
|
||||
try alternate.append(arena, .{
|
||||
|
||||
@@ -18,6 +18,10 @@
|
||||
testing.expectEqual("visible", document.visibilityState);
|
||||
testing.expectEqual(false, document.prerendering);
|
||||
testing.expectEqual(undefined, Document.prerendering);
|
||||
// characterSet should return canonical encoding name
|
||||
testing.expectEqual("UTF-8", document.characterSet);
|
||||
testing.expectEqual("UTF-8", document.charset);
|
||||
testing.expectEqual("UTF-8", document.inputEncoding);
|
||||
</script>
|
||||
|
||||
<script id=headAndbody>
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
testing.expectEqual('', d1.decode());
|
||||
testing.expectEqual('香料', d1.decode(new Uint8Array([233, 166, 153, 230, 150, 153])));
|
||||
testing.expectEqual('香料', d1.decode(new Uint8Array([0xEF, 0xBB, 0xBF, 233, 166, 153, 230, 150, 153])));
|
||||
testing.expectEqual('<27>4', d1.decode(new Uint8Array([249, 52])));
|
||||
|
||||
{
|
||||
const buffer = new ArrayBuffer(6);
|
||||
@@ -38,7 +37,7 @@
|
||||
}
|
||||
|
||||
let d2 = new TextDecoder('utf8', {fatal: true})
|
||||
testing.expectError('Error: InvalidUtf8', () => {
|
||||
testing.expectError('TypeError', () => {
|
||||
let data = new Uint8Array([241, 241, 159, 172]);
|
||||
d2.decode(data);
|
||||
});
|
||||
@@ -46,8 +45,8 @@
|
||||
|
||||
<script id=stream>
|
||||
let d3 = new TextDecoder();
|
||||
testing.expectEqual('', d2.decode(new Uint8Array([226, 153]), { stream: true }));
|
||||
testing.expectEqual('♥', d2.decode(new Uint8Array([165]), { stream: true }));
|
||||
testing.expectEqual('', d3.decode(new Uint8Array([226, 153]), { stream: true }));
|
||||
testing.expectEqual('♥', d3.decode(new Uint8Array([165]), { stream: true }));
|
||||
</script>
|
||||
|
||||
<script id=slice>
|
||||
@@ -60,5 +59,69 @@
|
||||
arr1[4] = 84;
|
||||
arr1[5] = 85;
|
||||
arr1[6] = 86;
|
||||
testing.expectEqual('RST', d3.decode(new Uint8Array(buf1, 2, 3)));
|
||||
let d4 = new TextDecoder();
|
||||
testing.expectEqual('RST', d4.decode(new Uint8Array(buf1, 2, 3)));
|
||||
</script>
|
||||
|
||||
<script id=legacy_encodings>
|
||||
// GBK (Chinese)
|
||||
let gbk = new TextDecoder('gbk');
|
||||
testing.expectEqual('gbk', gbk.encoding);
|
||||
testing.expectEqual('中文', gbk.decode(new Uint8Array([0xD6, 0xD0, 0xCE, 0xC4])));
|
||||
|
||||
// Shift_JIS (Japanese)
|
||||
let sjis = new TextDecoder('shift_jis');
|
||||
testing.expectEqual('shift_jis', sjis.encoding);
|
||||
testing.expectEqual('日本語', sjis.decode(new Uint8Array([0x93, 0xFA, 0x96, 0x7B, 0x8C, 0xEA])));
|
||||
|
||||
// EUC-JP (Japanese)
|
||||
let eucjp = new TextDecoder('euc-jp');
|
||||
testing.expectEqual('euc-jp', eucjp.encoding);
|
||||
|
||||
// ISO-8859-1 (Latin-1)
|
||||
let latin1 = new TextDecoder('iso-8859-1');
|
||||
testing.expectEqual('windows-1252', latin1.encoding); // ISO-8859-1 maps to windows-1252 per spec
|
||||
testing.expectEqual('café', latin1.decode(new Uint8Array([0x63, 0x61, 0x66, 0xe9])));
|
||||
|
||||
// Big5 (Traditional Chinese)
|
||||
let big5 = new TextDecoder('big5');
|
||||
testing.expectEqual('big5', big5.encoding);
|
||||
|
||||
// UTF-16LE
|
||||
let utf16le = new TextDecoder('utf-16le');
|
||||
testing.expectEqual('utf-16le', utf16le.encoding);
|
||||
testing.expectEqual('AB', utf16le.decode(new Uint8Array([0x41, 0x00, 0x42, 0x00])));
|
||||
|
||||
// UTF-16BE
|
||||
let utf16be = new TextDecoder('utf-16be');
|
||||
testing.expectEqual('utf-16be', utf16be.encoding);
|
||||
testing.expectEqual('AB', utf16be.decode(new Uint8Array([0x00, 0x41, 0x00, 0x42])));
|
||||
</script>
|
||||
|
||||
<script id=invalid_label>
|
||||
// Test invalid encoding label
|
||||
try {
|
||||
new TextDecoder('invalid-encoding');
|
||||
testing.fail();
|
||||
} catch (e) {
|
||||
testing.expectEqual(true, e.toString().includes('RangeError'));
|
||||
}
|
||||
|
||||
// Test 'replacement' encoding is rejected
|
||||
try {
|
||||
new TextDecoder('replacement');
|
||||
testing.fail();
|
||||
} catch (e) {
|
||||
testing.expectEqual(true, e.toString().includes('RangeError'));
|
||||
}
|
||||
</script>
|
||||
|
||||
<script id=label_variations>
|
||||
// Case insensitive
|
||||
let upper = new TextDecoder('UTF-8');
|
||||
testing.expectEqual('utf-8', upper.encoding);
|
||||
|
||||
// Leading/trailing whitespace
|
||||
let ws = new TextDecoder(' utf-8 ');
|
||||
testing.expectEqual('utf-8', ws.encoding);
|
||||
</script>
|
||||
|
||||
@@ -5,6 +5,9 @@
|
||||
<script id=TextEncoder>
|
||||
var encoder = new TextEncoder();
|
||||
testing.expectEqual('utf-8', encoder.encoding);
|
||||
testing.expectEqual([], Array.from(encoder.encode()));
|
||||
testing.expectEqual([110, 117, 108, 108], Array.from(encoder.encode(null)));
|
||||
testing.expectEqual([], Array.from(encoder.encode(undefined)));
|
||||
testing.expectEqual([226, 130, 172], Array.from(encoder.encode('€')));
|
||||
testing.expectEqual([111,118,101,114,32,57,48,48,48], encoder.encode("over 9000"));
|
||||
</script>
|
||||
|
||||
@@ -11,6 +11,10 @@
|
||||
testing.onload(() => {
|
||||
// GBK-encoded "中文" should be decoded to UTF-8
|
||||
testing.expectEqual('中文', iframe.contentDocument.getElementById('test').textContent);
|
||||
// document.characterSet should return canonical encoding name
|
||||
testing.expectEqual('GBK', iframe.contentDocument.characterSet);
|
||||
testing.expectEqual('GBK', iframe.contentDocument.charset);
|
||||
testing.expectEqual('GBK', iframe.contentDocument.inputEncoding);
|
||||
});
|
||||
}
|
||||
</script>
|
||||
@@ -73,3 +77,32 @@
|
||||
});
|
||||
}
|
||||
</script>
|
||||
|
||||
<script id="anchor_href_encoding_with_ncr">
|
||||
{
|
||||
// Test that anchor.href encodes unmappable characters as NCRs in non-UTF-8 documents.
|
||||
// When a character can't be represented in the document's encoding, it should become &#nnnnn;
|
||||
// Per WHATWG URL Standard, query strings use document encoding with NCR fallback.
|
||||
const iframe = document.createElement('iframe');
|
||||
document.body.appendChild(iframe);
|
||||
iframe.src = 'encoding/gbk.html';
|
||||
|
||||
testing.onload(() => {
|
||||
testing.expectEqual('GBK', iframe.contentDocument.characterSet);
|
||||
|
||||
// Test 1: U+3D34 (㴴) - a Han character NOT in GBK, should become NCR &#15668;
|
||||
const anchor = iframe.contentDocument.createElement('a');
|
||||
iframe.contentDocument.body.appendChild(anchor);
|
||||
anchor.href = 'http://example.com/?q=\u3D34';
|
||||
// The NCR &#15668; percent-encoded is %26%2315668%3B
|
||||
testing.expectEqual('http://example.com/?q=%26%2315668%3B', anchor.href);
|
||||
|
||||
// Test 2: U+4E2D (中) - IS in GBK, should encode to GBK bytes D6D0 then percent-encode
|
||||
const anchor2 = iframe.contentDocument.createElement('a');
|
||||
iframe.contentDocument.body.appendChild(anchor2);
|
||||
anchor2.href = 'http://example.com/?q=\u4E2D';
|
||||
// GBK encoding of 中 is D6 D0, percent-encoded as %D6%D0
|
||||
testing.expectEqual('http://example.com/?q=%D6%D0', anchor2.href);
|
||||
});
|
||||
}
|
||||
</script>
|
||||
|
||||
@@ -37,7 +37,13 @@
|
||||
|
||||
function expectError(expected, fn) {
|
||||
withError((err) => {
|
||||
expectEqual(true, err.toString().includes(expected));
|
||||
if (!err.toString().includes(expected)) {
|
||||
console.error(`Expecte error to contains: ${expected}, was: ${err.toString()}`);
|
||||
expectEqual(true, false);
|
||||
} else {
|
||||
// to record a successful case
|
||||
expectTrue(true);
|
||||
}
|
||||
}, fn);
|
||||
}
|
||||
|
||||
|
||||
@@ -1068,10 +1068,15 @@ pub const JsApi = struct {
|
||||
pub const hasFocus = bridge.function(Document.hasFocus, .{});
|
||||
|
||||
pub const prerendering = bridge.property(false, .{ .template = false });
|
||||
pub const characterSet = bridge.property("UTF-8", .{ .template = false });
|
||||
pub const charset = bridge.property("UTF-8", .{ .template = false });
|
||||
pub const inputEncoding = bridge.property("UTF-8", .{ .template = false });
|
||||
pub const characterSet = bridge.accessor(getCharacterSet, null, .{});
|
||||
pub const charset = bridge.accessor(getCharacterSet, null, .{});
|
||||
pub const inputEncoding = bridge.accessor(getCharacterSet, null, .{});
|
||||
pub const compatMode = bridge.property("CSS1Compat", .{ .template = false });
|
||||
|
||||
// Getter backing document.characterSet, document.charset and
// document.inputEncoding: returns the charset stored on the document's
// page, or "UTF-8" when the document has no page.
fn getCharacterSet(self: *const Document) []const u8 {
    if (self._page) |doc_page| {
        return doc_page.charset;
    }
    return "UTF-8";
}
|
||||
pub const referrer = bridge.property("", .{ .template = false });
|
||||
};
|
||||
|
||||
|
||||
@@ -22,6 +22,7 @@ const String = @import("../../string.zig").String;
|
||||
|
||||
const js = @import("../js/js.zig");
|
||||
const Page = @import("../Page.zig");
|
||||
const URL = @import("../URL.zig");
|
||||
const reflect = @import("../reflect.zig");
|
||||
|
||||
const EventTarget = @import("EventTarget.zig");
|
||||
@@ -511,6 +512,18 @@ pub fn ownerPage(self: *const Node, default: *Page) *Page {
|
||||
return doc._page orelse default;
|
||||
}
|
||||
|
||||
pub const ResolveURLOpts = struct {
|
||||
allocator: ?Allocator = null,
|
||||
};
|
||||
|
||||
// Resolve `url` against the base URL of the page that owns this node
// (falling back to `page` when the node has no owning page).
// The owning page's charset is passed as the encoding, so the query string
// is encoded with the document's charset (NCR fallback for unmappable chars).
// Memory comes from `opts.allocator` when provided, otherwise from
// `page.call_arena`.
pub fn resolveURL(self: *const Node, url: anytype, page: *Page, opts: ResolveURLOpts) ![:0]const u8 {
    const owner = self.ownerPage(page);
    return URL.resolve(
        opts.allocator orelse page.call_arena,
        owner.base(),
        url,
        .{ .encoding = owner.charset },
    );
}
|
||||
|
||||
pub fn isSameDocumentAs(self: *const Node, other: *const Node, page: *const Page) bool {
|
||||
// Get the root document for each node
|
||||
const self_doc = if (self._type == .document) self._type.document else self.ownerDocument(page);
|
||||
|
||||
@@ -39,12 +39,11 @@ pub fn asNode(self: *Anchor) *Node {
|
||||
}
|
||||
|
||||
pub fn getHref(self: *Anchor, page: *Page) ![]const u8 {
|
||||
const element = self.asElement();
|
||||
const href = element.getAttributeSafe(comptime .wrap("href")) orelse return "";
|
||||
const href = self.asElement().getAttributeSafe(comptime .wrap("href")) orelse return "";
|
||||
if (href.len == 0) {
|
||||
return "";
|
||||
}
|
||||
return URL.resolve(page.call_arena, page.base(), href, .{ .encode = true });
|
||||
return self.asNode().resolveURL(href, page, .{});
|
||||
}
|
||||
|
||||
pub fn setHref(self: *Anchor, value: []const u8, page: *Page) !void {
|
||||
@@ -203,7 +202,7 @@ fn getResolvedHref(self: *Anchor, page: *Page) !?[:0]const u8 {
|
||||
if (href.len == 0) {
|
||||
return null;
|
||||
}
|
||||
return try URL.resolve(page.call_arena, page.base(), href, .{});
|
||||
return try self.asNode().resolveURL(href, page, .{});
|
||||
}
|
||||
|
||||
pub const JsApi = struct {
|
||||
|
||||
@@ -97,7 +97,7 @@ pub fn getAction(self: *Form, page: *Page) ![]const u8 {
|
||||
if (action.len == 0) {
|
||||
return page.url;
|
||||
}
|
||||
return URL.resolve(page.call_arena, page.base(), action, .{ .encode = true });
|
||||
return element.asNode().resolveURL(action, page, .{});
|
||||
}
|
||||
|
||||
pub fn setAction(self: *Form, value: []const u8, page: *Page) !void {
|
||||
|
||||
@@ -48,9 +48,9 @@ pub fn getContentDocument(self: *const IFrame) ?*Document {
|
||||
return window._document;
|
||||
}
|
||||
|
||||
pub fn getSrc(self: *const IFrame, page: *Page) ![:0]const u8 {
|
||||
pub fn getSrc(self: *IFrame, page: *Page) ![:0]const u8 {
|
||||
if (self._src.len == 0) return "";
|
||||
return try URL.resolve(page.call_arena, page.base(), self._src, .{ .encode = true });
|
||||
return self.asNode().resolveURL(self._src, page, .{});
|
||||
}
|
||||
|
||||
pub fn setSrc(self: *IFrame, src: []const u8, page: *Page) !void {
|
||||
|
||||
@@ -40,9 +40,7 @@ pub fn getSrc(self: *const Image, page: *Page) ![]const u8 {
|
||||
if (src.len == 0) {
|
||||
return "";
|
||||
}
|
||||
|
||||
// Always resolve the src against the page URL
|
||||
return URL.resolve(page.call_arena, page.base(), src, .{ .encode = true });
|
||||
return element.asConstNode().resolveURL(src, page, .{});
|
||||
}
|
||||
|
||||
pub fn setSrc(self: *Image, value: []const u8, page: *Page) !void {
|
||||
|
||||
@@ -44,9 +44,7 @@ pub fn getHref(self: *Link, page: *Page) ![]const u8 {
|
||||
if (href.len == 0) {
|
||||
return "";
|
||||
}
|
||||
|
||||
// Always resolve the href against the page URL
|
||||
return URL.resolve(page.call_arena, page.base(), href, .{ .encode = true });
|
||||
return element.asNode().resolveURL(href, page, .{});
|
||||
}
|
||||
|
||||
pub fn setHref(self: *Link, value: []const u8, page: *Page) !void {
|
||||
|
||||
@@ -235,8 +235,7 @@ pub fn getSrc(self: *const Media, page: *Page) ![]const u8 {
|
||||
if (src.len == 0) {
|
||||
return "";
|
||||
}
|
||||
const URL = @import("../../URL.zig");
|
||||
return URL.resolve(page.call_arena, page.base(), src, .{ .encode = true });
|
||||
return element.asConstNode().resolveURL(src, page, .{});
|
||||
}
|
||||
|
||||
pub fn setSrc(self: *Media, value: []const u8, page: *Page) !void {
|
||||
|
||||
@@ -45,9 +45,9 @@ pub fn asNode(self: *Script) *Node {
|
||||
return self.asElement().asNode();
|
||||
}
|
||||
|
||||
pub fn getSrc(self: *const Script, page: *Page) ![]const u8 {
|
||||
pub fn getSrc(self: *Script, page: *Page) ![]const u8 {
|
||||
if (self._src.len == 0) return "";
|
||||
return try URL.resolve(page.call_arena, page.base(), self._src, .{ .encode = true });
|
||||
return self.asNode().resolveURL(self._src, page, .{});
|
||||
}
|
||||
|
||||
pub fn setSrc(self: *Script, src: []const u8, page: *Page) !void {
|
||||
|
||||
@@ -57,9 +57,7 @@ pub fn getPoster(self: *const Video, page: *Page) ![]const u8 {
|
||||
if (poster.len == 0) {
|
||||
return "";
|
||||
}
|
||||
|
||||
const URL = @import("../../URL.zig");
|
||||
return URL.resolve(page.call_arena, page.base(), poster, .{ .encode = true });
|
||||
return element.asConstNode().resolveURL(poster, page, .{});
|
||||
}
|
||||
|
||||
pub fn setPoster(self: *Video, value: []const u8, page: *Page) !void {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
// Copyright (C) 2023-2025 Lightpanda (Selecy SAS)
|
||||
// Copyright (C) 2023-2026 Lightpanda (Selecy SAS)
|
||||
//
|
||||
// Francis Bouvier <francis@lightpanda.io>
|
||||
// Pierre Tachoire <pierre@lightpanda.io>
|
||||
@@ -19,6 +19,7 @@
|
||||
const std = @import("std");
|
||||
const lp = @import("lightpanda");
|
||||
const js = @import("../../js/js.zig");
|
||||
const html5ever = @import("../../parser/html5ever.zig");
|
||||
|
||||
const Page = @import("../../Page.zig");
|
||||
const Session = @import("../../Session.zig");
|
||||
@@ -30,13 +31,11 @@ _rc: lp.RC(u8) = .{},
|
||||
_fatal: bool,
|
||||
_arena: Allocator,
|
||||
_ignore_bom: bool,
|
||||
_stream: std.ArrayList(u8),
|
||||
|
||||
const Label = enum {
|
||||
utf8,
|
||||
@"utf-8",
|
||||
@"unicode-1-1-utf-8",
|
||||
};
|
||||
_bom_seen: bool,
|
||||
_decoder: ?*anyopaque, // Persistent streaming decoder
|
||||
_encoding_handle: *anyopaque,
|
||||
_encoding_name: []const u8,
|
||||
_lowercase_name: []const u8, // Cached lowercase version of encoding name
|
||||
|
||||
const InitOpts = struct {
|
||||
fatal: bool = false,
|
||||
@@ -44,8 +43,17 @@ const InitOpts = struct {
|
||||
};
|
||||
|
||||
pub fn init(label_: ?[]const u8, opts_: ?InitOpts, page: *Page) !*TextDecoder {
|
||||
if (label_) |label| {
|
||||
_ = std.meta.stringToEnum(Label, label) orelse return error.RangeError;
|
||||
const label = label_ orelse "utf-8";
|
||||
|
||||
const info = html5ever.encoding_for_label(label.ptr, label.len);
|
||||
if (!info.isValid()) {
|
||||
return error.RangeError;
|
||||
}
|
||||
|
||||
// Check for "replacement" encoding - it's not usable for decoding per spec
|
||||
const enc_name = info.name();
|
||||
if (std.mem.eql(u8, enc_name, "replacement")) {
|
||||
return error.RangeError;
|
||||
}
|
||||
|
||||
const arena = try page.getArena(.{ .debug = "TextDecoder" });
|
||||
@@ -55,14 +63,21 @@ pub fn init(label_: ?[]const u8, opts_: ?InitOpts, page: *Page) !*TextDecoder {
|
||||
const self = try arena.create(TextDecoder);
|
||||
self.* = .{
|
||||
._arena = arena,
|
||||
._stream = .empty,
|
||||
._fatal = opts.fatal,
|
||||
._ignore_bom = opts.ignoreBOM,
|
||||
._encoding_handle = info.handle.?,
|
||||
._decoder = null,
|
||||
._bom_seen = false,
|
||||
._lowercase_name = "", // Will be lazily allocated
|
||||
._encoding_name = enc_name, // Points to static Rust memory
|
||||
};
|
||||
return self;
|
||||
}
|
||||
|
||||
/// Tears the decoder object down: frees the native streaming decoder when one
/// is still held, then hands this object's arena back to the session.
pub fn deinit(self: *TextDecoder, session: *Session) void {
    // Registered first so the arena is released last, after the native
    // decoder (if any) has been freed.
    defer session.releaseArena(self._arena);

    const decoder = self._decoder orelse return;
    html5ever.encoding_decoder_free(decoder);
}
|
||||
|
||||
@@ -82,34 +97,110 @@ pub fn getFatal(self: *const TextDecoder) bool {
|
||||
return self._fatal;
|
||||
}
|
||||
|
||||
/// Returns the encoding name in lowercase, as the spec requires.
///
/// The lowercase copy is allocated from this object's arena the first time it
/// is requested and cached in `_lowercase_name` for later calls.
pub fn getEncoding(self: *TextDecoder) ![]const u8 {
    if (self._lowercase_name.len == 0) {
        // Nothing cached yet: build the lowercase copy once.
        self._lowercase_name = try std.ascii.allocLowerString(self._arena, self._encoding_name);
    }
    return self._lowercase_name;
}
|
||||
|
||||
const DecodeOpts = struct {
|
||||
stream: bool = false,
|
||||
};
|
||||
|
||||
pub fn decode(self: *TextDecoder, input_: ?[]const u8, opts_: ?DecodeOpts) ![]const u8 {
|
||||
var input = input_ orelse return "";
|
||||
const opts: DecodeOpts = opts_ orelse .{};
|
||||
const input = input_ orelse "";
|
||||
|
||||
if (self._stream.items.len > 0) {
|
||||
try self._stream.appendSlice(self._arena, input);
|
||||
input = self._stream.items;
|
||||
}
|
||||
|
||||
if (self._fatal and !std.unicode.utf8ValidateSlice(input)) {
|
||||
if (opts.stream) {
|
||||
if (self._stream.items.len == 0) {
|
||||
try self._stream.appendSlice(self._arena, input);
|
||||
}
|
||||
return "";
|
||||
// For non-streaming calls, we don't need a persistent decoder
|
||||
if (!opts.stream) {
|
||||
// Reset decoder state if we had one
|
||||
if (self._decoder) |decoder| {
|
||||
html5ever.encoding_decoder_free(decoder);
|
||||
self._decoder = null;
|
||||
}
|
||||
} else if (self._decoder == null) {
|
||||
self._decoder = html5ever.encoding_decoder_new(self._encoding_handle);
|
||||
if (self._decoder == null) {
|
||||
return error.OutOfMemory;
|
||||
}
|
||||
return error.InvalidUtf8;
|
||||
}
|
||||
|
||||
self._stream.clearRetainingCapacity();
|
||||
if (self._ignore_bom == false and std.mem.startsWith(u8, input, &.{ 0xEF, 0xBB, 0xBF })) {
|
||||
return input[3..];
|
||||
return self._decode(input, self._decoder);
|
||||
}
|
||||
|
||||
/// Decodes `input` from this decoder's encoding into UTF-8.
///
/// When `streaming_decoder` is non-null the persistent native decoder is used
/// (with `is_last = 0`); otherwise a one-shot decode is performed (with
/// `is_last = 1`). The returned slice is allocated from the object's arena.
///
/// Returns `error.TypeError` when the decoder is in fatal mode and the native
/// layer reports malformed input; in that case any streaming decoder is freed
/// and the BOM state is reset.
fn _decode(self: *TextDecoder, input: []const u8, streaming_decoder: ?*anyopaque) ![]const u8 {
    // A one-shot call ends the current stream, so the next call must look for
    // a BOM again. Without this reset `_bom_seen` stays true after the first
    // call and BOM handling differs between otherwise identical calls.
    const one_shot = streaming_decoder == null;
    defer {
        if (one_shot) self._bom_seen = false;
    }

    if (input.len == 0) {
        return "";
    }

    // Upper bound on the UTF-8 output size for this many input bytes.
    const max_out = html5ever.encoding_max_utf8_buffer_length(
        self._encoding_handle,
        input.len,
    );
    if (max_out == 0) {
        return "";
    }

    const output = try self._arena.alloc(u8, max_out);

    // Decode using either the streaming or the one-shot native entry point.
    const result = if (streaming_decoder) |decoder|
        html5ever.encoding_decoder_decode(
            decoder,
            input.ptr,
            input.len,
            output.ptr,
            output.len,
            0, // is_last = false for streaming
        )
    else
        html5ever.encoding_decode(
            self._encoding_handle,
            input.ptr,
            input.len,
            output.ptr,
            output.len,
            1, // is_last = true for one-shot
        );

    // Fatal mode: malformed input is an error rather than a replacement char.
    if (self._fatal and result.hadErrors()) {
        if (streaming_decoder != null) {
            // Drop the streaming decoder so the next call starts clean.
            if (self._decoder) |decoder| {
                html5ever.encoding_decoder_free(decoder);
                self._decoder = null;
            }
        }
        self._bom_seen = false;
        return error.TypeError;
    }

    var decoded: []const u8 = output[0..result.bytes_written];

    // Only the first decoded character of a stream can be a BOM. Wait until
    // some output exists before marking that position as passed; a chunk that
    // produced nothing (e.g. a partial sequence) must not consume the check.
    if (decoded.len > 0 and !self._bom_seen and !self._ignore_bom) {
        decoded = stripBom(decoded);
        self._bom_seen = true;
    }

    return decoded;
}
|
||||
|
||||
/// Removes one leading U+FEFF (byte order mark) from already-decoded UTF-8
/// text. Input without a leading BOM is returned unchanged; at most one BOM
/// is removed.
fn stripBom(data: []const u8) []const u8 {
    // U+FEFF encoded as UTF-8 is the three bytes EF BB BF.
    const bom = [_]u8{ 0xEF, 0xBB, 0xBF };
    const has_bom = data.len >= bom.len and std.mem.eql(u8, data[0..bom.len], &bom);
    return if (has_bom) data[bom.len..] else data;
}
|
||||
|
||||
pub const JsApi = struct {
|
||||
@@ -123,7 +214,7 @@ pub const JsApi = struct {
|
||||
|
||||
pub const constructor = bridge.constructor(TextDecoder.init, .{});
|
||||
pub const decode = bridge.function(TextDecoder.decode, .{});
|
||||
pub const encoding = bridge.property("utf-8", .{ .template = false });
|
||||
pub const encoding = bridge.accessor(TextDecoder.getEncoding, null, .{});
|
||||
pub const fatal = bridge.accessor(TextDecoder.getFatal, null, .{});
|
||||
pub const ignoreBOM = bridge.accessor(TextDecoder.getIgnoreBOM, null, .{});
|
||||
};
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
// Copyright (C) 2023-2025 Lightpanda (Selecy SAS)
|
||||
// Copyright (C) 2023-2026 Lightpanda (Selecy SAS)
|
||||
//
|
||||
// Francis Bouvier <francis@lightpanda.io>
|
||||
// Pierre Tachoire <pierre@lightpanda.io>
|
||||
@@ -26,12 +26,23 @@ pub fn init() TextEncoder {
|
||||
return .{};
|
||||
}
|
||||
|
||||
pub fn encode(_: *const TextEncoder, v: []const u8) !js.TypedArray(u8) {
|
||||
if (!std.unicode.utf8ValidateSlice(v)) {
|
||||
pub fn encode(_: *const TextEncoder, v_: ?js.Value) !js.TypedArray(u8) {
|
||||
const v = v_ orelse return .{ .values = "" };
|
||||
|
||||
if (v.isUndefined()) {
|
||||
return .{ .values = "" };
|
||||
}
|
||||
|
||||
if (v.isNull()) {
|
||||
return .{ .values = "null" };
|
||||
}
|
||||
|
||||
const str = try v.toStringSlice();
|
||||
if (!std.unicode.utf8ValidateSlice(str)) {
|
||||
return error.InvalidUtf8;
|
||||
}
|
||||
|
||||
return .{ .values = v };
|
||||
return .{ .values = str };
|
||||
}
|
||||
|
||||
pub const JsApi = struct {
|
||||
|
||||
@@ -108,7 +108,7 @@ pub fn init(url: []const u8, protocols_: ?[]const u8, page: *Page) !*WebSocket {
|
||||
const arena = try page.getArena(.{ .debug = "WebSocket" });
|
||||
errdefer page.releaseArena(arena);
|
||||
|
||||
const resolved_url = try URL.resolve(arena, page.base(), url, .{ .always_dupe = true, .encode = true });
|
||||
const resolved_url = try URL.resolve(arena, page.base(), url, .{ .always_dupe = true, .encoding = page.charset });
|
||||
|
||||
const http_client = page._session.browser.http_client;
|
||||
const conn = http_client.network.newConnection() orelse {
|
||||
|
||||
@@ -210,7 +210,7 @@ pub fn open(self: *XMLHttpRequest, method_: []const u8, url: [:0]const u8) !void
|
||||
|
||||
const page = self._page;
|
||||
self._method = try parseMethod(method_);
|
||||
self._url = try URL.resolve(self._arena, page.base(), url, .{ .always_dupe = true, .encode = true });
|
||||
self._url = try URL.resolve(self._arena, page.base(), url, .{ .always_dupe = true, .encoding = page.charset });
|
||||
try self.stateChanged(.opened, page);
|
||||
}
|
||||
|
||||
|
||||
@@ -286,7 +286,7 @@ fn navigate(cmd: *CDP.Command) !void {
|
||||
page = try session.replacePage();
|
||||
}
|
||||
|
||||
const encoded_url = try URL.ensureEncoded(page.call_arena, params.url);
|
||||
const encoded_url = try URL.ensureEncoded(page.call_arena, params.url, "UTF-8");
|
||||
try page.navigate(encoded_url, .{
|
||||
.reason = .address_bar,
|
||||
.cdp_id = cmd.input.id,
|
||||
|
||||
@@ -220,7 +220,7 @@ fn createTarget(cmd: *CDP.Command) !void {
|
||||
}
|
||||
|
||||
if (!std.mem.eql(u8, "about:blank", params.url)) {
|
||||
const encoded_url = try URL.ensureEncoded(page.call_arena, params.url);
|
||||
const encoded_url = try URL.ensureEncoded(page.call_arena, params.url, "UTF-8");
|
||||
try page.navigate(
|
||||
encoded_url,
|
||||
.{ .reason = .address_bar, .kind = .{ .push = null } },
|
||||
|
||||
@@ -151,6 +151,303 @@ pub extern "C" fn html5ever_parse_document_with_encoding(
|
||||
.one(StrTendril::from(decoded.as_ref()));
|
||||
}
|
||||
|
||||
// === Encoding API for TextDecoder ===
|
||||
|
||||
/// Result of an encoding label lookup, returned by value across the C ABI.
/// When `found` is 0, `handle` and `name_ptr` are null and `name_len` is 0.
|
||||
#[repr(C)]
|
||||
pub struct EncodingInfo {
|
||||
/// 0 = not found, 1 = found
|
||||
pub found: u8,
|
||||
/// Opaque handle to the encoding (actually &'static Encoding)
|
||||
pub handle: *const c_void,
|
||||
/// Length in bytes of the canonical name pointed to by `name_ptr`
|
||||
pub name_len: usize,
|
||||
/// Pointer to the canonical encoding name as returned by `Encoding::name()`
|
||||
/// (static, not NUL-terminated). The name keeps its canonical casing, e.g.
|
||||
/// "UTF-8"; callers that need a lowercase name must lowercase it themselves.
|
||||
pub name_ptr: *const c_uchar,
|
||||
}
|
||||
|
||||
/// Look up an encoding by its label (case-insensitive, whitespace-trimmed)
|
||||
#[no_mangle]
|
||||
pub extern "C" fn encoding_for_label(
|
||||
label: *const c_uchar,
|
||||
label_len: usize,
|
||||
) -> EncodingInfo {
|
||||
if label.is_null() || label_len == 0 {
|
||||
return EncodingInfo {
|
||||
found: 0,
|
||||
name_len: 0,
|
||||
handle: std::ptr::null(),
|
||||
name_ptr: std::ptr::null(),
|
||||
};
|
||||
}
|
||||
|
||||
let label_bytes = unsafe { std::slice::from_raw_parts(label, label_len) };
|
||||
|
||||
match Encoding::for_label(label_bytes) {
|
||||
Some(encoding) => {
|
||||
let name = encoding.name();
|
||||
EncodingInfo {
|
||||
found: 1,
|
||||
name_len: name.len(),
|
||||
name_ptr: name.as_ptr(),
|
||||
handle: encoding as *const _ as *const c_void,
|
||||
}
|
||||
}
|
||||
None => EncodingInfo {
|
||||
found: 0,
|
||||
name_len: 0,
|
||||
name_ptr: std::ptr::null(),
|
||||
handle: std::ptr::null(),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate maximum UTF-8 buffer size needed for decoding
|
||||
#[no_mangle]
|
||||
pub extern "C" fn encoding_max_utf8_buffer_length(
|
||||
handle: *const c_void,
|
||||
input_len: usize,
|
||||
) -> usize {
|
||||
if handle.is_null() {
|
||||
return 0;
|
||||
}
|
||||
let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) };
|
||||
let decoder = encoding.new_decoder();
|
||||
decoder.max_utf8_buffer_length(input_len).unwrap_or(0)
|
||||
}
|
||||
|
||||
/// Result of a decoding operation, returned by value across the C ABI by
/// `encoding_decode` and `encoding_decoder_decode`.
|
||||
#[repr(C)]
|
||||
pub struct DecodeResult {
|
||||
/// 0 = no errors, 1 = had malformed sequences (replaced with U+FFFD).
|
||||
/// Also set to 1, with both counters at 0, when the call was rejected
|
||||
/// because the handle/decoder or the output pointer was null.
|
||||
pub had_errors: u8,
|
||||
/// Number of input bytes consumed
|
||||
pub bytes_read: usize,
|
||||
/// Number of UTF-8 bytes written to output buffer
|
||||
pub bytes_written: usize,
|
||||
}
|
||||
|
||||
/// Decode bytes from source encoding to UTF-8
|
||||
/// For streaming, set is_last=0; for final/complete decode, set is_last=1
|
||||
#[no_mangle]
|
||||
pub extern "C" fn encoding_decode(
|
||||
handle: *const c_void,
|
||||
input: *const c_uchar,
|
||||
input_len: usize,
|
||||
output: *mut c_uchar,
|
||||
output_len: usize,
|
||||
is_last: u8,
|
||||
) -> DecodeResult {
|
||||
if handle.is_null() || output.is_null() {
|
||||
return DecodeResult {
|
||||
had_errors: 1,
|
||||
bytes_read: 0,
|
||||
bytes_written: 0,
|
||||
};
|
||||
}
|
||||
|
||||
let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) };
|
||||
let input_bytes = if input.is_null() || input_len == 0 {
|
||||
&[]
|
||||
} else {
|
||||
unsafe { std::slice::from_raw_parts(input, input_len) }
|
||||
};
|
||||
let output_slice = unsafe { std::slice::from_raw_parts_mut(output, output_len) };
|
||||
|
||||
let mut decoder = encoding.new_decoder();
|
||||
let last = is_last != 0;
|
||||
|
||||
let (result, bytes_read, bytes_written, had_errors) =
|
||||
decoder.decode_to_utf8(input_bytes, output_slice, last);
|
||||
|
||||
// If output buffer was too small, we still report what we could process
|
||||
let _ = result; // CoderResult::InputEmpty or CoderResult::OutputFull
|
||||
|
||||
DecodeResult {
|
||||
had_errors: if had_errors { 1 } else { 0 },
|
||||
bytes_read,
|
||||
bytes_written,
|
||||
}
|
||||
}
|
||||
|
||||
// === Streaming Decoder API ===
|
||||
|
||||
use encoding_rs::Decoder;
|
||||
|
||||
/// Create a streaming decoder that maintains state across calls
|
||||
#[no_mangle]
|
||||
pub extern "C" fn encoding_decoder_new(handle: *const c_void) -> *mut c_void {
|
||||
if handle.is_null() {
|
||||
return std::ptr::null_mut();
|
||||
}
|
||||
let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) };
|
||||
let decoder = Box::new(encoding.new_decoder());
|
||||
Box::into_raw(decoder) as *mut c_void
|
||||
}
|
||||
|
||||
/// Decode using a streaming decoder (maintains state for incomplete sequences)
|
||||
#[no_mangle]
|
||||
pub extern "C" fn encoding_decoder_decode(
|
||||
decoder_ptr: *mut c_void,
|
||||
input: *const c_uchar,
|
||||
input_len: usize,
|
||||
output: *mut c_uchar,
|
||||
output_len: usize,
|
||||
is_last: u8,
|
||||
) -> DecodeResult {
|
||||
if decoder_ptr.is_null() || output.is_null() {
|
||||
return DecodeResult {
|
||||
had_errors: 1,
|
||||
bytes_read: 0,
|
||||
bytes_written: 0,
|
||||
};
|
||||
}
|
||||
|
||||
let decoder: &mut Decoder = unsafe { &mut *(decoder_ptr as *mut Decoder) };
|
||||
let input_bytes = if input.is_null() || input_len == 0 {
|
||||
&[]
|
||||
} else {
|
||||
unsafe { std::slice::from_raw_parts(input, input_len) }
|
||||
};
|
||||
let output_slice = unsafe { std::slice::from_raw_parts_mut(output, output_len) };
|
||||
|
||||
let last = is_last != 0;
|
||||
let (result, bytes_read, bytes_written, had_errors) =
|
||||
decoder.decode_to_utf8(input_bytes, output_slice, last);
|
||||
|
||||
let _ = result;
|
||||
|
||||
DecodeResult {
|
||||
had_errors: if had_errors { 1 } else { 0 },
|
||||
bytes_read,
|
||||
bytes_written,
|
||||
}
|
||||
}
|
||||
|
||||
/// Free a streaming decoder
|
||||
#[no_mangle]
|
||||
pub extern "C" fn encoding_decoder_free(decoder_ptr: *mut c_void) {
|
||||
if !decoder_ptr.is_null() {
|
||||
unsafe {
|
||||
drop(Box::from_raw(decoder_ptr as *mut Decoder));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// === Encoding API (UTF-8 to legacy encoding with NCR fallback) ===
|
||||
|
||||
/// Result of an encoding operation, returned by value across the C ABI by
/// `encoding_encode_with_ncr`.
|
||||
#[repr(C)]
|
||||
pub struct EncodeResult {
|
||||
/// 0 = success, 1 = failure. Failure covers an output buffer that is too
|
||||
/// small, a null handle or output pointer, and input that is not valid UTF-8.
|
||||
pub status: u8,
|
||||
/// Number of input bytes consumed
|
||||
pub bytes_read: usize,
|
||||
/// Number of bytes written to output buffer
|
||||
pub bytes_written: usize,
|
||||
}
|
||||
|
||||
/// Encode UTF-8 to a legacy encoding, replacing unencodable characters with
|
||||
/// HTML decimal numeric character references (&#codepoint;).
|
||||
///
|
||||
/// This is used for URL query string encoding per WHATWG URL spec.
|
||||
/// encoding_rs's encode_from_utf8 already produces NCRs for unmappable chars.
|
||||
#[no_mangle]
|
||||
pub extern "C" fn encoding_encode_with_ncr(
|
||||
handle: *const c_void,
|
||||
input: *const c_uchar,
|
||||
input_len: usize,
|
||||
output: *mut c_uchar,
|
||||
output_capacity: usize,
|
||||
) -> EncodeResult {
|
||||
if handle.is_null() || output.is_null() {
|
||||
return EncodeResult {
|
||||
status: 1,
|
||||
bytes_read: 0,
|
||||
bytes_written: 0,
|
||||
};
|
||||
}
|
||||
|
||||
let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) };
|
||||
|
||||
let input_str = if input.is_null() || input_len == 0 {
|
||||
""
|
||||
} else {
|
||||
let bytes = unsafe { std::slice::from_raw_parts(input, input_len) };
|
||||
match std::str::from_utf8(bytes) {
|
||||
Ok(s) => s,
|
||||
Err(_) => {
|
||||
return EncodeResult {
|
||||
status: 1,
|
||||
bytes_read: 0,
|
||||
bytes_written: 0,
|
||||
};
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// For UTF-8 encoding, just copy directly (no NCR needed)
|
||||
if encoding == encoding_rs::UTF_8 {
|
||||
if input_len > output_capacity {
|
||||
return EncodeResult {
|
||||
bytes_read: 0,
|
||||
bytes_written: 0,
|
||||
status: 1,
|
||||
};
|
||||
}
|
||||
let output_slice = unsafe { std::slice::from_raw_parts_mut(output, output_capacity) };
|
||||
output_slice[..input_len].copy_from_slice(input_str.as_bytes());
|
||||
return EncodeResult {
|
||||
bytes_read: input_len,
|
||||
bytes_written: input_len,
|
||||
status: 0,
|
||||
};
|
||||
}
|
||||
|
||||
let output_slice = unsafe { std::slice::from_raw_parts_mut(output, output_capacity) };
|
||||
let mut encoder = encoding.new_encoder();
|
||||
|
||||
// encode_from_utf8 automatically produces NCRs for unmappable characters
|
||||
let (result, bytes_read, bytes_written, _had_unmappables) =
|
||||
encoder.encode_from_utf8(input_str, output_slice, true);
|
||||
|
||||
match result {
|
||||
encoding_rs::CoderResult::InputEmpty => EncodeResult {
|
||||
bytes_read,
|
||||
bytes_written,
|
||||
status: 0,
|
||||
},
|
||||
encoding_rs::CoderResult::OutputFull => EncodeResult {
|
||||
bytes_read,
|
||||
bytes_written,
|
||||
status: 1,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate maximum output buffer size needed for encoding with NCR fallback.
|
||||
/// Worst case: every character becomes &#codepoint; where codepoint is up to 7 digits.
|
||||
#[no_mangle]
|
||||
pub extern "C" fn encoding_max_encode_buffer_length(
|
||||
handle: *const c_void,
|
||||
input_len: usize,
|
||||
) -> usize {
|
||||
if handle.is_null() {
|
||||
return 0;
|
||||
}
|
||||
let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) };
|
||||
let encoder = encoding.new_encoder();
|
||||
// This returns the max buffer size accounting for NCR expansion
|
||||
encoder
|
||||
.max_buffer_length_from_utf8_if_no_unmappables(input_len)
|
||||
.map(|len| {
|
||||
// Add extra space for potential NCRs (each char could become &#nnnnnn; = 10 bytes)
|
||||
// But realistically, most chars are mappable, so add 2x as safety margin
|
||||
len.saturating_mul(2)
|
||||
})
|
||||
.unwrap_or(input_len * 10)
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn html5ever_parse_fragment(
|
||||
html: *mut c_uchar,
|
||||
|
||||
@@ -107,7 +107,7 @@ pub fn fetch(app: *App, url: [:0]const u8, opts: FetchOpts) !void {
|
||||
// }
|
||||
// }
|
||||
|
||||
const encoded_url = try URL.ensureEncoded(page.call_arena, url);
|
||||
const encoded_url = try URL.ensureEncoded(page.call_arena, url, "UTF-8");
|
||||
_ = try page.navigate(encoded_url, .{
|
||||
.reason = .address_bar,
|
||||
.kind = .{ .push = null },
|
||||
|
||||
Reference in New Issue
Block a user