mirror of
https://github.com/lightpanda-io/browser.git
synced 2026-06-11 01:25:53 -04:00
Initial idn support
Links to libidn2 and builds libcurl with it. This makes libcurl, and by extension the browser, work with internationalized domain names, e.g. zig build run -- fetch "https://räksmörgås.se/" With it available, we can use it in our WebAPIs, which should also support these domains, e.g.: testing.expectEqual('xn--rksmrgs-5wao1o.se', new URL('https://räksmörgås.se').hostname); There is more integration to be done here, but this is a first step. Claude wrote all of the build.zig code. I don't have a strong opinion about this feature; I just dislike that our WPT /url/* tests are at 1704 / 9095, and this is the biggest chunk (although this specific commit just does the basic integration and probably won't fix too many WPT cases directly).
This commit is contained in:
160
build.zig
160
build.zig
@@ -342,6 +342,9 @@ fn linkCurl(b: *Build, mod: *Build.Module, is_tsan: bool) !void {
|
||||
const boringssl = buildBoringSsl(b, target, mod.optimize.?);
|
||||
for (boringssl) |lib| curl.root_module.linkLibrary(lib);
|
||||
|
||||
const libidn2 = buildLibidn2(b, target, mod.optimize.?, is_tsan);
|
||||
curl.root_module.linkLibrary(libidn2);
|
||||
|
||||
switch (target.result.os.tag) {
|
||||
.macos => {
|
||||
// needed for proxying on mac
|
||||
@@ -496,6 +499,158 @@ fn buildNghttp2(b: *Build, target: Build.ResolvedTarget, optimize: std.builtin.O
|
||||
return lib;
|
||||
}
|
||||
|
||||
/// Build libidn2, together with the gnulib and unistring sources it needs, as
/// the `idn2` library from the `libidn2` package dependency.
///
/// `is_tsan` is forwarded to the module's `sanitize_thread` setting. The
/// returned compile step also installs `idn2.h`.
fn buildLibidn2(
    b: *Build,
    target: Build.ResolvedTarget,
    optimize: std.builtin.OptimizeMode,
    is_tsan: bool,
) *Build.Step.Compile {
    const dep = b.dependency("libidn2", .{});

    const mod = b.createModule(.{
        .target = target,
        .optimize = optimize,
        .link_libc = true,
        .sanitize_thread = is_tsan,
    });

    // libidn2's autoconf + gnulib stack wants a config.h carrying hundreds of
    // HAVE_* / _GL_ATTRIBUTE_* defines (gnulib-common.m4 alone contributes
    // roughly 800 lines of attribute detection through AH_VERBATIM). A single
    // autoconf-generated config.h is vendored instead of recreating that
    // machinery in the Zig build system.
    mod.addIncludePath(b.path("vendor/libidn2"));

    // Render the gnulib-style `.in.h` templates. Their @VAR@ placeholders are
    // either DLL-visibility markers (empty for static POSIX) or
    // HAVE_UNISTRING_WOE32DLL_H (0).
    const unistring_templates = [_][]const u8{ "unitypes", "unistr", "uniconv", "unictype", "uninorm" };
    for (unistring_templates) |template| {
        mod.addConfigHeader(renderUnistringHeader(b, dep, template));
    }

    mod.addIncludePath(dep.path("lib"));
    mod.addIncludePath(dep.path("unistring"));
    // gl/ carries the gnulib helpers. Only the malloca and version-etc headers
    // are referenced by the sources compiled below, so the full gl/ shim layer
    // (system header replacements) is not needed.
    mod.addIncludePath(dep.path("gl"));

    const lib = b.addLibrary(.{ .name = "idn2", .root_module = mod });
    lib.installHeader(dep.path("lib/idn2.h"), "idn2.h");

    if (target.result.os.tag.isDarwin()) {
        // unistring's striconveh.c calls the real iconv_* functions. On macOS
        // those live in libiconv, separate from libSystem, so link it
        // explicitly; on glibc Linux iconv is part of libc itself.
        mod.linkSystemLibrary("iconv", .{});
    }

    // Flag list shared by the gnulib and unistring translation units.
    const config_flags: []const []const u8 = &.{"-DHAVE_CONFIG_H"};

    // libidn2 proper.
    lib.addCSourceFiles(.{
        .root = dep.path("lib"),
        .flags = &.{ "-DHAVE_CONFIG_H", "-DIDN2_STATIC" },
        .files = &.{
            "bidi.c",
            "context.c",
            "data.c",
            "decode.c",
            "error.c",
            "free.c",
            "idna.c",
            "lookup.c",
            "punycode.c",
            "register.c",
            "tables.c",
            "tr46map.c",
            "version.c",
        },
    });

    // gnulib helpers: malloca.c provides striconveha's stack-or-heap
    // allocator; strverscmp is a glibc extension absent on macOS that
    // lib/version.c needs.
    lib.addCSourceFiles(.{
        .root = dep.path("gl"),
        .flags = config_flags,
        .files = &.{
            "malloca.c",
            "strverscmp.c",
        },
    });

    // The subset of unistring that libidn2 is compiled with.
    lib.addCSourceFiles(.{
        .root = dep.path("unistring"),
        .flags = config_flags,
        .files = &.{
            "c-ctype.c",
            "c-strcasecmp.c",
            "c-strncasecmp.c",
            "free.c",
            "iconv.c",
            "iconv_close.c",
            "iconv_open.c",
            "localcharset.c",
            "stdlib.c",
            "striconveh.c",
            "striconveha.c",
            "unistd.c",
            "uniconv/u8-conv-from-enc.c",
            "uniconv/u8-strconv-from-enc.c",
            "uniconv/u8-strconv-from-locale.c",
            "uniconv/u8-strconv-to-enc.c",
            "uniconv/u8-strconv-to-locale.c",
            "unictype/bidi_of.c",
            "unictype/categ_M.c",
            "unictype/categ_none.c",
            "unictype/categ_of.c",
            "unictype/categ_test.c",
            "unictype/combiningclass.c",
            "unictype/joiningtype_of.c",
            "unictype/scripts.c",
            "uninorm/canonical-decomposition.c",
            "uninorm/composition.c",
            "uninorm/decompose-internal.c",
            "uninorm/decomposition-table.c",
            "uninorm/nfc.c",
            "uninorm/nfd.c",
            "uninorm/u32-normalize.c",
            "unistr/u32-cmp.c",
            "unistr/u32-cpy-alloc.c",
            "unistr/u32-cpy.c",
            "unistr/u32-mbtouc-unsafe.c",
            "unistr/u32-strlen.c",
            "unistr/u32-to-u8.c",
            "unistr/u32-uctomb.c",
            "unistr/u8-check.c",
            "unistr/u8-mblen.c",
            "unistr/u8-mbtouc.c",
            "unistr/u8-mbtouc-aux.c",
            "unistr/u8-mbtouc-unsafe.c",
            "unistr/u8-mbtouc-unsafe-aux.c",
            "unistr/u8-mbtoucr.c",
            "unistr/u8-prev.c",
            "unistr/u8-strlen.c",
            "unistr/u8-to-u32.c",
            "unistr/u8-uctomb.c",
            "unistr/u8-uctomb-aux.c",
        },
    });

    return lib;
}
|
||||
|
||||
/// Process one of unistring's `.in.h` template headers into a real `.h`.
///
/// The template `unistring/{name}.in.h` inside `dep` is read while the build
/// graph is being constructed and scanned for `@VAR@` placeholders. Each
/// distinct placeholder is registered exactly once on the returned
/// autoconf-style ConfigHeader step, whose output is named `{name}.h`:
/// `HAVE_UNISTRING_WOE32DLL_H` gets the value 0 and every other placeholder
/// (DLL-visibility markers, empty for static POSIX builds) gets "".
///
/// Panics, naming the underlying error, if the template cannot be opened or
/// read, or if it exceeds the 4 MiB read limit.
fn renderUnistringHeader(b: *Build, dep: *Build.Dependency, name: []const u8) *Build.Step.ConfigHeader {
    const in_rel = b.fmt("unistring/{s}.in.h", .{name});
    const out_name = b.fmt("{s}.h", .{name});
    const lazy = dep.path(in_rel);
    const path = lazy.getPath3(b, null);

    const file = path.root_dir.handle.openFile(path.sub_path, .{}) catch |err| {
        std.debug.panic("openFile {s}: {s}", .{ path.sub_path, @errorName(err) });
    };
    defer file.close();
    // Report the real failure: a read error or an oversized file is not OOM.
    const contents = file.readToEndAlloc(b.allocator, 4 << 20) catch |err| {
        std.debug.panic("readToEndAlloc {s}: {s}", .{ path.sub_path, @errorName(err) });
    };

    const ch = b.addConfigHeader(.{
        .include_path = out_name,
        .style = .{ .autoconf_at = lazy },
    }, .{});

    // Keys borrow from `contents`, which this function never frees.
    var seen = std.StringHashMap(void).init(b.allocator);
    var i: usize = 0;
    while (std.mem.indexOfScalarPos(u8, contents, i, '@')) |s| {
        const a = s + 1;
        const e = std.mem.indexOfScalarPos(u8, contents, a, '@') orelse break;
        const var_name = contents[a..e];
        if (!isAtConfigName(var_name)) {
            // Stray '@' (e.g. an email address in a comment); advance past it
            // alone so we don't mis-pair with a later '@'.
            i = a;
            continue;
        }
        // Resume scanning after the closing '@' of this placeholder.
        i = e + 1;

        const gop = seen.getOrPut(var_name) catch @panic("OOM");
        if (gop.found_existing) continue;

        // Copy the name only the first time it is seen.
        const owned = b.allocator.dupe(u8, var_name) catch @panic("OOM");
        if (std.mem.eql(u8, var_name, "HAVE_UNISTRING_WOE32DLL_H")) {
            ch.addValue(owned, c_int, 0);
        } else {
            ch.addValue(owned, []const u8, "");
        }
    }
    return ch;
}
|
||||
|
||||
/// Report whether `s` looks like an autoconf `@VAR@` placeholder name:
/// non-empty, made only of `A`-`Z`, `0`-`9` and `_`, and not starting with a
/// digit.
fn isAtConfigName(s: []const u8) bool {
    if (s.len == 0) return false;

    // The leading character may be an upper-case letter or '_' only.
    switch (s[0]) {
        'A'...'Z', '_' => {},
        else => return false,
    }

    // Digits become acceptable from the second character onwards.
    for (s[1..]) |ch| {
        switch (ch) {
            'A'...'Z', '0'...'9', '_' => {},
            else => return false,
        }
    }
    return true;
}
|
||||
|
||||
fn buildCurl(
|
||||
b: *Build,
|
||||
target: Build.ResolvedTarget,
|
||||
@@ -572,6 +727,11 @@ fn buildCurl(
|
||||
._FILE_OFFSET_BITS = 64,
|
||||
|
||||
.USE_IPV6 = true,
|
||||
// Route IDN hostnames through libidn2 (vendored, see buildLibidn2).
|
||||
// Without this, libcurl ships UTF-8 host bytes to SNI/cert validation
|
||||
// and breaks for non-ASCII hostnames like räksmörgås.se.
|
||||
.HAVE_LIBIDN2 = true,
|
||||
.HAVE_IDN2_H = true,
|
||||
.CURL_OS = switch (os) {
|
||||
.linux => if (is_android) "\"android\"" else "\"linux\"",
|
||||
else => std.fmt.allocPrint(b.allocator, "\"{s}\"", .{@tagName(os)}) catch @panic("OOM"),
|
||||
|
||||
@@ -34,6 +34,10 @@
|
||||
.url = "https://github.com/allyourcodebase/sqlite3/archive/8f840560eae88ab66668c6827c64ffbd0d74ef37.tar.gz",
|
||||
.hash = "sqlite3-3.51.0-DMxLWssOAABZ8cAvU_LfBIbp0kZjm824PU8sSLXpEDdr",
|
||||
},
|
||||
.libidn2 = .{
|
||||
.url = "https://ftp.gnu.org/gnu/libidn/libidn2-2.3.8.tar.gz",
|
||||
.hash = "N-V-__8AABGOuAC_dhAN07kfoP4dycCFi8Bka4O-tuhriNH8",
|
||||
},
|
||||
},
|
||||
.paths = .{""},
|
||||
}
|
||||
|
||||
@@ -17,6 +17,8 @@
|
||||
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
const std = @import("std");
|
||||
const idna = @import("../sys/idna.zig");
|
||||
|
||||
const Allocator = std.mem.Allocator;
|
||||
|
||||
pub const ResolveOpts = struct {
|
||||
@@ -190,11 +192,35 @@ pub fn resolve(allocator: Allocator, base: [:0]const u8, source_path: anytype, o
|
||||
}
|
||||
|
||||
fn processResolved(allocator: Allocator, url: [:0]const u8, opts: ResolveOpts) ![:0]const u8 {
|
||||
const encoding = opts.encoding orelse return url;
|
||||
const encoding = opts.encoding orelse return ensureHostAscii(allocator, url);
|
||||
return ensureEncoded(allocator, url, encoding);
|
||||
}
|
||||
|
||||
pub fn ensureEncoded(allocator: Allocator, url: [:0]const u8, encoding: []const u8) ![:0]const u8 {
|
||||
/// IDNA-only pass: converts a non-ASCII host (`räksmörgås.se`) to its
/// punycode form (`xn--rksmrgs-5wao1o.se`) and leaves everything else alone.
///
/// Returns `url` itself when the host is empty or `idna.needsAscii` reports
/// that no conversion is needed. Otherwise returns a newly allocated,
/// NUL-terminated copy of `url` with the host replaced. Errors from
/// `idna.toAscii` and from allocation are propagated.
fn ensureHostAscii(allocator: Allocator, url: [:0]const u8) ![:0]const u8 {
    const hostname = getHostname(url);
    if (hostname.len == 0 or !idna.needsAscii(hostname)) {
        return url;
    }

    const ascii = try idna.toAscii(allocator, hostname);
    // The converted host is only an intermediate; release it once it has been
    // spliced into the result (and on the error path).
    defer allocator.free(ascii);

    // hostname is a slice of url, so its start offset is just pointer arithmetic.
    const start = @intFromPtr(hostname.ptr) - @intFromPtr(url.ptr);
    const end = start + hostname.len;

    return try std.mem.concatWithSentinel(
        allocator,
        u8,
        &.{ url[0..start], ascii, url[end..] },
        0,
    );
}
|
||||
|
||||
pub fn ensureEncoded(allocator: Allocator, url_in: [:0]const u8, encoding: []const u8) ![:0]const u8 {
|
||||
// Resolve any IDN host first; everything below operates on the ASCII form.
|
||||
const url = try ensureHostAscii(allocator, url_in);
|
||||
|
||||
const scheme_end = std.mem.indexOf(u8, url, "://");
|
||||
const authority_start = if (scheme_end) |end| end + 3 else 0;
|
||||
const path_start = std.mem.indexOfScalarPos(u8, url, authority_start, '/') orelse return url;
|
||||
|
||||
@@ -871,3 +871,52 @@
|
||||
testing.expectEqual('', url.search);
|
||||
}
|
||||
</script>
|
||||
|
||||
<script id=idna>
|
||||
// WHATWG "domain to ASCII": non-ASCII hosts are converted to punycode at
|
||||
// parse time, so getters always return the ASCII form.
|
||||
{
|
||||
const url = new URL('https://räksmörgås.se/');
|
||||
testing.expectEqual('xn--rksmrgs-5wao1o.se', url.hostname);
|
||||
testing.expectEqual('xn--rksmrgs-5wao1o.se', url.host);
|
||||
testing.expectEqual('https://xn--rksmrgs-5wao1o.se/', url.href);
|
||||
}
|
||||
|
||||
// UTS#46 non-transitional processing preserves ß rather than mapping to ss.
|
||||
{
|
||||
const url = new URL('https://faß.de/');
|
||||
testing.expectEqual('xn--fa-hia.de', url.hostname);
|
||||
}
|
||||
|
||||
// Pure-ASCII hosts must not be touched.
|
||||
{
|
||||
const url = new URL('https://example.com/');
|
||||
testing.expectEqual('example.com', url.hostname);
|
||||
testing.expectEqual('https://example.com/', url.href);
|
||||
}
|
||||
|
||||
// IDN preserved alongside port, userinfo, path, query, and fragment.
|
||||
{
|
||||
const url = new URL('https://räksmörgås.se:8443/p?q=1#h');
|
||||
testing.expectEqual('xn--rksmrgs-5wao1o.se', url.hostname);
|
||||
testing.expectEqual('xn--rksmrgs-5wao1o.se:8443', url.host);
|
||||
testing.expectEqual('8443', url.port);
|
||||
testing.expectEqual('/p', url.pathname);
|
||||
testing.expectEqual('?q=1', url.search);
|
||||
testing.expectEqual('#h', url.hash);
|
||||
}
|
||||
|
||||
{
|
||||
const url = new URL('https://user:pass@räksmörgås.se/');
|
||||
testing.expectEqual('xn--rksmrgs-5wao1o.se', url.hostname);
|
||||
testing.expectEqual('user', url.username);
|
||||
testing.expectEqual('pass', url.password);
|
||||
}
|
||||
|
||||
// Resolving a relative path against an IDN base preserves the punycode host.
|
||||
{
|
||||
const url = new URL('/about', 'https://räksmörgås.se/');
|
||||
testing.expectEqual('xn--rksmrgs-5wao1o.se', url.hostname);
|
||||
testing.expectEqual('https://xn--rksmrgs-5wao1o.se/about', url.href);
|
||||
}
|
||||
</script>
|
||||
|
||||
@@ -763,6 +763,7 @@ const CloneError = error{
|
||||
NotImplemented,
|
||||
InvalidCharacterError,
|
||||
CloneError,
|
||||
Idna,
|
||||
IFrameLoadError,
|
||||
TooManyContexts,
|
||||
LinkLoadError,
|
||||
|
||||
76
src/sys/idna.zig
Normal file
76
src/sys/idna.zig
Normal file
@@ -0,0 +1,76 @@
|
||||
// Copyright (C) 2023-2026 Lightpanda (Selecy SAS)
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as
|
||||
// published by the Free Software Foundation, either version 3 of the
|
||||
// License, or (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
const std = @import("std");
|
||||
|
||||
const c = @cImport({
|
||||
@cInclude("idn2.h");
|
||||
});
|
||||
|
||||
const Allocator = std.mem.Allocator;
|
||||
pub const Error = error{Idna} || Allocator.Error;
|
||||
|
||||
/// Report whether `host` contains at least one non-ASCII byte (0x80 or above)
/// and therefore needs IDNA processing. Intended as a cheap pre-check so that
/// callers can skip the `toAscii` C call for pure-ASCII hostnames.
pub fn needsAscii(host: []const u8) bool {
    return for (host) |byte| {
        if (byte > 0x7F) break true;
    } else false;
}
|
||||
|
||||
/// Convert a UTF-8 hostname to its ASCII (Punycode) form per UTS#46
/// IDNA 2008 with non-transitional processing — the algorithm WHATWG URL
/// invokes as "domain to ASCII".
///
/// Returns a slice owned by `allocator`; the caller frees it.
///
/// Errors:
/// - `error.OutOfMemory` if `allocator` fails or libidn2 reports `IDN2_MALLOC`.
/// - `error.Idna` for any other libidn2 failure, or if `host` contains a NUL
///   byte (which cannot be passed through the NUL-terminated C API).
pub fn toAscii(allocator: Allocator, host: []const u8) Error![]u8 {
    // An embedded NUL would make the C side see a truncated name.
    if (std.mem.indexOfScalar(u8, host, 0) != null) {
        return error.Idna;
    }

    const host_z = try allocator.dupeZ(u8, host);
    defer allocator.free(host_z);

    // Start from null rather than undefined so the pointer is never read
    // uninitialised.
    var out_ptr: [*c]u8 = null;
    const flags: c_int = c.IDN2_NFC_INPUT | c.IDN2_NONTRANSITIONAL;
    const rc = c.idn2_to_ascii_8z(host_z.ptr, &out_ptr, flags);
    if (rc == c.IDN2_MALLOC) {
        return error.OutOfMemory;
    }
    if (rc != c.IDN2_OK or out_ptr == null) {
        return error.Idna;
    }
    // The result buffer belongs to libidn2; copy it out and hand it back.
    defer c.idn2_free(out_ptr);

    return try allocator.dupe(u8, std.mem.span(@as([*:0]const u8, @ptrCast(out_ptr))));
}
|
||||
|
||||
const testing = @import("../testing.zig");
|
||||
|
||||
test "idna: ASCII passthrough" {
    // A pure-ASCII host is not flagged and comes back unchanged.
    const input = "example.com";
    try testing.expectEqual(false, needsAscii(input));

    const ascii = try toAscii(testing.allocator, input);
    defer testing.allocator.free(ascii);
    try testing.expectString("example.com", ascii);
}
|
||||
|
||||
test "idna: non-ASCII to punycode" {
    // A host with non-ASCII letters is flagged and converted to punycode.
    const input = "räksmörgås.se";
    try testing.expectEqual(true, needsAscii(input));

    const ascii = try toAscii(testing.allocator, input);
    defer testing.allocator.free(ascii);
    try testing.expectString("xn--rksmrgs-5wao1o.se", ascii);
}
|
||||
|
||||
test "idna: German sharp s with non-transitional processing" {
    // UTS#46 non-transitional preserves ß rather than mapping to ss.
    const ascii = try toAscii(testing.allocator, "faß.de");
    defer testing.allocator.free(ascii);

    try testing.expectString("xn--fa-hia.de", ascii);
}
|
||||
1907
vendor/libidn2/config.h
vendored
Normal file
1907
vendor/libidn2/config.h
vendored
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user