Initial idn support

Links to libidn2 and builds libcurl with it. This makes libcurl, and by
extension the browser, work with internationalized domain names, e.g.

zig build run -- fetch "https://räksmörgås.se/"

With it available, we can use it in our WebAPIs which should also support these
domains, e.g:
  testing.expectEqual('xn--rksmrgs-5wao1o.se', new URL('https://räksmörgås.se').hostname);

There is more integration to be done here, but this is a first step.

claude wrote all of the build.zig code.

I don't have a strong opinion about this feature; I just dislike that our WPT
/url/* tests are at 1704 / 9095, and this is the biggest chunk (although this
specific commit just does the basic integration and probably won't fix too many
WPT cases directly).
This commit is contained in:
Karl Seguin
2026-04-28 19:42:26 +08:00
parent 827626db67
commit 9fe628dd0f
7 changed files with 2225 additions and 2 deletions

View File

@@ -17,6 +17,8 @@
// along with this program. If not, see <https://www.gnu.org/licenses/>.
const std = @import("std");
const idna = @import("../sys/idna.zig");
const Allocator = std.mem.Allocator;
pub const ResolveOpts = struct {
@@ -190,11 +192,35 @@ pub fn resolve(allocator: Allocator, base: [:0]const u8, source_path: anytype, o
}
/// Final step applied to an already-resolved URL.
/// When `opts.encoding` is null the URL only goes through `ensureHostAscii`;
/// otherwise it is handed to `ensureEncoded` together with that encoding.
/// NOTE(review): this view shows two `const encoding` lines back to back —
/// presumably the first is the pre-change line from the diff; confirm.
fn processResolved(allocator: Allocator, url: [:0]const u8, opts: ResolveOpts) ![:0]const u8 {
const encoding = opts.encoding orelse return url;
const encoding = opts.encoding orelse return ensureHostAscii(allocator, url);
return ensureEncoded(allocator, url, encoding);
}
pub fn ensureEncoded(allocator: Allocator, url: [:0]const u8, encoding: []const u8) ![:0]const u8 {
/// IDNA-only pass: converts a non-ASCII host (`räksmörgås.se`) to its
/// punycode form (`xn--rksmrgs-5wao1o.se`) and leaves everything else alone.
///
/// Returns `url` itself, without allocating, when the host is empty or has no
/// non-ASCII bytes. Otherwise returns a new null-terminated string allocated
/// from `allocator` in which only the host bytes have been replaced.
/// Propagates any error from `idna.toAscii` or from the allocator.
fn ensureHostAscii(allocator: Allocator, url: [:0]const u8) ![:0]const u8 {
    const hostname = getHostname(url);
    if (hostname.len == 0 or !idna.needsAscii(hostname)) {
        return url;
    }

    const ascii = try idna.toAscii(allocator, hostname);
    // The converted host is only needed until it has been copied into the
    // rebuilt URL; release it on both the success and the error path.
    defer allocator.free(ascii);

    // hostname is a slice of url, so its start offset is just pointer arithmetic.
    const start = @intFromPtr(hostname.ptr) - @intFromPtr(url.ptr);
    const end = start + hostname.len;

    // prefix + ASCII host + suffix, with the trailing 0 sentinel added for us.
    return std.mem.concatWithSentinel(
        allocator,
        u8,
        &[_][]const u8{ url[0..start], ascii, url[end..] },
        0,
    );
}
pub fn ensureEncoded(allocator: Allocator, url_in: [:0]const u8, encoding: []const u8) ![:0]const u8 {
// Resolve any IDN host first; everything below operates on the ASCII form.
const url = try ensureHostAscii(allocator, url_in);
const scheme_end = std.mem.indexOf(u8, url, "://");
const authority_start = if (scheme_end) |end| end + 3 else 0;
const path_start = std.mem.indexOfScalarPos(u8, url, authority_start, '/') orelse return url;

View File

@@ -871,3 +871,52 @@
testing.expectEqual('', url.search);
}
</script>
<script id=idna>
  // Per WHATWG "domain to ASCII", a non-ASCII host is rewritten to punycode
  // while the URL is parsed, so every getter reports the ASCII form.
  {
    const parsed = new URL('https://räksmörgås.se/');
    testing.expectEqual('xn--rksmrgs-5wao1o.se', parsed.hostname);
    testing.expectEqual('xn--rksmrgs-5wao1o.se', parsed.host);
    testing.expectEqual('https://xn--rksmrgs-5wao1o.se/', parsed.href);
  }

  // Under UTS#46 non-transitional processing ß is kept, not folded to "ss".
  {
    const parsed = new URL('https://faß.de/');
    testing.expectEqual('xn--fa-hia.de', parsed.hostname);
  }

  // A host that is already pure ASCII comes back unchanged.
  {
    const parsed = new URL('https://example.com/');
    testing.expectEqual('example.com', parsed.hostname);
    testing.expectEqual('https://example.com/', parsed.href);
  }

  // The punycode host coexists with a port, path, query, and fragment.
  {
    const parsed = new URL('https://räksmörgås.se:8443/p?q=1#h');
    testing.expectEqual('xn--rksmrgs-5wao1o.se', parsed.hostname);
    testing.expectEqual('xn--rksmrgs-5wao1o.se:8443', parsed.host);
    testing.expectEqual('8443', parsed.port);
    testing.expectEqual('/p', parsed.pathname);
    testing.expectEqual('?q=1', parsed.search);
    testing.expectEqual('#h', parsed.hash);
  }

  // Userinfo in front of an IDN host is still split into username/password.
  {
    const parsed = new URL('https://user:pass@räksmörgås.se/');
    testing.expectEqual('xn--rksmrgs-5wao1o.se', parsed.hostname);
    testing.expectEqual('user', parsed.username);
    testing.expectEqual('pass', parsed.password);
  }

  // A relative path resolved against an IDN base ends up on the punycode host.
  {
    const parsed = new URL('/about', 'https://räksmörgås.se/');
    testing.expectEqual('xn--rksmrgs-5wao1o.se', parsed.hostname);
    testing.expectEqual('https://xn--rksmrgs-5wao1o.se/about', parsed.href);
  }
</script>

View File

@@ -763,6 +763,7 @@ const CloneError = error{
NotImplemented,
InvalidCharacterError,
CloneError,
Idna,
IFrameLoadError,
TooManyContexts,
LinkLoadError,

76
src/sys/idna.zig Normal file
View File

@@ -0,0 +1,76 @@
// Copyright (C) 2023-2026 Lightpanda (Selecy SAS)
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
const std = @import("std");
const c = @cImport({
@cInclude("idn2.h");
});
const Allocator = std.mem.Allocator;
pub const Error = error{Idna} || Allocator.Error;
/// Reports whether `host` contains at least one byte with the high bit set,
/// i.e. a byte outside the 7-bit ASCII range. A `false` result means the
/// host is pure ASCII, which callers can use as a cheap check to skip IDNA
/// conversion (and the call into the C library) altogether.
pub fn needsAscii(host: []const u8) bool {
    var i: usize = 0;
    while (i < host.len) : (i += 1) {
        // 0x80 and above all have the top bit set.
        if ((host[i] & 0x80) != 0) return true;
    }
    return false;
}
/// Convert a UTF-8 hostname to its ASCII (Punycode) form per UTS#46
/// IDNA 2008 with non-transitional processing — the algorithm WHATWG URL
/// invokes as "domain to ASCII". Returns an allocator-owned slice that the
/// caller must free with `allocator`.
///
/// Errors:
/// - `error.OutOfMemory` when `allocator` or libidn2 itself cannot allocate.
/// - `error.Idna` for every other libidn2 failure, and when `host` contains
///   an embedded NUL byte.
pub fn toAscii(allocator: Allocator, host: []const u8) Error![]u8 {
    // libidn2 takes a NUL-terminated string, so an embedded NUL would make it
    // silently convert only the prefix of the host. Reject it instead.
    if (std.mem.indexOfScalar(u8, host, 0) != null) {
        return error.Idna;
    }

    const host_z = try allocator.dupeZ(u8, host);
    defer allocator.free(host_z);

    var out_ptr: [*c]u8 = null;
    const flags: c_int = c.IDN2_NFC_INPUT | c.IDN2_NONTRANSITIONAL;
    const rc = c.idn2_to_ascii_8z(host_z.ptr, &out_ptr, flags);
    if (rc != c.IDN2_OK) {
        // Keep allocation failure distinguishable from a genuinely invalid name.
        return if (rc == c.IDN2_MALLOC) error.OutOfMemory else error.Idna;
    }
    // The result buffer is owned by libidn2; copy it out, then hand it back.
    defer c.idn2_free(out_ptr);

    return try allocator.dupe(u8, std.mem.span(@as([*:0]const u8, @ptrCast(out_ptr))));
}
const testing = @import("../testing.zig");
test "idna: ASCII passthrough" {
    // A host with no high-bit bytes is reported as not needing conversion,
    // and converting it anyway yields the same text.
    const host = "example.com";
    try testing.expectEqual(false, needsAscii(host));

    const converted = try toAscii(testing.allocator, host);
    defer testing.allocator.free(converted);
    try testing.expectString(host, converted);
}
test "idna: non-ASCII to punycode" {
    // A host with non-ASCII letters is flagged and comes back as punycode.
    const host = "räksmörgås.se";
    try testing.expectEqual(true, needsAscii(host));

    const converted = try toAscii(testing.allocator, host);
    defer testing.allocator.free(converted);
    try testing.expectString("xn--rksmrgs-5wao1o.se", converted);
}
test "idna: German sharp s with non-transitional processing" {
    // Under UTS#46 non-transitional processing ß is kept, not folded to "ss".
    const host = "faß.de";
    const converted = try toAscii(testing.allocator, host);
    defer testing.allocator.free(converted);
    try testing.expectString("xn--fa-hia.de", converted);
}