url: add component encode set and use in tools

This commit is contained in:
Adrià Arrufat
2026-05-04 07:52:33 +02:00
parent 2dffd6099f
commit 5df44fd9f4
2 changed files with 58 additions and 51 deletions

View File

@@ -269,9 +269,21 @@ pub fn ensureEncoded(allocator: Allocator, url_in: [:0]const u8, encoding: []con
return buf.items[0 .. buf.items.len - 1 :0];
}
/// Percent-encode set selector. It is the comptime `encode_set` parameter of
/// `percentEncodeSegment` and `shouldPercentEncode`, which use it to decide
/// which bytes of a URL part must be escaped.
const EncodeSet = enum { path, query, query_legacy, userinfo, fragment };
/// Chooses the percent-encode set (RFC 3986 / WHATWG URL Standard) used when
/// escaping a piece of a URL.
///
/// `path`, `query`, `query_legacy`, `userinfo` and `fragment` mirror the
/// matching sets from the URL specs. They expect input that already has its
/// structure in place (for `query`, something like `key=val&key=val`) and
/// escape only the characters that are not permitted in that position.
///
/// `component` is the strict option. Anything that is not an RFC 3986
/// unreserved character gets escaped, and that includes the sub-delims
/// (`& = + ! * ' ( ) , $ ;`). Pick it when an arbitrary string is dropped in
/// as one URI component, for example the value of a query parameter, so that
/// reserved characters in the input cannot alter the shape of the URL.
pub const EncodeSet = enum {
    path,
    query,
    query_legacy,
    userinfo,
    fragment,
    component,
};
fn percentEncodeSegment(allocator: Allocator, segment: []const u8, comptime encode_set: EncodeSet) ![]const u8 {
pub fn percentEncodeSegment(allocator: Allocator, segment: []const u8, comptime encode_set: EncodeSet) ![]const u8 {
// Check if encoding is needed
var needs_encoding = false;
for (segment) |c| {
@@ -290,8 +302,11 @@ fn percentEncodeSegment(allocator: Allocator, segment: []const u8, comptime enco
while (i < segment.len) : (i += 1) {
const c = segment[i];
// Check if this is an already-encoded sequence (%XX)
if (c == '%' and i + 2 < segment.len) {
// For URL-canonicalization sets, preserve existing %XX sequences so
// already-encoded inputs round-trip cleanly. The `component` set treats
// input as opaque and re-encodes `%` itself, since the caller is
// embedding raw user data and a literal '%' must not be misread.
if (encode_set != .component and c == '%' and i + 2 < segment.len) {
const end = i + 2;
const h1 = segment[i + 1];
const h2 = segment[end];
@@ -362,13 +377,14 @@ fn shouldPercentEncode(c: u8, comptime encode_set: EncodeSet) bool {
return switch (c) {
// Unreserved characters (RFC 3986)
'A'...'Z', 'a'...'z', '0'...'9', '-', '.', '_', '~' => false,
// sub-delims allowed in path/query but some must be encoded in userinfo/query_legacy
'!', '$', '\'', '(', ')', '*', '+', ',' => false,
// sub-delims allowed in path/query but some must be encoded in userinfo/query_legacy/component
'!', '$', '\'', '(', ')', '*', '+', ',' => encode_set == .component,
// '&' and ';' must be encoded for legacy encoding (to preserve NCRs like &#nnnnn;)
'&', ';' => encode_set == .userinfo or encode_set == .query_legacy,
'=' => encode_set == .userinfo,
// Separators: userinfo must encode these
'/', ':', '@' => encode_set == .userinfo,
// and for component encoding (so a value can't break out into a new param)
'&', ';' => encode_set == .userinfo or encode_set == .query_legacy or encode_set == .component,
'=' => encode_set == .userinfo or encode_set == .component,
// Separators: userinfo and component must encode these
'/', ':', '@' => encode_set == .userinfo or encode_set == .component,
// '?' is allowed in queries only
'?' => encode_set != .query and encode_set != .query_legacy,
// '#' is allowed in fragments only
@@ -1210,6 +1226,37 @@ test "URL: ensureEncoded" {
}
}
test "URL: percentEncodeSegment component passes unreserved chars through" {
    defer testing.reset();

    // Letters, digits and `-._~` are unreserved, so the output must equal the input.
    const unreserved = "abcXYZ012-._~";
    const encoded = try percentEncodeSegment(testing.arena_allocator, unreserved, .component);
    try testing.expectString(unreserved, encoded);
}
test "URL: percentEncodeSegment component encodes spaces, sub-delims and reserved chars" {
    defer testing.reset();

    // Each case pairs a raw value with the escaped form the `component` set must produce.
    const Case = struct { raw: []const u8, want: []const u8 };
    const cases = [_]Case{
        // Space, '&' and '=' must all be escaped so the value stays one parameter.
        .{ .raw = "hello world&q=1", .want = "hello%20world%26q%3D1" },
        // Sub-delims '+', '!' and '*' are escaped as well.
        .{ .raw = "a+b!c*d", .want = "a%2Bb%21c%2Ad" },
    };

    for (cases) |case| {
        const encoded = try percentEncodeSegment(testing.arena_allocator, case.raw, .component);
        try testing.expectString(case.want, encoded);
    }
}
test "URL: percentEncodeSegment component encodes UTF-8 bytes" {
    defer testing.reset();

    // 'é' is the two-byte UTF-8 sequence 0xC3 0xA9; each byte is escaped on its own.
    const escaped = try percentEncodeSegment(testing.arena_allocator, "café", .component);
    try testing.expectString("caf%C3%A9", escaped);
}
test "URL: percentEncodeSegment component re-encodes literal '%' in raw input" {
    defer testing.reset();

    // A trailing '%' is plain data and becomes "%25".
    const bare_percent = try percentEncodeSegment(testing.arena_allocator, "100%", .component);
    try testing.expectString("100%25", bare_percent);

    // "%2A" looks like an existing escape, but the component set treats the
    // input as opaque user data, so the '%' is still escaped.
    const percent_then_hex = try percentEncodeSegment(testing.arena_allocator, "100%2A", .component);
    try testing.expectString("100%252A", percent_then_hex);
}
test "URL: resolve with encoding" {
defer testing.reset();

View File

@@ -417,7 +417,7 @@ fn execSearch(session: *lp.Session, arena: std.mem.Allocator, registry: *CDPNode
const args = try parseArgsOrErr(SearchParams, arena, arguments) orelse return ToolError.InvalidParams;
if (args.query.len == 0) return ToolError.InvalidParams;
const encoded = percentEncodeQuery(arena, args.query) catch return ToolError.OutOfMemory;
const encoded = lp.URL.percentEncodeSegment(arena, args.query, .component) catch return ToolError.OutOfMemory;
const google_url = std.fmt.allocPrintSentinel(
arena,
"https://www.google.com/search?q={s}&hl=en&gl=us",
@@ -459,21 +459,6 @@ fn renderFrameMarkdown(arena: std.mem.Allocator, frame: *lp.Frame) ToolError![]c
return aw.written();
}
/// Percent-encodes `input` for use as a single query value.
///
/// ASCII letters, digits and `-`, `.`, `_`, `~` are copied unchanged. Every
/// other byte, including each byte of a multi-byte UTF-8 sequence and a
/// literal `%`, is written as `%XX` with two uppercase hex digits.
///
/// The result is allocated from `arena`. Returns `error.OutOfMemory` if an
/// allocation fails.
fn percentEncodeQuery(arena: std.mem.Allocator, input: []const u8) error{OutOfMemory}![]const u8 {
    const upper_hex = "0123456789ABCDEF";

    var encoded: std.ArrayList(u8) = .empty;
    for (input) |byte| {
        const is_unreserved = switch (byte) {
            'A'...'Z', 'a'...'z', '0'...'9', '-', '.', '_', '~' => true,
            else => false,
        };
        if (is_unreserved) {
            try encoded.append(arena, byte);
            continue;
        }

        // High nibble first, then low nibble, giving the two-digit "%XX" form.
        const escape = [3]u8{ '%', upper_hex[byte >> 4], upper_hex[byte & 0x0F] };
        try encoded.appendSlice(arena, &escape);
    }
    return encoded.toOwnedSlice(arena);
}
fn execMarkdown(session: *lp.Session, arena: std.mem.Allocator, registry: *CDPNode.Registry, arguments: ?std.json.Value) ToolError![]const u8 {
const args = try parseArgsOrDefault(UrlParams, arena, arguments);
const page = try ensurePage(session, registry, args.url, args.timeout, args.waitUntil);
@@ -1014,31 +999,6 @@ test "substituteEnvVars missing var kept literal" {
try std.testing.expectEqualStrings("$UNLIKELY_VAR_12345", r);
}
test "percentEncodeQuery passes unreserved chars through" {
    var arena_state = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena_state.deinit();

    // Letters, digits and `-._~` are never escaped, so the output equals the input.
    const unreserved = "abcXYZ012-._~";
    const encoded = try percentEncodeQuery(arena_state.allocator(), unreserved);
    try std.testing.expectEqualStrings(unreserved, encoded);
}
test "percentEncodeQuery encodes spaces and reserved chars" {
    var arena_state = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena_state.deinit();
    const alloc = arena_state.allocator();

    // Space -> %20, '&' -> %26, '=' -> %3D; everything else passes through.
    const encoded = try percentEncodeQuery(alloc, "hello world&q=1");
    try std.testing.expectEqualStrings("hello%20world%26q%3D1", encoded);
}
test "percentEncodeQuery encodes UTF-8 bytes" {
    var arena_state = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena_state.deinit();
    const alloc = arena_state.allocator();

    // In "café" the bytes c, a, f are kept; 'é' is 0xC3 0xA9 and each byte is escaped.
    const encoded = try percentEncodeQuery(alloc, "café");
    try std.testing.expectEqualStrings("caf%C3%A9", encoded);
}
test "substituteEnvVars bare dollar" {
var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
defer arena.deinit();