From 7c6f2a2a8c80fe8b9c649f24365ad98e000e9817 Mon Sep 17 00:00:00 2001 From: Navid EMAD Date: Wed, 29 Apr 2026 01:53:17 +0200 Subject: [PATCH] forms: normalize CR/LF to CRLF in form-data set encoding Per the HTML form-data set encoding algorithm, every U+000A (LF) not preceded by U+000D (CR) and every U+000D (CR) not followed by U+000A (LF) in an entry's name or value is replaced with the two-byte sequence CR+LF before percent-encoding. Without this pass, a textarea API value containing LF (per the textarea wrapping transformation) submits as raw %0A on the wire instead of the spec-required %0D%0A. The normalization is gated on URLEncodeMode == .form so URLSearchParams (.query) still follows the URL standard's serializer, which doesn't normalize. Same pass covers names + values for both the POST application/x-www-form-urlencoded body and the GET query-string path (both go through the same encoder before URL.concatQueryString). Closes #2307 --- src/browser/webapi/KeyValueList.zig | 120 +++++++++++++++++++++++++++- 1 file changed, 118 insertions(+), 2 deletions(-) diff --git a/src/browser/webapi/KeyValueList.zig b/src/browser/webapi/KeyValueList.zig index cb4e7c50..ca1e5016 100644 --- a/src/browser/webapi/KeyValueList.zig +++ b/src/browser/webapi/KeyValueList.zig @@ -259,7 +259,12 @@ fn urlEncodeValueUtf8(value: []const u8, comptime mode: URLEncodeMode, writer: * return writer.writeAll(value); } - for (value) |b| { + var i: usize = 0; + while (i < value.len) : (i += 1) { + const b = value[i]; + if (comptime mode == .form) { + if (try writeFormLineEnd(value, &i, b, writer)) continue; + } if (urlEncodeUnreserved(b, mode)) { try writer.writeByte(b); } else if (b == ' ') { @@ -272,7 +277,12 @@ fn urlEncodeValueUtf8(value: []const u8, comptime mode: URLEncodeMode, writer: * /// Percent-encode a legacy-encoded value - must also encode & and ; to preserve NCRs. fn urlEncodeValueLegacy(value: []const u8, comptime mode: URLEncodeMode, writer: *std.Io.Writer) !void { - for (value) |b| { + var i: usize = 0; + while (i < value.len) : (i += 1) { + const b = value[i]; + if (comptime mode == .form) { + if (try writeFormLineEnd(value, &i, b, writer)) continue; + } if (urlEncodeUnreserved(b, mode)) { try writer.writeByte(b); } else if (b == ' ') { @@ -286,6 +296,25 @@ fn urlEncodeValueLegacy(value: []const u8, comptime mode: URLEncodeMode, writer: } } +// HTML form-data set encoding algorithm: every U+000D (CR) not followed by +// U+000A (LF), and every U+000A (LF) not preceded by U+000D (CR), is replaced +// with the two-byte sequence CR+LF before percent-encoding. Returns true (and +// emits "%0D%0A") when `b` is CR or LF; on CR, advances the caller's index +// past a following LF so existing CRLF pairs aren't doubled. +// https://html.spec.whatwg.org/multipage/form-control-infrastructure.html#url-encoded-form-data +fn writeFormLineEnd(value: []const u8, i: *usize, b: u8, writer: *std.Io.Writer) !bool { + if (b == '\r') { + try writer.writeAll("%0D%0A"); + if (i.* + 1 < value.len and value[i.* + 1] == '\n') i.* += 1; + return true; + } + if (b == '\n') { + try writer.writeAll("%0D%0A"); + return true; + } + return false; +} + fn urlEncodeShouldEscape(value: []const u8, comptime mode: URLEncodeMode) bool { for (value) |b| { if (!urlEncodeUnreserved(b, mode)) { @@ -409,3 +438,90 @@ test "KeyValueList: urlEncode Big5 unmappable character" { // 炣 percent-encoded is %26%2328835%3B try testing.expectString("q=%26%2328835%3B", buf.written()); } + +// HTML form-data set encoding algorithm: line endings in entry names and values +// are normalized to CRLF — every stray LF (not preceded by CR) and every stray +// CR (not followed by LF) is replaced with CR+LF before percent-encoding. The +// normalization applies only to .form mode; URLSearchParams (.query) follows +// the URL standard's serializer, which doesn't normalize. +// https://html.spec.whatwg.org/multipage/form-control-infrastructure.html#url-encoded-form-data +test "KeyValueList: urlEncode .form normalizes stray LF to CRLF" { + const allocator = testing.arena_allocator; + var list = KeyValueList.init(); + try list.append(allocator, "msg", "line1\nline2\nline3"); + + var buf = std.Io.Writer.Allocating.init(allocator); + try list.urlEncode(.form, null, "UTF-8", &buf.writer); + + try testing.expectString("msg=line1%0D%0Aline2%0D%0Aline3", buf.written()); +} + +test "KeyValueList: urlEncode .form normalizes stray CR to CRLF" { + const allocator = testing.arena_allocator; + var list = KeyValueList.init(); + try list.append(allocator, "msg", "line1\rline2"); + + var buf = std.Io.Writer.Allocating.init(allocator); + try list.urlEncode(.form, null, "UTF-8", &buf.writer); + + try testing.expectString("msg=line1%0D%0Aline2", buf.written()); +} + +test "KeyValueList: urlEncode .form preserves existing CRLF" { + const allocator = testing.arena_allocator; + var list = KeyValueList.init(); + try list.append(allocator, "msg", "line1\r\nline2"); + + var buf = std.Io.Writer.Allocating.init(allocator); + try list.urlEncode(.form, null, "UTF-8", &buf.writer); + + try testing.expectString("msg=line1%0D%0Aline2", buf.written()); +} + +test "KeyValueList: urlEncode .form handles mixed line endings" { + const allocator = testing.arena_allocator; + var list = KeyValueList.init(); + // CR LF, then bare LF, then bare CR -> three CRLF sequences. + try list.append(allocator, "msg", "a\r\nb\nc\rd"); + + var buf = std.Io.Writer.Allocating.init(allocator); + try list.urlEncode(.form, null, "UTF-8", &buf.writer); + + try testing.expectString("msg=a%0D%0Ab%0D%0Ac%0D%0Ad", buf.written()); +} + +test "KeyValueList: urlEncode .form normalizes line endings in entry names" { + const allocator = testing.arena_allocator; + var list = KeyValueList.init(); + try list.append(allocator, "n\nm", "v"); + + var buf = std.Io.Writer.Allocating.init(allocator); + try list.urlEncode(.form, null, "UTF-8", &buf.writer); + + try testing.expectString("n%0D%0Am=v", buf.written()); +} + +test "KeyValueList: urlEncode .form normalizes legacy charsets too" { + const allocator = testing.arena_allocator; + var list = KeyValueList.init(); + try list.append(allocator, "msg", "a\nb"); + + var buf = std.Io.Writer.Allocating.init(allocator); + try list.urlEncode(.form, allocator, "GBK", &buf.writer); + + try testing.expectString("msg=a%0D%0Ab", buf.written()); +} + +test "KeyValueList: urlEncode .query does NOT normalize line endings" { + // URL standard's application/x-www-form-urlencoded serializer (used by + // URLSearchParams) does not perform CRLF normalization — only the HTML + // form-data set encoding wrapper does. https://url.spec.whatwg.org/#concept-urlencoded-serializer + const allocator = testing.arena_allocator; + var list = KeyValueList.init(); + try list.append(allocator, "msg", "a\nb\rc"); + + var buf = std.Io.Writer.Allocating.init(allocator); + try list.urlEncode(.query, null, "UTF-8", &buf.writer); + + try testing.expectString("msg=a%0Ab%0Dc", buf.written()); +}