From b4d55eb863508df122f7e34adf6b15c079f3525e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A0=20Arrufat?= Date: Tue, 26 May 2026 16:52:09 +0200 Subject: [PATCH] string: extract truncateUtf8 helper Moves the UTF-8 truncation logic from Spinner's `utf8FloorTo` to a shared `truncateUtf8` function in `string.zig`. Reuses it in both Spinner and markdown rendering, and adds comprehensive tests. --- src/agent/Spinner.zig | 25 +++++++------------------ src/browser/markdown.zig | 3 ++- src/string.zig | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 19 deletions(-) diff --git a/src/agent/Spinner.zig b/src/agent/Spinner.zig index 36612514..a0993f88 100644 --- a/src/agent/Spinner.zig +++ b/src/agent/Spinner.zig @@ -21,6 +21,7 @@ const lp = @import("lightpanda"); const log = lp.log; const Terminal = @import("Terminal.zig"); const ansi = Terminal.ansi; +const truncateUtf8 = @import("../string.zig").truncateUtf8; const Spinner = @This(); @@ -168,13 +169,15 @@ pub fn setTool(self: *Spinner, name: []const u8, args: []const u8) void { const manual = self.state == .idle; self.tool_calls += 1; var tool: ToolState = .{ .set_ns = std.time.nanoTimestamp(), .manual = manual }; - tool.name_len = utf8FloorTo(name, tool.name_buf.len); - @memcpy(tool.name_buf[0..tool.name_len], name[0..tool.name_len]); + const name_prefix = truncateUtf8(name, tool.name_buf.len); + tool.name_len = name_prefix.len; + @memcpy(tool.name_buf[0..name_prefix.len], name_prefix); // Strip control chars: a literal `\n` in args (e.g. /eval """…""" bodies) // breaks the spinner's `\r`-based redraw — the cursor only rewinds to the // start of the last line, leaving prior frames stuck on screen. - tool.args_len = utf8FloorTo(args, tool.args_buf.len); - for (args[0..tool.args_len], 0..) |ch, i| { + const args_prefix = truncateUtf8(args, tool.args_buf.len); + tool.args_len = args_prefix.len; + for (args_prefix, 0..) 
|ch, i| { tool.args_buf[i] = if (ch < 0x20 or ch == 0x7f) ' ' else ch; } self.state = .{ .tool = tool }; @@ -287,20 +290,6 @@ fn renderLocked(self: *Spinner) void { _ = std.posix.write(std.posix.STDERR_FILENO, written) catch {}; } -/// Largest prefix length of `bytes` that fits in `max_bytes` and ends on -/// a UTF-8 codepoint boundary. Invalid sequences are treated as one byte -/// each so the function never loops. -fn utf8FloorTo(bytes: []const u8, max_bytes: usize) usize { - if (bytes.len <= max_bytes) return bytes.len; - var i: usize = 0; - while (i < max_bytes) { - const seq_len = std.unicode.utf8ByteSequenceLength(bytes[i]) catch 1; - if (i + seq_len > max_bytes) break; - i += seq_len; - } - return i; -} - /// Returns the byte length of `bytes` that fits in `max_cells` cells, /// rounded down to a whole UTF-8 codepoint. Multi-cell glyphs (CJK, /// wide emoji) are counted as 1 — args are typically ASCII so the diff --git a/src/browser/markdown.zig b/src/browser/markdown.zig index d547e131..c542c7e4 100644 --- a/src/browser/markdown.zig +++ b/src/browser/markdown.zig @@ -24,6 +24,7 @@ const TreeWalker = @import("webapi/TreeWalker.zig"); const Element = @import("webapi/Element.zig"); const Node = @import("webapi/Node.zig"); const isAllWhitespace = @import("../string.zig").isAllWhitespace; +const truncateUtf8 = @import("../string.zig").truncateUtf8; pub const Opts = struct { max_bytes: ?u32 = null, @@ -72,7 +73,7 @@ const LimitedWriter = struct { return; } if (self.remaining > 0) { - try self.inner.writeAll(bytes[0..self.remaining]); + try self.inner.writeAll(truncateUtf8(bytes, self.remaining)); self.remaining = 0; } self.truncated = true; diff --git a/src/string.zig b/src/string.zig index c91b75d9..24bfe482 100644 --- a/src/string.zig +++ b/src/string.zig @@ -311,6 +311,20 @@ pub fn isAllWhitespace(text: []const u8) bool { } else true; } +/// Largest prefix of `bytes` whose length is at most `max_bytes` and +/// ends on a UTF-8 codepoint boundary. 
Only the leading byte of each sequence is inspected; an +/// invalid leader counts as one byte, so the scan always advances. +pub fn truncateUtf8(bytes: []const u8, max_bytes: usize) []const u8 { + if (bytes.len <= max_bytes) return bytes; + var i: usize = 0; + while (i < max_bytes) { + const seq_len = std.unicode.utf8ByteSequenceLength(bytes[i]) catch 1; + if (i + seq_len > max_bytes) break; + i += seq_len; + } + return bytes[0..i]; +} + // Discriminatory type that signals the bridge to use arena instead of call_arena // Use this for strings that need to persist beyond the current call // The caller can unwrap and store just the underlying .str field @@ -333,6 +347,30 @@ fn asUint(comptime string: anytype) std.meta.Int( const testing = @import("testing.zig"); +test "truncateUtf8" { + try testing.expectEqual("", truncateUtf8("", 10)); + try testing.expectEqual("abc", truncateUtf8("abc", 10)); + try testing.expectEqual("abc", truncateUtf8("abcdef", 3)); + + // 'é' = 0xC3 0xA9 — a cap inside the codepoint drops the whole codepoint. + try testing.expectEqual("", truncateUtf8("é", 1)); + try testing.expectEqual("é", truncateUtf8("é", 2)); + try testing.expectEqual("é", truncateUtf8("éé", 3)); + + // 3-byte codepoint '世' = 0xE4 0xB8 0x96. + try testing.expectEqual("", truncateUtf8("世", 2)); + try testing.expectEqual("世", truncateUtf8("世界", 3)); + try testing.expectEqual("世", truncateUtf8("世界", 5)); + + // 4-byte codepoint '𝄞' (musical G clef) = 0xF0 0x9D 0x84 0x9E. + try testing.expectEqual("", truncateUtf8("𝄞", 3)); + try testing.expectEqual("𝄞", truncateUtf8("𝄞x", 4)); + + // Invalid leader byte counts as one byte so the loop terminates. + try testing.expectEqual("\xFF", truncateUtf8("\xFFx", 1)); + try testing.expectEqual("\xFFx", truncateUtf8("\xFFx", 2)); +} + test "String" { const other_short = try String.init(undefined, "other_short", .{}); const other_long = try String.init(testing.allocator, "other_long" ** 100, .{});