From e1e49c8a2e1ab863fb529ff94531e308183a3fb5 Mon Sep 17 00:00:00 2001 From: Navid EMAD Date: Wed, 20 May 2026 19:13:09 +0200 Subject: [PATCH] Fix Network dropping CDP sockets from its poll set once a multi exists MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit preparePollFds cleared and rebuilt the curl portion of `pollfds` every loop iteration, but sliced `pollfds[PSEUDO_POLLFDS..]` — all the way to the end of the array. That range also covers the CDP socket region `[cdp_start..]`, which prepareCdpPollFds owns and only rebuilds when `cdp_dirty` is set (a steady-state optimization). So the @memset wiped every live CDP socket fd to -1 on each iteration. This only bites once Network owns a curl multi handle, which is created solely by telemetry — and telemetry is disabled in Debug builds, which is why it reproduced only in ReleaseFast/ReleaseSafe (and the nightly). Regular HTTP/navigation runs on the worker's own handles, not Network's multi, so it never triggered the path in Debug. Once the CDP sockets are dropped from the poll set, the Network thread stops reading client messages (#2508, hard stall after the first command) and never observes peer EOF or `conn.shutdown`, so the worker is never told to exit and SIGTERM is ignored after a connection (#2507). Fix: slice only the curl region `[PSEUDO_POLLFDS..cdp_start]`. Also harden the poll timeout: `curl_multi_timeout` returns -1 when curl is idle, and `@min(250, -1)` is -1 (block forever), which starved onTick (telemetry's periodic flush) and turned any missed wakeup into a permanent hang rather than a <=250ms blip. Treat curl_timeout <= 0 as "no deadline" and fall back to the 250ms cap. Fixes #2507 Fixes #2508 --- src/network/Network.zig | 57 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/src/network/Network.zig b/src/network/Network.zig index 13243d08..1e51f042 100644 --- a/src/network/Network.zig +++ b/src/network/Network.zig @@ -722,8 +722,15 @@ pub fn run(self: *Network) void { break :blk min_timeout; } + // curl_multi_timeout reports -1 when curl has no timeout + // preference (idle) and 0 when it wants to be serviced + // immediately. Treat both as "no curl-imposed deadline" and + // fall back to min_timeout — otherwise @min(min_timeout, -1) + // would be -1, i.e. poll() blocks forever, starving onTick + // (telemetry's periodic flush) and removing the safety net + // that bounds any missed wakeup to min_timeout. const curl_timeout = self.getCurlTimeout(); - if (curl_timeout == 0) { + if (curl_timeout <= 0) { break :blk min_timeout; } @@ -860,7 +867,16 @@ fn acceptConnections(self: *Network) void { } fn preparePollFds(self: *Network, multi: *libcurl.CurlM) void { - const curl_fds = self.pollfds[PSEUDO_POLLFDS..]; + // Only the curl slice — NOT through to the end of pollfds. The CDP + // socket fds live in [cdp_start..] and are owned by + // prepareCdpPollFds, which only rebuilds them when cdp_dirty is set + // (a steady-state optimization). Slicing to the end here would + // @memset those fds to -1 every iteration once a multi exists (which + // happens as soon as telemetry sends its first request), silently + // dropping every live CDP socket from the poll set — Network then + // never reads another CDP message (#2508) nor observes peer + // EOF/shutdown (#2507). + const curl_fds = self.pollfds[PSEUDO_POLLFDS..self.cdp_start]; @memset(curl_fds, .{ .fd = -1, .events = 0, .revents = 0 }); var fd_count: c_uint = 0; @@ -1053,3 +1069,40 @@ fn loadCerts(allocator: Allocator) !libcurl.CurlBlob { .flags = 0, }; } + +const testing = @import("../testing.zig"); + +test "Network: preparePollFds leaves the CDP fd region untouched" { + // Regression for #2507 / #2508. Once a multi exists (telemetry creates + // one in optimized builds), preparePollFds runs every loop iteration. + // It rebuilds only the curl slice [PSEUDO_POLLFDS..cdp_start]; the CDP + // region [cdp_start..] is owned by prepareCdpPollFds, which keeps its + // entries across iterations and only rebuilds when cdp_dirty is set. + // A slice that ran to the end of pollfds @memset those CDP sockets to + // -1, silently dropping every live CDP connection from the poll set — + // so Network stopped reading CDP messages (#2508) and never observed + // peer EOF/shutdown (#2507). curl global is initialized by the test + // harness (App.init -> Network.init). + const multi = libcurl.curl_multi_init() orelse return error.FailedToInitMulti; + defer libcurl.curl_multi_cleanup(multi) catch {}; + + const curl_slots = 4; + const cdp_slots = 3; + var pollfds: [PSEUDO_POLLFDS + curl_slots + cdp_slots]posix.pollfd = undefined; + @memset(&pollfds, .{ .fd = -1, .events = 0, .revents = 0 }); + + // preparePollFds only reads self.pollfds and self.cdp_start. + var nw: Network = undefined; + nw.pollfds = &pollfds; + nw.cdp_start = PSEUDO_POLLFDS + curl_slots; + + // Two live CDP sockets parked in the CDP region, mimicking the steady + // state between cdp_dirty rebuilds. + pollfds[nw.cdp_start] = .{ .fd = 4242, .events = posix.POLL.IN, .revents = 0 }; + pollfds[nw.cdp_start + 1] = .{ .fd = 4243, .events = posix.POLL.IN, .revents = 0 }; + + nw.preparePollFds(multi); + + try testing.expectEqual(@as(posix.fd_t, 4242), pollfds[nw.cdp_start].fd); + try testing.expectEqual(@as(posix.fd_t, 4243), pollfds[nw.cdp_start + 1].fd); +}