From 8b58bd6c645ce1b07dfe09e76e01a0ec15cb51c2 Mon Sep 17 00:00:00 2001 From: Jordan Whited Date: Thu, 28 May 2026 12:29:24 -0700 Subject: [PATCH] net/batching: implement NodeAttrNeverGSOEqualTail This NodeCapability works around the UDP GSO bugs introduced by torvalds/linux@b10b446 (v7.0-rc1). These bugs were later fixed by torvalds/linux@78effd8 and torvalds/linux@5f17ae0 (v7.1-rc5). These Linux kernel bugs cause mangled UDP headers and UDP checksums, resulting in high levels of packet loss. The aforementioned bugs have already made their way downstream into various distros, e.g. Ubuntu 26.04 LTS. Impacted users are now dealing with poor UDP performance in tailscaled, and in any other software that makes use of UDP GSO. Not all users of the affected kernels are impacted as the relevant kernel code path sits between the kernel and the netdev driver, and behaviors vary by driver/device capability. We cannot detect impact at runtime, as this would require gathering all netdevs and performing loopback tests. This is invasive and in many cases impossible. So, we are left to choose between disabling UDP GSO for all users on affected kernels, whether they experience real impact or not, or trying to work around the bugs. Disabling UDP GSO for a user that is not impacted can cut max throughput in half, and consume more CPU cycles. This commit attempts to work around the bugs by avoiding UDP GSO when batches are small, and injecting a 1-byte sentinel tail payload when they are large. This tail payload is smaller than "GSO size", which sidesteps the primary trigger of all fragments in a batch being equal in length. The end result is slightly increased payload and packet overhead, but functional UDP GSO for all Linux 7.0-7.1.4 users, regardless of netdev/driver. 
Updates #19777 Signed-off-by: Jordan Whited --- control/controlknobs/controlknobs.go | 7 ++ net/batching/conn_linux.go | 112 ++++++++++++++++++++---- net/batching/conn_linux_test.go | 126 +++++++++++++++++++++++++-- tailcfg/tailcfg.go | 12 ++- 4 files changed, 234 insertions(+), 23 deletions(-) diff --git a/control/controlknobs/controlknobs.go b/control/controlknobs/controlknobs.go index d41b6703c..93c10f26e 100644 --- a/control/controlknobs/controlknobs.go +++ b/control/controlknobs/controlknobs.go @@ -130,6 +130,11 @@ type Knobs struct { // DisableTUNTCPGRO disables TCP GRO on the Tailscale TUN device. See // [tailcfg.NodeAttrDisableTUNTCPGRO]. DisableTUNTCPGRO atomic.Bool + + // NeverGSOEqualTail enables a UDP GSO sentinel-tail workaround in the + // underlay UDP packet TX path on Linux. Applies to magicsock and peer relay + // UDP sockets. See [tailcfg.NodeAttrNeverGSOEqualTail]. + NeverGSOEqualTail atomic.Bool } // UpdateFromNodeAttributes updates k (if non-nil) based on the provided self @@ -164,6 +169,7 @@ func (k *Knobs) UpdateFromNodeAttributes(capMap tailcfg.NodeCapMap) { disableUDPGSO = has(tailcfg.NodeAttrDisableUDPGSO) disableTUNUDPGRO = has(tailcfg.NodeAttrDisableTUNUDPGRO) disableTUNTCPGRO = has(tailcfg.NodeAttrDisableTUNTCPGRO) + neverGSOEqualTail = has(tailcfg.NodeAttrNeverGSOEqualTail) ) if has(tailcfg.NodeAttrOneCGNATEnable) { @@ -196,6 +202,7 @@ func (k *Knobs) UpdateFromNodeAttributes(capMap tailcfg.NodeCapMap) { k.DisableUDPGSO.Store(disableUDPGSO) k.DisableTUNUDPGRO.Store(disableTUNUDPGRO) k.DisableTUNTCPGRO.Store(disableTUNTCPGRO) + k.NeverGSOEqualTail.Store(neverGSOEqualTail) } // AsDebugJSON returns k as something that can be marshalled with json.Marshal diff --git a/net/batching/conn_linux.go b/net/batching/conn_linux.go index 1ddb08b0b..1718e98dd 100644 --- a/net/batching/conn_linux.go +++ b/net/batching/conn_linux.go @@ -60,6 +60,12 @@ type linuxBatchingConn struct { txOffload atomic.Bool // supports UDP GSO or similar sendBatchPool 
sync.Pool rxqOverflowsMetric *clientmetric.Metric + // neverGSOEqualTail, when non-nil and true, enables a sentinel-tail + // workaround in the UDP GSO TX path. It points at a + // [controlknobs.Knobs.NeverGSOEqualTail] field so the value can be + // toggled live via the control plane without requiring a socket rebind. + // It is read once per write at the top of [linuxBatchingConn.WriteBatchTo]. + neverGSOEqualTail *atomic.Bool // readOpMu guards read operations that must perform accounting against // rxqOverflows in single-threaded fashion. There are no concurrent usages @@ -107,6 +113,12 @@ func (c *linuxBatchingConn) SetWriteDeadline(t time.Time) error { maxIPv6PayloadLen = 1<<16 - 1 - 8 ) +// neverGSOEqualTailSentinelPayload is appended to UDP GSO packet batches under +// certain conditions in order to workaround Linux kernel UDP GSO bugs. In the +// case of magicsock, 0x07 is handled as WireGuard, and wireguard-go silently +// drops the packet as it's less than [device.MinMessageSize]. +var neverGSOEqualTailSentinelPayload = []byte{0x07} + // coalesceMessages iterates 'buffs', setting and coalescing them in 'msgs' // where possible while maintaining datagram order. // @@ -120,20 +132,44 @@ func (c *linuxBatchingConn) SetWriteDeadline(t time.Time) error { // // All msgs[i].Buffers[0] are preceded by a Geneve header (geneve) if geneve.VNI.IsSet(). // +// neverGSOEqualTail, when true, enables the sentinel-tail workaround. It is +// loaded by the caller and passed in so a single coalesceMessages call sees a +// consistent value even if the underlying control knob flips concurrently. +// // TODO(illotum) explore MSG_ZEROCOPY for large writes (>10KB). 
-func (c *linuxBatchingConn) coalesceMessages(addr *net.UDPAddr, geneve packet.GeneveHeader, buffs [][]byte, msgs []ipv6.Message, offset int) int { +func (c *linuxBatchingConn) coalesceMessages(addr *net.UDPAddr, geneve packet.GeneveHeader, buffs [][]byte, msgs []ipv6.Message, offset int, neverGSOEqualTail bool) int { var ( - base = -1 // index of msg we are currently coalescing into - gsoSize int // segmentation size of msgs[base] - dgramCnt int // number of dgrams coalesced into msgs[base] - endBatch bool // tracking flag to start a new batch on next iteration of buffs - coalescedLen int // bytes coalesced into msgs[base] + base = -1 // index of msg we are currently coalescing into + gsoSize int // segmentation size of msgs[base] + dgramCnt int // number of dgrams coalesced into msgs[base] + endBatchDueToSmallerTail bool // tracking flag to start a new batch on next iteration of buffs + coalescedLen int // bytes coalesced into msgs[base] ) maxPayloadLen := maxIPv4PayloadLen if addr.IP.To4() == nil { maxPayloadLen = maxIPv6PayloadLen } + maxDatagramsPerGSOBatch := udpSegmentMaxDatagrams + if neverGSOEqualTail { + // If neverGSOEqualTail is set we might end up appending a sentinel 1-byte + // payload, so we must leave space in our accounting. + maxDatagramsPerGSOBatch -= 1 + maxPayloadLen -= len(neverGSOEqualTailSentinelPayload) + } vniIsSet := geneve.VNI.IsSet() + + maybeAppendSentinelTail := func() { + if !neverGSOEqualTail || endBatchDueToSmallerTail { + // If neverGSOEqualTail is unset we should never append a sentinel + // payload as we are running on an unaffected kernel. Or, if we + // already have a smaller-than-GSO sized tail, there is no need, since + // the kernel bug we are avoiding only triggers when all fragments + // are equal in length. 
+ return + } + msgs[base].Buffers = append(msgs[base].Buffers, neverGSOEqualTailSentinelPayload) + } + for i, buff := range buffs { if vniIsSet { geneve.Encode(buff) @@ -142,32 +178,48 @@ func (c *linuxBatchingConn) coalesceMessages(addr *net.UDPAddr, geneve packet.Ge } if i > 0 { msgLen := len(buff) + // okToCoalesceWithSentinel ensures we never coalesce if a sentinel + // 1-byte payload might be required, but gsoSize (or more specifically + // UDP payload length) is also 1. The whole point of appending a sentinel + // 1-byte payload is to append a smaller-than-GSO tail. + // + // This is defensive as a 1-byte payload, at the time of writing + // (2026-05-28), is unlikely to occur. The smallest WireGuard + // message size is 32 bytes ([device.MinMessageSize]), and the + // [disco.Message] header is 62 bytes. + // + // It's also overly conservative as it checks for msgLen == 1, but a + // msgLen of 1 on the tail where gsoSize is greater would also be fine. + okToCoalesceWithSentinel := !neverGSOEqualTail || msgLen > len(neverGSOEqualTailSentinelPayload) if msgLen+coalescedLen <= maxPayloadLen && msgLen <= gsoSize && - dgramCnt < udpSegmentMaxDatagrams && - !endBatch { + dgramCnt < maxDatagramsPerGSOBatch && + !endBatchDueToSmallerTail && + okToCoalesceWithSentinel { // msgs[base].Buffers[0] is set to buff[i] when a new base is set. // This appends a struct iovec element in the underlying struct msghdr (scatter-gather). msgs[base].Buffers = append(msgs[base].Buffers, buff) - if i == len(buffs)-1 { - setGSOSizeInControl(&msgs[base].OOB, uint16(gsoSize)) - } dgramCnt++ coalescedLen += msgLen if msgLen < gsoSize { // A smaller than gsoSize packet on the tail is legal, but // it must end the batch. 
- endBatch = true + endBatchDueToSmallerTail = true + } + if i == len(buffs)-1 { + maybeAppendSentinelTail() + setGSOSizeInControl(&msgs[base].OOB, uint16(gsoSize)) } continue } } if dgramCnt > 1 { + maybeAppendSentinelTail() setGSOSizeInControl(&msgs[base].OOB, uint16(gsoSize)) } // Reset prior to incrementing base since we are preparing to start a // new potential batch. - endBatch = false + endBatchDueToSmallerTail = false base++ gsoSize = len(buff) msgs[base].OOB = msgs[base].OOB[:0] @@ -199,6 +251,27 @@ func (c *linuxBatchingConn) putSendBatch(batch *sendBatch) { c.sendBatchPool.Put(batch) } +// appendSentinelTailBatchSizeThreshold represents the minimum batch size +// required to enter [linuxBatchingConn.coalesceMessages] when +// [linuxBatchingConn.neverGSOEqualTail] is set. If the batch of packets is less +// than this value, and neverGSOEqualTail is set, we avoid UDP GSO altogether. +// Appending a sentinel packet, regardless of size, is still overhead on sender, +// middle network, and receiver. +// +// Coalescing (UDP GSO) greatly improves performance for sender (and receiver if +// they support UDP GRO), but there are diminishing returns if batches are small. +// We attempt to balance these diminishing returns against the introduction of +// dead-weight sentinel packets. +// +// The initial value of 8 is a power of 2, and in the worst case leads to 6% +// payload overhead if the batch is made up of minimum-sized WireGuard transport +// messages (empty payload keepalives). Worst case is unlikely. +// +// 8 * (20 bytes IPv4 header + 8 byte UDP header + 32 byte WG message) = 480 bytes +// sentinel tail is 20 byte IPv4 header + 8 byte UDP header + 1 byte payload = 29 bytes +// 29/480 = 0.060... 
+const appendSentinelTailBatchSizeThreshold = 8 + func (c *linuxBatchingConn) WriteBatchTo(buffs [][]byte, addr netip.AddrPort, geneve packet.GeneveHeader, offset int) error { batch := c.getSendBatch() defer c.putSendBatch(batch) @@ -212,13 +285,16 @@ func (c *linuxBatchingConn) WriteBatchTo(buffs [][]byte, addr netip.AddrPort, ge batch.ua.IP = batch.ua.IP[:4] } batch.ua.Port = int(addr.Port()) + // Load the control knob once per write so a single call sees a consistent + // value even if the knob flips concurrently. + neverGSOEqualTail := c.neverGSOEqualTail != nil && c.neverGSOEqualTail.Load() var ( n int retried bool ) retry: - if c.txOffload.Load() { - n = c.coalesceMessages(batch.ua, geneve, buffs, batch.msgs, offset) + if c.txOffload.Load() && (!neverGSOEqualTail || len(buffs) >= appendSentinelTailBatchSizeThreshold) { + n = c.coalesceMessages(batch.ua, geneve, buffs, batch.msgs, offset, neverGSOEqualTail) } else { vniIsSet := geneve.VNI.IsSet() if vniIsSet { @@ -535,7 +611,8 @@ func TryUpgradeToConn(pconn nettype.PacketConn, network string, batchSize int, r if network != "udp4" && network != "udp6" { return pconn } - if strings.HasPrefix(hostinfo.GetOSVersion(), "2.") { + osVer := hostinfo.GetOSVersion() + if strings.HasPrefix(osVer, "2.") { // recvmmsg/sendmmsg were added in 2.6.33, but we support down to // 2.6.32 for old NAS devices. See https://github.com/tailscale/tailscale/issues/6807. 
// As a cheap heuristic: if the Linux kernel starts with "2", just @@ -579,6 +656,9 @@ func TryUpgradeToConn(pconn nettype.PacketConn, network string, batchSize int, r var txOffload bool txOffload, b.rxOffload = tryEnableUDPOffload(uc, knobs) b.txOffload.Store(txOffload) + if knobs != nil { + b.neverGSOEqualTail = &knobs.NeverGSOEqualTail + } if len(rxqOverflowsMetricName) > 0 && tryEnableRXQOverflowsCounter(uc) { // Don't register the metric unless the socket option has been // successfully set, otherwise we will report a misleading zero value diff --git a/net/batching/conn_linux_test.go b/net/batching/conn_linux_test.go index fa4eef33c..857c3d9d7 100644 --- a/net/batching/conn_linux_test.go +++ b/net/batching/conn_linux_test.go @@ -140,8 +140,6 @@ func Test_linuxBatchingConn_splitCoalescedMessages(t *testing.T) { } func Test_linuxBatchingConn_coalesceMessages(t *testing.T) { - c := &linuxBatchingConn{} - withGeneveSpace := func(len, cap int) []byte { return make([]byte, len+packet.GeneveFixedHeaderLength, cap+packet.GeneveFixedHeaderLength) } @@ -152,13 +150,17 @@ func Test_linuxBatchingConn_coalesceMessages(t *testing.T) { geneve.VNI.Set(1) cases := []struct { - name string - buffs [][]byte - geneve packet.GeneveHeader + name string + buffs [][]byte + geneve packet.GeneveHeader + neverGSOEqualTail bool // Each wantLens slice corresponds to the Buffers of a single coalesced message, // and each int is the expected length of the corresponding Buffer[i]. wantLens [][]int wantGSO []int + // wantSentinelAtTail[i], when true, asserts that the tail entry of + // msgs[i].Buffers is the shared neverGSOEqualTailSentinelPayload slice. 
+ wantSentinelAtTail []bool }{ { name: "one-message-no-coalesce", @@ -257,10 +259,113 @@ func Test_linuxBatchingConn_coalesceMessages(t *testing.T) { wantLens: [][]int{{2 + packet.GeneveFixedHeaderLength, 2 + packet.GeneveFixedHeaderLength, 2 + packet.GeneveFixedHeaderLength}}, wantGSO: []int{2 + packet.GeneveFixedHeaderLength}, }, + { + name: "two-equal-len-coalesce-neverGSOEqualTail-appends-sentinel", + buffs: [][]byte{ + withGeneveSpace(3, 3), + withGeneveSpace(3, 3), + }, + neverGSOEqualTail: true, + wantLens: [][]int{{3, 3, len(neverGSOEqualTailSentinelPayload)}}, + wantGSO: []int{3}, + wantSentinelAtTail: []bool{true}, + }, + { + name: "two-equal-len-coalesce-neverGSOEqualTail-vni-isSet-appends-sentinel", + buffs: [][]byte{ + withGeneveSpace(3, 3+packet.GeneveFixedHeaderLength), + withGeneveSpace(3, 3), + }, + geneve: geneve, + neverGSOEqualTail: true, + wantLens: [][]int{{3 + packet.GeneveFixedHeaderLength, 3 + packet.GeneveFixedHeaderLength, len(neverGSOEqualTailSentinelPayload)}}, + wantGSO: []int{3 + packet.GeneveFixedHeaderLength}, + wantSentinelAtTail: []bool{true}, + }, + { + name: "two-unequal-len-coalesce-neverGSOEqualTail-smaller-tail-no-sentinel", + buffs: [][]byte{ + withGeneveSpace(3, 3), + withGeneveSpace(2, 2), + }, + neverGSOEqualTail: true, + wantLens: [][]int{{3, 2}}, + wantGSO: []int{3}, + }, + { + name: "one-byte-tail-neverGSOEqualTail-not-coalesced", + // okToCoalesceWithSentinel is false when msgLen == 1 and + // neverGSOEqualTail is set; the 1-byte tail is split into + // its own non-coalesced singleton msg. + buffs: [][]byte{ + withGeneveSpace(2, 2), + withGeneveSpace(1, 1), + }, + neverGSOEqualTail: true, + wantLens: [][]int{{2}, {1}}, + wantGSO: []int{0, 0}, + }, + { + name: "one-byte-tail-neverGSOEqualTail-vni-isSet-coalesced", + // With vniIsSet, msgLen always includes the Geneve header, so + // okToCoalesceWithSentinel is true even for "1-byte payloads". + // The naturally smaller tail short-circuits the sentinel. 
+ buffs: [][]byte{ + withGeneveSpace(2, 2+packet.GeneveFixedHeaderLength), + withGeneveSpace(1, 1), + }, + geneve: geneve, + neverGSOEqualTail: true, + wantLens: [][]int{{2 + packet.GeneveFixedHeaderLength, 1 + packet.GeneveFixedHeaderLength}}, + wantGSO: []int{2 + packet.GeneveFixedHeaderLength}, + }, + { + name: "batch-boundary-sentinel-appended-on-prior-batch-neverGSOEqualTail", + // The 4th buff (length 5) is larger than gsoSize=3 so it + // closes the first batch. The first batch has dgramCnt > 1 and + // no smaller tail, so the sentinel is appended before starting + // the new batch. + buffs: [][]byte{ + withGeneveSpace(3, 3), + withGeneveSpace(3, 3), + withGeneveSpace(3, 3), + withGeneveSpace(5, 5), + }, + neverGSOEqualTail: true, + wantLens: [][]int{{3, 3, 3, len(neverGSOEqualTailSentinelPayload)}, {5}}, + wantGSO: []int{3, 0}, + wantSentinelAtTail: []bool{true, false}, + }, + { + name: "single-buff-neverGSOEqualTail-no-sentinel", + // Only one datagram, no GSO happening, no sentinel. + buffs: [][]byte{ + withGeneveSpace(3, 3), + }, + neverGSOEqualTail: true, + wantLens: [][]int{{3}}, + wantGSO: []int{0}, + }, + { + name: "equal-len-then-smaller-tail-then-equal-neverGSOEqualTail", + // The smaller tail ends the first batch with no sentinel + // (variation already provided), then a second singleton batch + // is started for the trailing equal-length buff. 
+ buffs: [][]byte{ + withGeneveSpace(3, 3), + withGeneveSpace(3, 3), + withGeneveSpace(2, 2), + withGeneveSpace(3, 3), + }, + neverGSOEqualTail: true, + wantLens: [][]int{{3, 3, 2}, {3}}, + wantGSO: []int{3, 0}, + }, } for _, tt := range cases { t.Run(tt.name, func(t *testing.T) { + c := &linuxBatchingConn{} addr := &net.UDPAddr{ IP: net.ParseIP("127.0.0.1"), Port: 1, @@ -270,7 +375,7 @@ func Test_linuxBatchingConn_coalesceMessages(t *testing.T) { msgs[i].Buffers = make([][]byte, 1) msgs[i].OOB = make([]byte, controlMessageSize) } - got := c.coalesceMessages(addr, tt.geneve, tt.buffs, msgs, packet.GeneveFixedHeaderLength) + got := c.coalesceMessages(addr, tt.geneve, tt.buffs, msgs, packet.GeneveFixedHeaderLength, tt.neverGSOEqualTail) if got != len(tt.wantLens) { t.Fatalf("got len %d want: %d", got, len(tt.wantLens)) } @@ -288,6 +393,15 @@ func Test_linuxBatchingConn_coalesceMessages(t *testing.T) { } } + wantSentinel := i < len(tt.wantSentinelAtTail) && tt.wantSentinelAtTail[i] + if wantSentinel { + tail := msgs[i].Buffers[len(msgs[i].Buffers)-1] + if len(tail) != len(neverGSOEqualTailSentinelPayload) || + &tail[0] != &neverGSOEqualTailSentinelPayload[0] { + t.Errorf("msgs[%d] tail buffer is not neverGSOEqualTailSentinelPayload", i) + } + } + // coalesceMessages calls setGSOSizeInControl, which uses a cmsg // type of UDP_SEGMENT, and getGSOSizeInControl scans for a cmsg // type of UDP_GRO. Therefore, we have to use the lower-level diff --git a/tailcfg/tailcfg.go b/tailcfg/tailcfg.go index 65d4fcdd5..96ae15f5c 100644 --- a/tailcfg/tailcfg.go +++ b/tailcfg/tailcfg.go @@ -187,7 +187,8 @@ // - 138: 2026-03-31: can handle C2N /debug/tka. 
// - 139: 2026-05-22: Client understands [NodeAttrEmitRuntimeMetrics] // - 140: 2026-05-27: Client understands [NodeAttrDisableUDPGRO], [NodeAttrDisableUDPGSO], [NodeAttrDisableTUNUDPGRO], [NodeAttrDisableTUNTCPGRO] -const CurrentCapabilityVersion CapabilityVersion = 140 +// - 141: 2026-05-28: Client understands [NodeAttrNeverGSOEqualTail] +const CurrentCapabilityVersion CapabilityVersion = 141 // ID is an integer ID for a user, node, or login allocated by the // control plane. @@ -2830,6 +2831,15 @@ func (p NodeCapabilityPrefix) ToAttribute(value string) NodeCapability { // Currently only consulted on Linux; may apply to other platforms as they // gain TUN TCP GRO support. NodeAttrDisableTUNTCPGRO NodeCapability = "disable-tun-tcp-gro" + + // NodeAttrNeverGSOEqualTail enables a sentinel-tail workaround in the + // underlay UDP packet TX path on Linux. Applies to magicsock and peer relay + // UDP sockets. The workaround avoids emitting UDP GSO batches whose + // fragments are all equal in length, at a small payload and packet overhead + // cost. It exists so control can mitigate kernel regressions that mangle + // UDP headers or checksums for equal-length GSO batches, without requiring + // a client release. See https://github.com/tailscale/tailscale/issues/19777. + NodeAttrNeverGSOEqualTail NodeCapability = "never-gso-equal-tail" ) const (