diff --git a/control/controlknobs/controlknobs.go b/control/controlknobs/controlknobs.go index d41b6703c..93c10f26e 100644 --- a/control/controlknobs/controlknobs.go +++ b/control/controlknobs/controlknobs.go @@ -130,6 +130,11 @@ type Knobs struct { // DisableTUNTCPGRO disables TCP GRO on the Tailscale TUN device. See // [tailcfg.NodeAttrDisableTUNTCPGRO]. DisableTUNTCPGRO atomic.Bool + + // NeverGSOEqualTail enables a UDP GSO sentinel-tail workaround in the + // underlay UDP packet TX path on Linux. Applies to magicsock and peer relay + // UDP sockets. See [tailcfg.NodeAttrNeverGSOEqualTail]. + NeverGSOEqualTail atomic.Bool } // UpdateFromNodeAttributes updates k (if non-nil) based on the provided self @@ -164,6 +169,7 @@ func (k *Knobs) UpdateFromNodeAttributes(capMap tailcfg.NodeCapMap) { disableUDPGSO = has(tailcfg.NodeAttrDisableUDPGSO) disableTUNUDPGRO = has(tailcfg.NodeAttrDisableTUNUDPGRO) disableTUNTCPGRO = has(tailcfg.NodeAttrDisableTUNTCPGRO) + neverGSOEqualTail = has(tailcfg.NodeAttrNeverGSOEqualTail) ) if has(tailcfg.NodeAttrOneCGNATEnable) { @@ -196,6 +202,7 @@ func (k *Knobs) UpdateFromNodeAttributes(capMap tailcfg.NodeCapMap) { k.DisableUDPGSO.Store(disableUDPGSO) k.DisableTUNUDPGRO.Store(disableTUNUDPGRO) k.DisableTUNTCPGRO.Store(disableTUNTCPGRO) + k.NeverGSOEqualTail.Store(neverGSOEqualTail) } // AsDebugJSON returns k as something that can be marshalled with json.Marshal diff --git a/net/batching/conn_linux.go b/net/batching/conn_linux.go index 1ddb08b0b..1718e98dd 100644 --- a/net/batching/conn_linux.go +++ b/net/batching/conn_linux.go @@ -60,6 +60,12 @@ type linuxBatchingConn struct { txOffload atomic.Bool // supports UDP GSO or similar sendBatchPool sync.Pool rxqOverflowsMetric *clientmetric.Metric + // neverGSOEqualTail, when non-nil and true, enables a sentinel-tail + // workaround in the UDP GSO TX path. 
It points at a + // [controlknobs.Knobs.NeverGSOEqualTail] field so the value can be + // toggled live via the control plane without requiring a socket rebind. + // It is read once per write at the top of [linuxBatchingConn.WriteBatchTo]. + neverGSOEqualTail *atomic.Bool // readOpMu guards read operations that must perform accounting against // rxqOverflows in single-threaded fashion. There are no concurrent usages @@ -107,6 +113,12 @@ func (c *linuxBatchingConn) SetWriteDeadline(t time.Time) error { maxIPv6PayloadLen = 1<<16 - 1 - 8 ) +// neverGSOEqualTailSentinelPayload is appended to UDP GSO packet batches under +// certain conditions in order to work around Linux kernel UDP GSO bugs. In the +// case of magicsock, 0x07 is handled as WireGuard, and wireguard-go silently +// drops the packet as it's less than [device.MinMessageSize]. +var neverGSOEqualTailSentinelPayload = []byte{0x07} + // coalesceMessages iterates 'buffs', setting and coalescing them in 'msgs' // where possible while maintaining datagram order. // @@ -120,20 +132,44 @@ func (c *linuxBatchingConn) SetWriteDeadline(t time.Time) error { // // All msgs[i].Buffers[0] are preceded by a Geneve header (geneve) if geneve.VNI.IsSet(). // +// neverGSOEqualTail, when true, enables the sentinel-tail workaround. It is +// loaded by the caller and passed in so a single coalesceMessages call sees a +// consistent value even if the underlying control knob flips concurrently. +// // TODO(illotum) explore MSG_ZEROCOPY for large writes (>10KB).
-func (c *linuxBatchingConn) coalesceMessages(addr *net.UDPAddr, geneve packet.GeneveHeader, buffs [][]byte, msgs []ipv6.Message, offset int) int { +func (c *linuxBatchingConn) coalesceMessages(addr *net.UDPAddr, geneve packet.GeneveHeader, buffs [][]byte, msgs []ipv6.Message, offset int, neverGSOEqualTail bool) int { var ( - base = -1 // index of msg we are currently coalescing into - gsoSize int // segmentation size of msgs[base] - dgramCnt int // number of dgrams coalesced into msgs[base] - endBatch bool // tracking flag to start a new batch on next iteration of buffs - coalescedLen int // bytes coalesced into msgs[base] + base = -1 // index of msg we are currently coalescing into + gsoSize int // segmentation size of msgs[base] + dgramCnt int // number of dgrams coalesced into msgs[base] + endBatchDueToSmallerTail bool // tracking flag to start a new batch on next iteration of buffs + coalescedLen int // bytes coalesced into msgs[base] ) maxPayloadLen := maxIPv4PayloadLen if addr.IP.To4() == nil { maxPayloadLen = maxIPv6PayloadLen } + maxDatagramsPerGSOBatch := udpSegmentMaxDatagrams + if neverGSOEqualTail { + // If neverGSOEqualTail is set we might end up appending a sentinel 1-byte + // payload, so we must leave space in our accounting. + maxDatagramsPerGSOBatch -= 1 + maxPayloadLen -= len(neverGSOEqualTailSentinelPayload) + } vniIsSet := geneve.VNI.IsSet() + + maybeAppendSentinelTail := func() { + if !neverGSOEqualTail || endBatchDueToSmallerTail { + // If neverGSOEqualTail is unset we should never append a sentinel + // payload as we are running on an unaffected kernel. Or, if we + // already have a smaller-than-GSO sized tail, there is no need, since + // the kernel bug we are avoiding only triggers when all fragments + // are equal in length. 
+ return + } + msgs[base].Buffers = append(msgs[base].Buffers, neverGSOEqualTailSentinelPayload) + } + for i, buff := range buffs { if vniIsSet { geneve.Encode(buff) @@ -142,32 +178,48 @@ func (c *linuxBatchingConn) coalesceMessages(addr *net.UDPAddr, geneve packet.Ge } if i > 0 { msgLen := len(buff) + // okToCoalesceWithSentinel ensures we never coalesce if a sentinel + // 1-byte payload might be required, but gsoSize (or more specifically + // UDP payload length) is also 1. The whole point of appending a sentinel + // 1-byte payload is to append a smaller-than-GSO tail. + // + // This is defensive as a 1-byte payload, at the time of writing + // (2026-05-28), is unlikely to occur. The smallest WireGuard + // message size is 32 bytes ([device.MinMessageSize]), and the + // [disco.Message] header is 62 bytes. + // + // It's also overly conservative as it checks for msgLen == 1, but a + // msgLen of 1 on the tail where gsoSize is greater would also be fine. + okToCoalesceWithSentinel := !neverGSOEqualTail || msgLen > len(neverGSOEqualTailSentinelPayload) if msgLen+coalescedLen <= maxPayloadLen && msgLen <= gsoSize && - dgramCnt < udpSegmentMaxDatagrams && - !endBatch { + dgramCnt < maxDatagramsPerGSOBatch && + !endBatchDueToSmallerTail && + okToCoalesceWithSentinel { // msgs[base].Buffers[0] is set to buff[i] when a new base is set. // This appends a struct iovec element in the underlying struct msghdr (scatter-gather). msgs[base].Buffers = append(msgs[base].Buffers, buff) - if i == len(buffs)-1 { - setGSOSizeInControl(&msgs[base].OOB, uint16(gsoSize)) - } dgramCnt++ coalescedLen += msgLen if msgLen < gsoSize { // A smaller than gsoSize packet on the tail is legal, but // it must end the batch. 
- endBatch = true + endBatchDueToSmallerTail = true + } + if i == len(buffs)-1 { + maybeAppendSentinelTail() + setGSOSizeInControl(&msgs[base].OOB, uint16(gsoSize)) } continue } } if dgramCnt > 1 { + maybeAppendSentinelTail() setGSOSizeInControl(&msgs[base].OOB, uint16(gsoSize)) } // Reset prior to incrementing base since we are preparing to start a // new potential batch. - endBatch = false + endBatchDueToSmallerTail = false base++ gsoSize = len(buff) msgs[base].OOB = msgs[base].OOB[:0] @@ -199,6 +251,27 @@ func (c *linuxBatchingConn) putSendBatch(batch *sendBatch) { c.sendBatchPool.Put(batch) } +// appendSentinelTailBatchSizeThreshold represents the minimum batch size +// required to enter [linuxBatchingConn.coalesceMessages] when +// [linuxBatchingConn.neverGSOEqualTail] is set. If the batch holds fewer packets +// than this value, and neverGSOEqualTail is set, we avoid UDP GSO altogether. +// Appending a sentinel packet, regardless of size, is still overhead on sender, +// middle network, and receiver. +// +// Coalescing (UDP GSO) greatly improves performance for sender (and receiver if +// they support UDP GRO), but there are diminishing returns if batches are small. +// We attempt to balance these diminishing returns against the introduction of +// dead-weight sentinel packets. +// +// The initial value of 8 is a power of 2. It leads to about 6% on-wire overhead +// when a batch of 8 minimum-sized WireGuard transport messages (empty payload +// keepalives) coalesces into a single GSO message, which is unlikely. +// +// 8 * (20 bytes IPv4 header + 8 byte UDP header + 32 byte WG message) = 480 bytes +// sentinel tail is 20 byte IPv4 header + 8 byte UDP header + 1 byte payload = 29 bytes +// 29/480 = 0.060...
+const appendSentinelTailBatchSizeThreshold = 8 + func (c *linuxBatchingConn) WriteBatchTo(buffs [][]byte, addr netip.AddrPort, geneve packet.GeneveHeader, offset int) error { batch := c.getSendBatch() defer c.putSendBatch(batch) @@ -212,13 +285,16 @@ func (c *linuxBatchingConn) WriteBatchTo(buffs [][]byte, addr netip.AddrPort, ge batch.ua.IP = batch.ua.IP[:4] } batch.ua.Port = int(addr.Port()) + // Load the control knob once per write so a single call sees a consistent + // value even if the knob flips concurrently. + neverGSOEqualTail := c.neverGSOEqualTail != nil && c.neverGSOEqualTail.Load() var ( n int retried bool ) retry: - if c.txOffload.Load() { - n = c.coalesceMessages(batch.ua, geneve, buffs, batch.msgs, offset) + if c.txOffload.Load() && (!neverGSOEqualTail || len(buffs) >= appendSentinelTailBatchSizeThreshold) { + n = c.coalesceMessages(batch.ua, geneve, buffs, batch.msgs, offset, neverGSOEqualTail) } else { vniIsSet := geneve.VNI.IsSet() if vniIsSet { @@ -535,7 +611,8 @@ func TryUpgradeToConn(pconn nettype.PacketConn, network string, batchSize int, r if network != "udp4" && network != "udp6" { return pconn } - if strings.HasPrefix(hostinfo.GetOSVersion(), "2.") { + osVer := hostinfo.GetOSVersion() + if strings.HasPrefix(osVer, "2.") { // recvmmsg/sendmmsg were added in 2.6.33, but we support down to // 2.6.32 for old NAS devices. See https://github.com/tailscale/tailscale/issues/6807. 
// As a cheap heuristic: if the Linux kernel starts with "2", just @@ -579,6 +656,9 @@ func TryUpgradeToConn(pconn nettype.PacketConn, network string, batchSize int, r var txOffload bool txOffload, b.rxOffload = tryEnableUDPOffload(uc, knobs) b.txOffload.Store(txOffload) + if knobs != nil { + b.neverGSOEqualTail = &knobs.NeverGSOEqualTail + } if len(rxqOverflowsMetricName) > 0 && tryEnableRXQOverflowsCounter(uc) { // Don't register the metric unless the socket option has been // successfully set, otherwise we will report a misleading zero value diff --git a/net/batching/conn_linux_test.go b/net/batching/conn_linux_test.go index fa4eef33c..857c3d9d7 100644 --- a/net/batching/conn_linux_test.go +++ b/net/batching/conn_linux_test.go @@ -140,8 +140,6 @@ func Test_linuxBatchingConn_splitCoalescedMessages(t *testing.T) { } func Test_linuxBatchingConn_coalesceMessages(t *testing.T) { - c := &linuxBatchingConn{} - withGeneveSpace := func(len, cap int) []byte { return make([]byte, len+packet.GeneveFixedHeaderLength, cap+packet.GeneveFixedHeaderLength) } @@ -152,13 +150,17 @@ func Test_linuxBatchingConn_coalesceMessages(t *testing.T) { geneve.VNI.Set(1) cases := []struct { - name string - buffs [][]byte - geneve packet.GeneveHeader + name string + buffs [][]byte + geneve packet.GeneveHeader + neverGSOEqualTail bool // Each wantLens slice corresponds to the Buffers of a single coalesced message, // and each int is the expected length of the corresponding Buffer[i]. wantLens [][]int wantGSO []int + // wantSentinelAtTail[i], when true, asserts that the tail entry of + // msgs[i].Buffers is the shared neverGSOEqualTailSentinelPayload slice. 
+ wantSentinelAtTail []bool }{ { name: "one-message-no-coalesce", @@ -257,10 +259,113 @@ func Test_linuxBatchingConn_coalesceMessages(t *testing.T) { wantLens: [][]int{{2 + packet.GeneveFixedHeaderLength, 2 + packet.GeneveFixedHeaderLength, 2 + packet.GeneveFixedHeaderLength}}, wantGSO: []int{2 + packet.GeneveFixedHeaderLength}, }, + { + name: "two-equal-len-coalesce-neverGSOEqualTail-appends-sentinel", + buffs: [][]byte{ + withGeneveSpace(3, 3), + withGeneveSpace(3, 3), + }, + neverGSOEqualTail: true, + wantLens: [][]int{{3, 3, len(neverGSOEqualTailSentinelPayload)}}, + wantGSO: []int{3}, + wantSentinelAtTail: []bool{true}, + }, + { + name: "two-equal-len-coalesce-neverGSOEqualTail-vni-isSet-appends-sentinel", + buffs: [][]byte{ + withGeneveSpace(3, 3+packet.GeneveFixedHeaderLength), + withGeneveSpace(3, 3), + }, + geneve: geneve, + neverGSOEqualTail: true, + wantLens: [][]int{{3 + packet.GeneveFixedHeaderLength, 3 + packet.GeneveFixedHeaderLength, len(neverGSOEqualTailSentinelPayload)}}, + wantGSO: []int{3 + packet.GeneveFixedHeaderLength}, + wantSentinelAtTail: []bool{true}, + }, + { + name: "two-unequal-len-coalesce-neverGSOEqualTail-smaller-tail-no-sentinel", + buffs: [][]byte{ + withGeneveSpace(3, 3), + withGeneveSpace(2, 2), + }, + neverGSOEqualTail: true, + wantLens: [][]int{{3, 2}}, + wantGSO: []int{3}, + }, + { + name: "one-byte-tail-neverGSOEqualTail-not-coalesced", + // okToCoalesceWithSentinel is false when msgLen == 1 and + // neverGSOEqualTail is set; the 1-byte tail is split into + // its own non-coalesced singleton msg. + buffs: [][]byte{ + withGeneveSpace(2, 2), + withGeneveSpace(1, 1), + }, + neverGSOEqualTail: true, + wantLens: [][]int{{2}, {1}}, + wantGSO: []int{0, 0}, + }, + { + name: "one-byte-tail-neverGSOEqualTail-vni-isSet-coalesced", + // With vniIsSet, msgLen always includes the Geneve header, so + // okToCoalesceWithSentinel is true even for "1-byte payloads". + // The naturally smaller tail short-circuits the sentinel. 
+ buffs: [][]byte{ + withGeneveSpace(2, 2+packet.GeneveFixedHeaderLength), + withGeneveSpace(1, 1), + }, + geneve: geneve, + neverGSOEqualTail: true, + wantLens: [][]int{{2 + packet.GeneveFixedHeaderLength, 1 + packet.GeneveFixedHeaderLength}}, + wantGSO: []int{2 + packet.GeneveFixedHeaderLength}, + }, + { + name: "batch-boundary-sentinel-appended-on-prior-batch-neverGSOEqualTail", + // The 4th buff (length 5) is larger than gsoSize=3 so it + // closes the first batch. The first batch has dgramCnt > 1 and + // no smaller tail, so the sentinel is appended before starting + // the new batch. + buffs: [][]byte{ + withGeneveSpace(3, 3), + withGeneveSpace(3, 3), + withGeneveSpace(3, 3), + withGeneveSpace(5, 5), + }, + neverGSOEqualTail: true, + wantLens: [][]int{{3, 3, 3, len(neverGSOEqualTailSentinelPayload)}, {5}}, + wantGSO: []int{3, 0}, + wantSentinelAtTail: []bool{true, false}, + }, + { + name: "single-buff-neverGSOEqualTail-no-sentinel", + // Only one datagram, no GSO happening, no sentinel. + buffs: [][]byte{ + withGeneveSpace(3, 3), + }, + neverGSOEqualTail: true, + wantLens: [][]int{{3}}, + wantGSO: []int{0}, + }, + { + name: "equal-len-then-smaller-tail-then-equal-neverGSOEqualTail", + // The smaller tail ends the first batch with no sentinel + // (its fragment lengths already differ), then a second singleton batch + // is started for the trailing equal-length buff.
+ buffs: [][]byte{ + withGeneveSpace(3, 3), + withGeneveSpace(3, 3), + withGeneveSpace(2, 2), + withGeneveSpace(3, 3), + }, + neverGSOEqualTail: true, + wantLens: [][]int{{3, 3, 2}, {3}}, + wantGSO: []int{3, 0}, + }, } for _, tt := range cases { t.Run(tt.name, func(t *testing.T) { + c := &linuxBatchingConn{} addr := &net.UDPAddr{ IP: net.ParseIP("127.0.0.1"), Port: 1, @@ -270,7 +375,7 @@ func Test_linuxBatchingConn_coalesceMessages(t *testing.T) { msgs[i].Buffers = make([][]byte, 1) msgs[i].OOB = make([]byte, controlMessageSize) } - got := c.coalesceMessages(addr, tt.geneve, tt.buffs, msgs, packet.GeneveFixedHeaderLength) + got := c.coalesceMessages(addr, tt.geneve, tt.buffs, msgs, packet.GeneveFixedHeaderLength, tt.neverGSOEqualTail) if got != len(tt.wantLens) { t.Fatalf("got len %d want: %d", got, len(tt.wantLens)) } @@ -288,6 +393,15 @@ func Test_linuxBatchingConn_coalesceMessages(t *testing.T) { } } + wantSentinel := i < len(tt.wantSentinelAtTail) && tt.wantSentinelAtTail[i] + if wantSentinel { + tail := msgs[i].Buffers[len(msgs[i].Buffers)-1] + if len(tail) != len(neverGSOEqualTailSentinelPayload) || + &tail[0] != &neverGSOEqualTailSentinelPayload[0] { + t.Errorf("msgs[%d] tail buffer is not neverGSOEqualTailSentinelPayload", i) + } + } + // coalesceMessages calls setGSOSizeInControl, which uses a cmsg // type of UDP_SEGMENT, and getGSOSizeInControl scans for a cmsg // type of UDP_GRO. Therefore, we have to use the lower-level diff --git a/tailcfg/tailcfg.go b/tailcfg/tailcfg.go index 65d4fcdd5..96ae15f5c 100644 --- a/tailcfg/tailcfg.go +++ b/tailcfg/tailcfg.go @@ -187,7 +187,8 @@ // - 138: 2026-03-31: can handle C2N /debug/tka. 
// - 139: 2026-05-22: Client understands [NodeAttrEmitRuntimeMetrics] // - 140: 2026-05-27: Client understands [NodeAttrDisableUDPGRO], [NodeAttrDisableUDPGSO], [NodeAttrDisableTUNUDPGRO], [NodeAttrDisableTUNTCPGRO] -const CurrentCapabilityVersion CapabilityVersion = 140 +// - 141: 2026-05-28: Client understands [NodeAttrNeverGSOEqualTail] +const CurrentCapabilityVersion CapabilityVersion = 141 // ID is an integer ID for a user, node, or login allocated by the // control plane. @@ -2830,6 +2831,15 @@ func (p NodeCapabilityPrefix) ToAttribute(value string) NodeCapability { // Currently only consulted on Linux; may apply to other platforms as they // gain TUN TCP GRO support. NodeAttrDisableTUNTCPGRO NodeCapability = "disable-tun-tcp-gro" + + // NodeAttrNeverGSOEqualTail enables a sentinel-tail workaround in the + // underlay UDP packet TX path on Linux. Applies to magicsock and peer relay + // UDP sockets. The workaround avoids emitting UDP GSO batches whose + // fragments are all equal in length, at a small payload and packet overhead + // cost. It exists so control can mitigate kernel regressions that mangle + // UDP headers or checksums for equal-length GSO batches, without requiring + // a client release. See https://github.com/tailscale/tailscale/issues/19777. + NodeAttrNeverGSOEqualTail NodeCapability = "never-gso-equal-tail" ) const (