mirror of
https://github.com/tailscale/tailscale.git
synced 2026-03-26 18:21:15 -04:00
prober: fix queuing delay probe txRecords overflow under high DERP server load (#18803)
The txRecords buffer had two compounding bugs that caused the overflow guard to fire on every send tick under high DERP server load, spamming logs at the full send rate (e.g. 100x/second).

First, int(packetTimeout.Seconds()) truncates fractional-second timeouts, under-allocating the buffer.

Second, the capacity was sized to exactly the theoretical maximum number of in-flight records with no headroom, and the expiry check used strict > rather than >=, so records at exactly the timeout boundary were never evicted by applyTimeouts, leaving len==cap on the very next tick.

Fixes tailscale/corp#37696

Signed-off-by: Mike O'Driscoll <mikeo@tailscale.com>
This commit is contained in:
@@ -17,6 +17,7 @@
|
||||
"io"
|
||||
"log"
|
||||
"maps"
|
||||
"math"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/netip"
|
||||
@@ -423,7 +424,7 @@ type txRecord struct {
|
||||
// for packets up to their timeout. As records age out of the front of this
|
||||
// list, if the associated packet arrives, we won't have a txRecord for it
|
||||
// and will consider it to have timed out.
|
||||
txRecords := make([]txRecord, 0, packetsPerSecond*int(packetTimeout.Seconds()))
|
||||
txRecords := make([]txRecord, 0, int(math.Ceil(float64(packetsPerSecond)*packetTimeout.Seconds()))+1)
|
||||
var txRecordsMu sync.Mutex
|
||||
|
||||
// applyTimeouts walks over txRecords and expires any records that are older
|
||||
@@ -435,7 +436,7 @@ type txRecord struct {
|
||||
now := time.Now()
|
||||
recs := txRecords[:0]
|
||||
for _, r := range txRecords {
|
||||
if now.Sub(r.at) > packetTimeout {
|
||||
if now.Sub(r.at) >= packetTimeout {
|
||||
packetsDropped.Add(1)
|
||||
} else {
|
||||
recs = append(recs, r)
|
||||
@@ -451,9 +452,7 @@ type txRecord struct {
|
||||
pkt := make([]byte, 260) // the same size as a CallMeMaybe packet observed on a Tailscale client.
|
||||
crand.Read(pkt)
|
||||
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
wg.Go(func() {
|
||||
t := time.NewTicker(time.Second / time.Duration(packetsPerSecond))
|
||||
defer t.Stop()
|
||||
|
||||
@@ -481,13 +480,11 @@ type txRecord struct {
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
})
|
||||
|
||||
// Receive the packets.
|
||||
recvFinishedC := make(chan error, 1)
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
wg.Go(func() {
|
||||
defer close(recvFinishedC) // to break out of 'select' below.
|
||||
fromDERPPubKey := fromc.SelfPublicKey()
|
||||
for {
|
||||
@@ -531,7 +528,7 @@ type txRecord struct {
|
||||
// Loop.
|
||||
}
|
||||
}
|
||||
}()
|
||||
})
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
|
||||
Reference in New Issue
Block a user