From a3bec699dcdb6d434fe6c867b8c90f8368fac100 Mon Sep 17 00:00:00 2001 From: "M. J. Fromberger" Date: Tue, 2 Jun 2026 07:34:17 -0700 Subject: [PATCH] wgengine/magicsock,types/logger: add latency logs for initial peer contacts (#19916) In order to allow us to measure the performance effects of client-side netmap caching, both with and without the feature enabled, add logs to record how long it takes after a client restart or profile switch for the node to establish contact with peers, relative to the first uncached netmap. We do this by keeping track of a timestamp when the connection is constructed, and logging a record for "new" peer contacts that records how long (in microseconds) it took from the time the peer was recorded as a candidate. The message includes whether the contact was via DERP or direct, and whether a cached netmap was in use at the time. This builds on and extends the counters from #19699, but here we include new contacts whether or not a cached netmap is in use, so that we can establish a baseline for comparison. Updates #12639 Updates tailscale/projects#27 Change-Id: I4f6d050e221f3881848d05a0425c4a5d1a59294c Signed-off-by: M. J. Fromberger --- types/logger/logger.go | 1 + wgengine/magicsock/endpoint.go | 17 +++++++++---- wgengine/magicsock/magicsock.go | 43 +++++++++++++++++++++++++-------- 3 files changed, 46 insertions(+), 15 deletions(-) diff --git a/types/logger/logger.go b/types/logger/logger.go index 71086e87d..f76bbf19c 100644 --- a/types/logger/logger.go +++ b/types/logger/logger.go @@ -141,6 +141,7 @@ type limitData struct { "magicsock: disco: ", "magicsock: ParseEndpoint:", // grinder stats lines + "magicsock: new contact: ", "SetPrefs: %v", "peer keys: %s", "v%v peers: %v", diff --git a/wgengine/magicsock/endpoint.go b/wgengine/magicsock/endpoint.go index d1430ca4e..515b15dc5 100644 --- a/wgengine/magicsock/endpoint.go +++ b/wgengine/magicsock/endpoint.go @@ -139,11 +139,18 @@ func (de *endpoint) setBestAddrLocked(v addrQuality) { if v.epAddr != de.bestAddr.epAddr { de.probeUDPLifetime.resetCycleEndpointLocked() - // Reaching here, if we are using data from a cached netmap and we are - // upgrading from an invalid (missing) address to a valid one, increment - // the counter for peers established. - if !de.bestAddr.ap.IsValid() && v.ap.IsValid() && de.c.usingCachedNetmap.Load() { - metricCachedPeerContactDirect.Add(1) + // Reaching here, if we are upgrading from an invalid (missing) address + // to a valid one, record metrics: + if !de.bestAddr.ap.IsValid() && v.ap.IsValid() { + // If we are using data from a cached netmap, increment the counter for peers established. + isCached := de.c.usingCachedNetmap.Load() + if isCached { + metricCachedPeerContactDirect.Add(1) + } + // Regardless whether the netmap is cached, record how long it has + // been since the endpoint was initialized. + de.c.logf("magicsock: new contact: peer=%s usec=%d cached=%v via=direct", + de.publicKey.ShortString(), int64(mono.Since(de.c.initializedAt)/time.Microsecond), isCached) } } de.bestAddr = v diff --git a/wgengine/magicsock/magicsock.go b/wgengine/magicsock/magicsock.go index 0a9a2d711..4fe0fa29f 100644 --- a/wgengine/magicsock/magicsock.go +++ b/wgengine/magicsock/magicsock.go @@ -273,7 +273,7 @@ type Conn struct { // discoAtomic is the current disco private and public keypair for this conn. discoAtomic discoAtomic - // usingCacheNetmap is whether the latest update to self and peersByID are from a cached network map + // usingCacheNetmap is whether the latest update to self and peersByID are from a cached network map. usingCachedNetmap atomic.Bool // ============================================================ @@ -417,6 +417,11 @@ type Conn struct { // metrics contains the metrics for the magicsock instance. metrics *metrics + // initializedAt records the latest time at which the conn was + // (re)initialized for latency metrics. This is set at construction, and + // updated by netmap resets. + initializedAt mono.Time + // homeDERPGauge is the usermetric gauge for the home DERP region ID. // This can be nil when [Options.Metrics] are not enabled. homeDERPGauge *usermetric.Gauge @@ -570,13 +575,14 @@ type UDPRelayAllocResp struct { func newConn(logf logger.Logf) *Conn { discoPrivate := key.NewDisco() c := &Conn{ - logf: logf, - derpRecvCh: make(chan derpReadResult, 1), // must be buffered, see issue 3736 - derpStarted: make(chan struct{}), - peerLastDerp: make(map[key.NodePublic]int), - peerMap: newPeerMap(), - discoInfo: make(map[key.DiscoPublic]*discoInfo), - cloudInfo: cloudinfo.New(logf), + logf: logf, + derpRecvCh: make(chan derpReadResult, 1), // must be buffered, see issue 3736 + derpStarted: make(chan struct{}), + peerLastDerp: make(map[key.NodePublic]int), + peerMap: newPeerMap(), + discoInfo: make(map[key.DiscoPublic]*discoInfo), + cloudInfo: cloudinfo.New(logf), + initializedAt: mono.Now(), } c.discoAtomic.Set(discoPrivate) c.bind = &connBind{Conn: c, closed: true} @@ -2362,9 +2368,16 @@ func (c *Conn) handleDiscoMessage(msg []byte, src epAddr, shouldBeRelayHandshake // Reaching here, if we are using data from a cached network map the // receipt of a CallMeMaybe from a peer indicates we have a sufficiently // viable connection to that peer to count it as active while cached. - if c.usingCachedNetmap.Load() { + isCached := c.usingCachedNetmap.Load() + if isCached { metricCachedPeerContactDERP.Add(1) } + // If we did not already have an an endpoint for this peer, even a stale + // one, record how long it has been since the endpoint was initialized. + if !lastBest.ap.IsValid() { + c.logf("magicsock: new contact: peer=%s usec=%d cached=%v via=derp", + nodeKey.ShortString(), int64(mono.Since(c.initializedAt)/time.Microsecond), isCached) + } if isVia { c.dlogf("[v1] magicsock: disco: %v<-%v via %v (%v, %v) got call-me-maybe-via, %d endpoints", c.discoAtomic.Short(), epDisco.short, via.ServerDisco.ShortString(), @@ -3025,7 +3038,11 @@ func (c *Conn) setNetworkMapInternal(self tailcfg.NodeView, peers []tailcfg.Node selfView := c.self peersSnap := c.peerSnapshotLocked() isClosed := c.closed - c.usingCachedNetmap.Store(isCached) + wasCached := c.usingCachedNetmap.Swap(isCached) + if !self.Valid() { + c.initializedAt = mono.Now() // the netmap is being reset + } + initializedAt := c.initializedAt if runtime.GOOS == "linux" && c.controlKnobs != nil { curGRO = c.controlKnobs.DisableUDPGRO.Load() curGSO = c.controlKnobs.DisableUDPGSO.Load() @@ -3040,6 +3057,12 @@ func (c *Conn) setNetworkMapInternal(self tailcfg.NodeView, peers []tailcfg.Node if isClosed { return // nothing to do here, the conn is closed and the update is no longer relevant } + if self.Valid() && wasCached && !isCached { + // Reaching here, we have a "real" netmap (self is valid), and we are + // transitioning from cached to non-cached, so record the latency from + // reset. + c.logf("magicsock: new contact: control-netmap usec=%d", int64(mono.Since(initializedAt)/time.Microsecond)) + } if udpOffloadKnobsChanged { // A control-plane node attribute toggled UDP GRO or UDP GSO. Rebind