wgengine/magicsock,types/logger: add latency logs for initial peer contacts (#19916)

In order to allow us to measure the performance effects of client-side netmap
caching, both with and without the feature enabled, add logs to record how long
it takes after a client restart or profile switch for the node to establish
contact with peers, relative to the first uncached netmap.

We do this by keeping track of a timestamp when the connection is constructed,
and logging a record for "new" peer contacts that records how long (in
microseconds) it took from the time the peer was recorded as a candidate.  The
message includes whether the contact was via DERP or direct, and whether a
cached netmap was in use at the time.

This builds on and extends the counters from #19699, but here we include new
contacts whether or not a cached netmap is in use, so that we can establish a
baseline for comparison.

Updates #12639
Updates tailscale/projects#27

Change-Id: I4f6d050e221f3881848d05a0425c4a5d1a59294c
Signed-off-by: M. J. Fromberger <fromberger@tailscale.com>
This commit is contained in:
M. J. Fromberger
2026-06-02 07:34:17 -07:00
committed by GitHub
parent c898aeb0d8
commit a3bec699dc
3 changed files with 46 additions and 15 deletions

View File

@@ -141,6 +141,7 @@ type limitData struct {
"magicsock: disco: ",
"magicsock: ParseEndpoint:",
// grinder stats lines
"magicsock: new contact: ",
"SetPrefs: %v",
"peer keys: %s",
"v%v peers: %v",

View File

@@ -139,11 +139,18 @@ func (de *endpoint) setBestAddrLocked(v addrQuality) {
if v.epAddr != de.bestAddr.epAddr {
de.probeUDPLifetime.resetCycleEndpointLocked()
// Reaching here, if we are using data from a cached netmap and we are
// upgrading from an invalid (missing) address to a valid one, increment
// the counter for peers established.
if !de.bestAddr.ap.IsValid() && v.ap.IsValid() && de.c.usingCachedNetmap.Load() {
metricCachedPeerContactDirect.Add(1)
// Reaching here, if we are upgrading from an invalid (missing) address
// to a valid one, record metrics:
if !de.bestAddr.ap.IsValid() && v.ap.IsValid() {
// If we are using data from a cached netmap, increment the counter for peers established.
isCached := de.c.usingCachedNetmap.Load()
if isCached {
metricCachedPeerContactDirect.Add(1)
}
// Regardless whether the netmap is cached, record how long it has
// been since the endpoint was initialized.
de.c.logf("magicsock: new contact: peer=%s usec=%d cached=%v via=direct",
de.publicKey.ShortString(), int64(mono.Since(de.c.initializedAt)/time.Microsecond), isCached)
}
}
de.bestAddr = v

View File

@@ -273,7 +273,7 @@ type Conn struct {
// discoAtomic is the current disco private and public keypair for this conn.
discoAtomic discoAtomic
// usingCacheNetmap is whether the latest update to self and peersByID are from a cached network map
// usingCacheNetmap is whether the latest update to self and peersByID are from a cached network map.
usingCachedNetmap atomic.Bool
// ============================================================
@@ -417,6 +417,11 @@ type Conn struct {
// metrics contains the metrics for the magicsock instance.
metrics *metrics
// initializedAt records the latest time at which the conn was
// (re)initialized for latency metrics. This is set at construction, and
// updated by netmap resets.
initializedAt mono.Time
// homeDERPGauge is the usermetric gauge for the home DERP region ID.
// This can be nil when [Options.Metrics] are not enabled.
homeDERPGauge *usermetric.Gauge
@@ -570,13 +575,14 @@ type UDPRelayAllocResp struct {
func newConn(logf logger.Logf) *Conn {
discoPrivate := key.NewDisco()
c := &Conn{
logf: logf,
derpRecvCh: make(chan derpReadResult, 1), // must be buffered, see issue 3736
derpStarted: make(chan struct{}),
peerLastDerp: make(map[key.NodePublic]int),
peerMap: newPeerMap(),
discoInfo: make(map[key.DiscoPublic]*discoInfo),
cloudInfo: cloudinfo.New(logf),
logf: logf,
derpRecvCh: make(chan derpReadResult, 1), // must be buffered, see issue 3736
derpStarted: make(chan struct{}),
peerLastDerp: make(map[key.NodePublic]int),
peerMap: newPeerMap(),
discoInfo: make(map[key.DiscoPublic]*discoInfo),
cloudInfo: cloudinfo.New(logf),
initializedAt: mono.Now(),
}
c.discoAtomic.Set(discoPrivate)
c.bind = &connBind{Conn: c, closed: true}
@@ -2362,9 +2368,16 @@ func (c *Conn) handleDiscoMessage(msg []byte, src epAddr, shouldBeRelayHandshake
// Reaching here, if we are using data from a cached network map the
// receipt of a CallMeMaybe from a peer indicates we have a sufficiently
// viable connection to that peer to count it as active while cached.
if c.usingCachedNetmap.Load() {
isCached := c.usingCachedNetmap.Load()
if isCached {
metricCachedPeerContactDERP.Add(1)
}
// If we did not already have an an endpoint for this peer, even a stale
// one, record how long it has been since the endpoint was initialized.
if !lastBest.ap.IsValid() {
c.logf("magicsock: new contact: peer=%s usec=%d cached=%v via=derp",
nodeKey.ShortString(), int64(mono.Since(c.initializedAt)/time.Microsecond), isCached)
}
if isVia {
c.dlogf("[v1] magicsock: disco: %v<-%v via %v (%v, %v) got call-me-maybe-via, %d endpoints",
c.discoAtomic.Short(), epDisco.short, via.ServerDisco.ShortString(),
@@ -3025,7 +3038,11 @@ func (c *Conn) setNetworkMapInternal(self tailcfg.NodeView, peers []tailcfg.Node
selfView := c.self
peersSnap := c.peerSnapshotLocked()
isClosed := c.closed
c.usingCachedNetmap.Store(isCached)
wasCached := c.usingCachedNetmap.Swap(isCached)
if !self.Valid() {
c.initializedAt = mono.Now() // the netmap is being reset
}
initializedAt := c.initializedAt
if runtime.GOOS == "linux" && c.controlKnobs != nil {
curGRO = c.controlKnobs.DisableUDPGRO.Load()
curGSO = c.controlKnobs.DisableUDPGSO.Load()
@@ -3040,6 +3057,12 @@ func (c *Conn) setNetworkMapInternal(self tailcfg.NodeView, peers []tailcfg.Node
if isClosed {
return // nothing to do here, the conn is closed and the update is no longer relevant
}
if self.Valid() && wasCached && !isCached {
// Reaching here, we have a "real" netmap (self is valid), and we are
// transitioning from cached to non-cached, so record the latency from
// reset.
c.logf("magicsock: new contact: control-netmap usec=%d", int64(mono.Since(initializedAt)/time.Microsecond))
}
if udpOffloadKnobsChanged {
// A control-plane node attribute toggled UDP GRO or UDP GSO. Rebind