mirror of
https://github.com/tailscale/tailscale.git
synced 2026-06-23 23:41:41 -04:00
wgengine/magicsock,types/logger: add latency logs for initial peer contacts (#19916)
In order to allow us to measure the performance effects of client-side netmap caching, both with and without the feature enabled, add logs to record how long it takes after a client restart or profile switch for the node to establish contact with peers, relative to the first uncached netmap. We do this by keeping track of a timestamp when the connection is constructed, and logging a record for "new" peer contacts that records how long (in microseconds) it took from the time the peer was recorded as a candidate. The message includes whether the contact was via DERP or direct, and whether a cached netmap was in use at the time. This builds on and extends the counters from #19699, but here we include new contacts whether or not a cached netmap is in use, so that we can establish a baseline for comparison. Updates #12639 Updates tailscale/projects#27 Change-Id: I4f6d050e221f3881848d05a0425c4a5d1a59294c Signed-off-by: M. J. Fromberger <fromberger@tailscale.com>
This commit is contained in:
@@ -141,6 +141,7 @@ type limitData struct {
|
||||
"magicsock: disco: ",
|
||||
"magicsock: ParseEndpoint:",
|
||||
// grinder stats lines
|
||||
"magicsock: new contact: ",
|
||||
"SetPrefs: %v",
|
||||
"peer keys: %s",
|
||||
"v%v peers: %v",
|
||||
|
||||
@@ -139,11 +139,18 @@ func (de *endpoint) setBestAddrLocked(v addrQuality) {
|
||||
if v.epAddr != de.bestAddr.epAddr {
|
||||
de.probeUDPLifetime.resetCycleEndpointLocked()
|
||||
|
||||
// Reaching here, if we are using data from a cached netmap and we are
|
||||
// upgrading from an invalid (missing) address to a valid one, increment
|
||||
// the counter for peers established.
|
||||
if !de.bestAddr.ap.IsValid() && v.ap.IsValid() && de.c.usingCachedNetmap.Load() {
|
||||
metricCachedPeerContactDirect.Add(1)
|
||||
// Reaching here, if we are upgrading from an invalid (missing) address
|
||||
// to a valid one, record metrics:
|
||||
if !de.bestAddr.ap.IsValid() && v.ap.IsValid() {
|
||||
// If we are using data from a cached netmap, increment the counter for peers established.
|
||||
isCached := de.c.usingCachedNetmap.Load()
|
||||
if isCached {
|
||||
metricCachedPeerContactDirect.Add(1)
|
||||
}
|
||||
// Regardless whether the netmap is cached, record how long it has
|
||||
// been since the endpoint was initialized.
|
||||
de.c.logf("magicsock: new contact: peer=%s usec=%d cached=%v via=direct",
|
||||
de.publicKey.ShortString(), int64(mono.Since(de.c.initializedAt)/time.Microsecond), isCached)
|
||||
}
|
||||
}
|
||||
de.bestAddr = v
|
||||
|
||||
@@ -273,7 +273,7 @@ type Conn struct {
|
||||
// discoAtomic is the current disco private and public keypair for this conn.
|
||||
discoAtomic discoAtomic
|
||||
|
||||
// usingCacheNetmap is whether the latest update to self and peersByID are from a cached network map
|
||||
// usingCacheNetmap is whether the latest update to self and peersByID are from a cached network map.
|
||||
usingCachedNetmap atomic.Bool
|
||||
|
||||
// ============================================================
|
||||
@@ -417,6 +417,11 @@ type Conn struct {
|
||||
// metrics contains the metrics for the magicsock instance.
|
||||
metrics *metrics
|
||||
|
||||
// initializedAt records the latest time at which the conn was
|
||||
// (re)initialized for latency metrics. This is set at construction, and
|
||||
// updated by netmap resets.
|
||||
initializedAt mono.Time
|
||||
|
||||
// homeDERPGauge is the usermetric gauge for the home DERP region ID.
|
||||
// This can be nil when [Options.Metrics] are not enabled.
|
||||
homeDERPGauge *usermetric.Gauge
|
||||
@@ -570,13 +575,14 @@ type UDPRelayAllocResp struct {
|
||||
func newConn(logf logger.Logf) *Conn {
|
||||
discoPrivate := key.NewDisco()
|
||||
c := &Conn{
|
||||
logf: logf,
|
||||
derpRecvCh: make(chan derpReadResult, 1), // must be buffered, see issue 3736
|
||||
derpStarted: make(chan struct{}),
|
||||
peerLastDerp: make(map[key.NodePublic]int),
|
||||
peerMap: newPeerMap(),
|
||||
discoInfo: make(map[key.DiscoPublic]*discoInfo),
|
||||
cloudInfo: cloudinfo.New(logf),
|
||||
logf: logf,
|
||||
derpRecvCh: make(chan derpReadResult, 1), // must be buffered, see issue 3736
|
||||
derpStarted: make(chan struct{}),
|
||||
peerLastDerp: make(map[key.NodePublic]int),
|
||||
peerMap: newPeerMap(),
|
||||
discoInfo: make(map[key.DiscoPublic]*discoInfo),
|
||||
cloudInfo: cloudinfo.New(logf),
|
||||
initializedAt: mono.Now(),
|
||||
}
|
||||
c.discoAtomic.Set(discoPrivate)
|
||||
c.bind = &connBind{Conn: c, closed: true}
|
||||
@@ -2362,9 +2368,16 @@ func (c *Conn) handleDiscoMessage(msg []byte, src epAddr, shouldBeRelayHandshake
|
||||
// Reaching here, if we are using data from a cached network map the
|
||||
// receipt of a CallMeMaybe from a peer indicates we have a sufficiently
|
||||
// viable connection to that peer to count it as active while cached.
|
||||
if c.usingCachedNetmap.Load() {
|
||||
isCached := c.usingCachedNetmap.Load()
|
||||
if isCached {
|
||||
metricCachedPeerContactDERP.Add(1)
|
||||
}
|
||||
// If we did not already have an an endpoint for this peer, even a stale
|
||||
// one, record how long it has been since the endpoint was initialized.
|
||||
if !lastBest.ap.IsValid() {
|
||||
c.logf("magicsock: new contact: peer=%s usec=%d cached=%v via=derp",
|
||||
nodeKey.ShortString(), int64(mono.Since(c.initializedAt)/time.Microsecond), isCached)
|
||||
}
|
||||
if isVia {
|
||||
c.dlogf("[v1] magicsock: disco: %v<-%v via %v (%v, %v) got call-me-maybe-via, %d endpoints",
|
||||
c.discoAtomic.Short(), epDisco.short, via.ServerDisco.ShortString(),
|
||||
@@ -3025,7 +3038,11 @@ func (c *Conn) setNetworkMapInternal(self tailcfg.NodeView, peers []tailcfg.Node
|
||||
selfView := c.self
|
||||
peersSnap := c.peerSnapshotLocked()
|
||||
isClosed := c.closed
|
||||
c.usingCachedNetmap.Store(isCached)
|
||||
wasCached := c.usingCachedNetmap.Swap(isCached)
|
||||
if !self.Valid() {
|
||||
c.initializedAt = mono.Now() // the netmap is being reset
|
||||
}
|
||||
initializedAt := c.initializedAt
|
||||
if runtime.GOOS == "linux" && c.controlKnobs != nil {
|
||||
curGRO = c.controlKnobs.DisableUDPGRO.Load()
|
||||
curGSO = c.controlKnobs.DisableUDPGSO.Load()
|
||||
@@ -3040,6 +3057,12 @@ func (c *Conn) setNetworkMapInternal(self tailcfg.NodeView, peers []tailcfg.Node
|
||||
if isClosed {
|
||||
return // nothing to do here, the conn is closed and the update is no longer relevant
|
||||
}
|
||||
if self.Valid() && wasCached && !isCached {
|
||||
// Reaching here, we have a "real" netmap (self is valid), and we are
|
||||
// transitioning from cached to non-cached, so record the latency from
|
||||
// reset.
|
||||
c.logf("magicsock: new contact: control-netmap usec=%d", int64(mono.Since(initializedAt)/time.Microsecond))
|
||||
}
|
||||
|
||||
if udpOffloadKnobsChanged {
|
||||
// A control-plane node attribute toggled UDP GRO or UDP GSO. Rebind
|
||||
|
||||
Reference in New Issue
Block a user