mirror of
https://github.com/tailscale/tailscale.git
synced 2026-04-03 22:25:27 -04:00
When disco keys are learned on a node that is connected to control and has a mapSession, wgengine will see the key as having changed, and assume that any existing connections will need to be reset. For keys learned via TSMP, the connection should not be reset, as that key is learned via an active wireguard connection. If wgengine resets that connection, a 15s timeout will occur. This change adds a map to track new keys coming in via TSMP, and removes them from the list of keys that need to trigger wireguard resets. This is done with an interface chain from controlclient down via localBackend to userspaceEngine via the watchdog. Once a key has been actively used for preventing a wireguard reset, the key is removed from the map. If mapSession becomes a long-lived process instead of being dependent on having a connection to control, this interface chain can be removed, and the event sequence from wrap->controlClient->userspaceEngine can be changed to wrap->userspaceEngine->controlClient, as we know the map will not be gunked up with stale TSMP entries. Updates #12639 Signed-off-by: Claus Lensbøl <claus@tailscale.com>
257 lines
7.1 KiB
Go
257 lines
7.1 KiB
Go
// Copyright (c) Tailscale Inc & contributors
|
|
// SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
//go:build !js && !ts_omit_debug
|
|
|
|
package wgengine
|
|
|
|
import (
|
|
"fmt"
|
|
"log"
|
|
"net/netip"
|
|
"runtime/pprof"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"tailscale.com/envknob"
|
|
"tailscale.com/feature/buildfeatures"
|
|
"tailscale.com/ipn/ipnstate"
|
|
"tailscale.com/net/dns"
|
|
"tailscale.com/net/packet"
|
|
"tailscale.com/tailcfg"
|
|
"tailscale.com/types/key"
|
|
"tailscale.com/types/netmap"
|
|
"tailscale.com/util/clientmetric"
|
|
"tailscale.com/wgengine/filter"
|
|
"tailscale.com/wgengine/router"
|
|
"tailscale.com/wgengine/wgcfg"
|
|
"tailscale.com/wgengine/wgint"
|
|
)
|
|
|
|
// watchdogEvent names an Engine operation that is run under the watchdog.
// The value is used both as the label in watchdog log output and as the
// lookup key for the per-operation timeout metric.
type watchdogEvent string

// The watchdog events. Any is not tied to a single operation; it is the key
// of the aggregate counter that is bumped alongside the specific event.
const (
	Any               watchdogEvent = "Any"
	Reconfig          watchdogEvent = "Reconfig"
	ResetAndStop      watchdogEvent = "ResetAndStop"
	SetFilter         watchdogEvent = "SetFilter"
	SetJailedFilter   watchdogEvent = "SetJailedFilter"
	SetStatusCallback watchdogEvent = "SetStatusCallback"
	UpdateStatus      watchdogEvent = "UpdateStatus"
	RequestStatus     watchdogEvent = "RequestStatus"
	SetNetworkMap     watchdogEvent = "SetNetworkMap"
	Ping              watchdogEvent = "Ping"
	Close             watchdogEvent = "Close"
	PeerForIPEvent    watchdogEvent = "PeerForIP"
)
|
|
|
|
var (
|
|
watchdogMetrics = map[watchdogEvent]*clientmetric.Metric{
|
|
Any: clientmetric.NewCounter("watchdog_timeout_any_total"),
|
|
Reconfig: clientmetric.NewCounter("watchdog_timeout_reconfig"),
|
|
ResetAndStop: clientmetric.NewCounter("watchdog_timeout_resetandstop"),
|
|
SetFilter: clientmetric.NewCounter("watchdog_timeout_setfilter"),
|
|
SetJailedFilter: clientmetric.NewCounter("watchdog_timeout_setjailedfilter"),
|
|
SetStatusCallback: clientmetric.NewCounter("watchdog_timeout_setstatuscallback"),
|
|
UpdateStatus: clientmetric.NewCounter("watchdog_timeout_updatestatus"),
|
|
RequestStatus: clientmetric.NewCounter("watchdog_timeout_requeststatus"),
|
|
SetNetworkMap: clientmetric.NewCounter("watchdog_timeout_setnetworkmap"),
|
|
Ping: clientmetric.NewCounter("watchdog_timeout_ping"),
|
|
Close: clientmetric.NewCounter("watchdog_timeout_close"),
|
|
PeerForIPEvent: clientmetric.NewCounter("watchdog_timeout_peerforipevent"),
|
|
}
|
|
)
|
|
|
|
// NewWatchdog wraps an Engine and makes sure that all methods complete
|
|
// within a reasonable amount of time.
|
|
//
|
|
// If they do not, the watchdog crashes the process.
|
|
func NewWatchdog(e Engine) Engine {
|
|
if envknob.Bool("TS_DEBUG_DISABLE_WATCHDOG") {
|
|
return e
|
|
}
|
|
return &watchdogEngine{
|
|
wrap: e,
|
|
logf: log.Printf,
|
|
fatalf: log.Fatalf,
|
|
maxWait: 45 * time.Second,
|
|
inFlight: make(map[inFlightKey]time.Time),
|
|
}
|
|
}
|
|
|
|
type inFlightKey struct {
|
|
op watchdogEvent
|
|
ctr uint64
|
|
}
|
|
|
|
type watchdogEngine struct {
|
|
wrap Engine
|
|
logf func(format string, args ...any)
|
|
fatalf func(format string, args ...any)
|
|
maxWait time.Duration
|
|
|
|
// Track the start time(s) of in-flight operations
|
|
inFlightMu sync.Mutex
|
|
inFlight map[inFlightKey]time.Time
|
|
inFlightCtr uint64
|
|
}
|
|
|
|
func (e *watchdogEngine) watchdogErr(event watchdogEvent, fn func() error) error {
|
|
// Track all in-flight operations so we can print more useful error
|
|
// messages on watchdog failure
|
|
e.inFlightMu.Lock()
|
|
|
|
key := inFlightKey{
|
|
op: event,
|
|
ctr: e.inFlightCtr,
|
|
}
|
|
e.inFlightCtr++
|
|
e.inFlight[key] = time.Now()
|
|
e.inFlightMu.Unlock()
|
|
|
|
defer func() {
|
|
e.inFlightMu.Lock()
|
|
defer e.inFlightMu.Unlock()
|
|
delete(e.inFlight, key)
|
|
}()
|
|
|
|
errCh := make(chan error)
|
|
go func() {
|
|
errCh <- fn()
|
|
}()
|
|
t := time.NewTimer(e.maxWait)
|
|
select {
|
|
case err := <-errCh:
|
|
t.Stop()
|
|
return err
|
|
case <-t.C:
|
|
buf := new(strings.Builder)
|
|
pprof.Lookup("goroutine").WriteTo(buf, 1)
|
|
e.logf("wgengine watchdog stacks:\n%s", buf.String())
|
|
// Collect the list of in-flight operations for debugging.
|
|
var (
|
|
b []byte
|
|
now = time.Now()
|
|
)
|
|
e.inFlightMu.Lock()
|
|
for k, t := range e.inFlight {
|
|
dur := now.Sub(t).Round(time.Millisecond)
|
|
b = fmt.Appendf(b, "in-flight[%d]: name=%s duration=%v start=%s\n", k.ctr, k.op, dur, t.Format(time.RFC3339Nano))
|
|
}
|
|
e.recordEvent(event)
|
|
e.inFlightMu.Unlock()
|
|
|
|
// Print everything as a single string to avoid log
|
|
// rate limits.
|
|
e.logf("wgengine watchdog in-flight:\n%s", b)
|
|
e.fatalf("wgengine: watchdog timeout on %s", event)
|
|
return nil
|
|
}
|
|
}
|
|
|
|
func (e *watchdogEngine) recordEvent(event watchdogEvent) {
|
|
if watchdogMetrics == nil {
|
|
return
|
|
}
|
|
|
|
mEvent, ok := watchdogMetrics[event]
|
|
if ok {
|
|
mEvent.Add(1)
|
|
}
|
|
mAny, ok := watchdogMetrics[Any]
|
|
if ok {
|
|
mAny.Add(1)
|
|
}
|
|
}
|
|
|
|
func (e *watchdogEngine) watchdog(event watchdogEvent, fn func()) {
|
|
e.watchdogErr(event, func() error {
|
|
fn()
|
|
return nil
|
|
})
|
|
}
|
|
|
|
func (e *watchdogEngine) Reconfig(cfg *wgcfg.Config, routerCfg *router.Config, dnsCfg *dns.Config) error {
|
|
return e.watchdogErr(Reconfig, func() error { return e.wrap.Reconfig(cfg, routerCfg, dnsCfg) })
|
|
}
|
|
|
|
func (e *watchdogEngine) ResetAndStop() (st *Status, err error) {
|
|
e.watchdog(ResetAndStop, func() {
|
|
st, err = e.wrap.ResetAndStop()
|
|
})
|
|
return st, err
|
|
}
|
|
|
|
func (e *watchdogEngine) GetFilter() *filter.Filter {
|
|
return e.wrap.GetFilter()
|
|
}
|
|
|
|
func (e *watchdogEngine) SetFilter(filt *filter.Filter) {
|
|
e.watchdog(SetFilter, func() { e.wrap.SetFilter(filt) })
|
|
}
|
|
|
|
func (e *watchdogEngine) GetJailedFilter() *filter.Filter {
|
|
return e.wrap.GetJailedFilter()
|
|
}
|
|
|
|
func (e *watchdogEngine) SetJailedFilter(filt *filter.Filter) {
|
|
e.watchdog(SetJailedFilter, func() { e.wrap.SetJailedFilter(filt) })
|
|
}
|
|
|
|
func (e *watchdogEngine) SetStatusCallback(cb StatusCallback) {
|
|
e.watchdog(SetStatusCallback, func() { e.wrap.SetStatusCallback(cb) })
|
|
}
|
|
|
|
func (e *watchdogEngine) UpdateStatus(sb *ipnstate.StatusBuilder) {
|
|
e.watchdog(UpdateStatus, func() { e.wrap.UpdateStatus(sb) })
|
|
}
|
|
|
|
func (e *watchdogEngine) RequestStatus() {
|
|
e.watchdog(RequestStatus, func() { e.wrap.RequestStatus() })
|
|
}
|
|
|
|
func (e *watchdogEngine) SetNetworkMap(nm *netmap.NetworkMap) {
|
|
e.watchdog(SetNetworkMap, func() { e.wrap.SetNetworkMap(nm) })
|
|
}
|
|
|
|
func (e *watchdogEngine) Ping(ip netip.Addr, pingType tailcfg.PingType, size int, cb func(*ipnstate.PingResult)) {
|
|
e.watchdog(Ping, func() { e.wrap.Ping(ip, pingType, size, cb) })
|
|
}
|
|
|
|
func (e *watchdogEngine) Close() {
|
|
e.watchdog(Close, e.wrap.Close)
|
|
}
|
|
|
|
func (e *watchdogEngine) PeerForIP(ip netip.Addr) (ret PeerForIP, ok bool) {
|
|
e.watchdog(PeerForIPEvent, func() { ret, ok = e.wrap.PeerForIP(ip) })
|
|
return ret, ok
|
|
}
|
|
|
|
func (e *watchdogEngine) Done() <-chan struct{} {
|
|
return e.wrap.Done()
|
|
}
|
|
|
|
func (e *watchdogEngine) InstallCaptureHook(cb packet.CaptureCallback) {
|
|
if !buildfeatures.HasCapture {
|
|
return
|
|
}
|
|
e.wrap.InstallCaptureHook(cb)
|
|
}
|
|
|
|
func (e *watchdogEngine) PeerByKey(pubKey key.NodePublic) (_ wgint.Peer, ok bool) {
|
|
return e.wrap.PeerByKey(pubKey)
|
|
}
|
|
|
|
func (e *watchdogEngine) PatchDiscoKey(pub key.NodePublic, disco key.DiscoPublic) {
|
|
// PatchDiscoKey mirrors the implementation of [controlclient.patchDiscoKeyer ].
|
|
// It is implemented here to avoid the dependency edge to controlclient, but must be kept
|
|
// in sync with the original implementation.
|
|
type patchDiscoKeyer interface {
|
|
PatchDiscoKey(key.NodePublic, key.DiscoPublic)
|
|
}
|
|
if n, ok := e.wrap.(patchDiscoKeyer); ok {
|
|
n.PatchDiscoKey(pub, disco)
|
|
}
|
|
}
|