diff --git a/wgengine/watchdog.go b/wgengine/watchdog.go index 18b36e003..f12b1c19e 100644 --- a/wgengine/watchdog.go +++ b/wgengine/watchdog.go @@ -22,12 +22,47 @@ "tailscale.com/tailcfg" "tailscale.com/types/key" "tailscale.com/types/netmap" + "tailscale.com/util/clientmetric" "tailscale.com/wgengine/filter" "tailscale.com/wgengine/router" "tailscale.com/wgengine/wgcfg" "tailscale.com/wgengine/wgint" ) +type watchdogEvent string + +const ( + Any watchdogEvent = "Any" + Reconfig watchdogEvent = "Reconfig" + ResetAndStop watchdogEvent = "ResetAndStop" + SetFilter watchdogEvent = "SetFilter" + SetJailedFilter watchdogEvent = "SetJailedFilter" + SetStatusCallback watchdogEvent = "SetStatusCallback" + UpdateStatus watchdogEvent = "UpdateStatus" + RequestStatus watchdogEvent = "RequestStatus" + SetNetworkMap watchdogEvent = "SetNetworkMap" + Ping watchdogEvent = "Ping" + Close watchdogEvent = "Close" + PeerForIPEvent watchdogEvent = "PeerForIP" +) + +var ( + watchdogMetrics = map[watchdogEvent]*clientmetric.Metric{ + Any: clientmetric.NewCounter("watchdog_timeout_any_total"), + Reconfig: clientmetric.NewCounter("watchdog_timeout_reconfig"), + ResetAndStop: clientmetric.NewCounter("watchdog_timeout_resetandstop"), + SetFilter: clientmetric.NewCounter("watchdog_timeout_setfilter"), + SetJailedFilter: clientmetric.NewCounter("watchdog_timeout_setjailedfilter"), + SetStatusCallback: clientmetric.NewCounter("watchdog_timeout_setstatuscallback"), + UpdateStatus: clientmetric.NewCounter("watchdog_timeout_updatestatus"), + RequestStatus: clientmetric.NewCounter("watchdog_timeout_requeststatus"), + SetNetworkMap: clientmetric.NewCounter("watchdog_timeout_setnetworkmap"), + Ping: clientmetric.NewCounter("watchdog_timeout_ping"), + Close: clientmetric.NewCounter("watchdog_timeout_close"), + PeerForIPEvent: clientmetric.NewCounter("watchdog_timeout_peerforipevent"), + } +) + // NewWatchdog wraps an Engine and makes sure that all methods complete // within a reasonable amount of time. // @@ -46,7 +81,7 @@ func NewWatchdog(e Engine) Engine { } type inFlightKey struct { - op string + op watchdogEvent ctr uint64 } @@ -62,12 +97,13 @@ type watchdogEngine struct { inFlightCtr uint64 } -func (e *watchdogEngine) watchdogErr(name string, fn func() error) error { +func (e *watchdogEngine) watchdogErr(event watchdogEvent, fn func() error) error { // Track all in-flight operations so we can print more useful error // messages on watchdog failure e.inFlightMu.Lock() + key := inFlightKey{ - op: name, + op: event, ctr: e.inFlightCtr, } e.inFlightCtr++ @@ -93,7 +129,6 @@ func (e *watchdogEngine) watchdogErr(name string, fn func() error) error { buf := new(strings.Builder) pprof.Lookup("goroutine").WriteTo(buf, 1) e.logf("wgengine watchdog stacks:\n%s", buf.String()) - // Collect the list of in-flight operations for debugging. var ( b []byte @@ -104,64 +139,92 @@ func (e *watchdogEngine) watchdogErr(name string, fn func() error) error { dur := now.Sub(t).Round(time.Millisecond) b = fmt.Appendf(b, "in-flight[%d]: name=%s duration=%v start=%s\n", k.ctr, k.op, dur, t.Format(time.RFC3339Nano)) } + e.recordEvent(event) e.inFlightMu.Unlock() // Print everything as a single string to avoid log // rate limits. e.logf("wgengine watchdog in-flight:\n%s", b) - e.fatalf("wgengine: watchdog timeout on %s", name) + e.fatalf("wgengine: watchdog timeout on %s", event) return nil } } -func (e *watchdogEngine) watchdog(name string, fn func()) { - e.watchdogErr(name, func() error { +func (e *watchdogEngine) recordEvent(event watchdogEvent) { + if watchdogMetrics == nil { + return + } + + mEvent, ok := watchdogMetrics[event] + if ok { + mEvent.Add(1) + } + mAny, ok := watchdogMetrics[Any] + if ok { + mAny.Add(1) + } +} + +func (e *watchdogEngine) watchdog(event watchdogEvent, fn func()) { + e.watchdogErr(event, func() error { fn() return nil }) } func (e *watchdogEngine) Reconfig(cfg *wgcfg.Config, routerCfg *router.Config, dnsCfg *dns.Config) error { - return e.watchdogErr("Reconfig", func() error { return e.wrap.Reconfig(cfg, routerCfg, dnsCfg) }) + return e.watchdogErr(Reconfig, func() error { return e.wrap.Reconfig(cfg, routerCfg, dnsCfg) }) } + func (e *watchdogEngine) ResetAndStop() (st *Status, err error) { - e.watchdog("ResetAndStop", func() { + e.watchdog(ResetAndStop, func() { st, err = e.wrap.ResetAndStop() }) return st, err } + func (e *watchdogEngine) GetFilter() *filter.Filter { return e.wrap.GetFilter() } + func (e *watchdogEngine) SetFilter(filt *filter.Filter) { - e.watchdog("SetFilter", func() { e.wrap.SetFilter(filt) }) + e.watchdog(SetFilter, func() { e.wrap.SetFilter(filt) }) } + func (e *watchdogEngine) GetJailedFilter() *filter.Filter { return e.wrap.GetJailedFilter() } + func (e *watchdogEngine) SetJailedFilter(filt *filter.Filter) { - e.watchdog("SetJailedFilter", func() { e.wrap.SetJailedFilter(filt) }) + e.watchdog(SetJailedFilter, func() { e.wrap.SetJailedFilter(filt) }) } + func (e *watchdogEngine) SetStatusCallback(cb StatusCallback) { - e.watchdog("SetStatusCallback", func() { e.wrap.SetStatusCallback(cb) }) + e.watchdog(SetStatusCallback, func() { e.wrap.SetStatusCallback(cb) }) } + func (e *watchdogEngine) UpdateStatus(sb *ipnstate.StatusBuilder) { - e.watchdog("UpdateStatus", func() { e.wrap.UpdateStatus(sb) }) + e.watchdog(UpdateStatus, func() { e.wrap.UpdateStatus(sb) }) } + func (e *watchdogEngine) RequestStatus() { - e.watchdog("RequestStatus", func() { e.wrap.RequestStatus() }) + e.watchdog(RequestStatus, func() { e.wrap.RequestStatus() }) } + func (e *watchdogEngine) SetNetworkMap(nm *netmap.NetworkMap) { - e.watchdog("SetNetworkMap", func() { e.wrap.SetNetworkMap(nm) }) + e.watchdog(SetNetworkMap, func() { e.wrap.SetNetworkMap(nm) }) } + func (e *watchdogEngine) Ping(ip netip.Addr, pingType tailcfg.PingType, size int, cb func(*ipnstate.PingResult)) { - e.watchdog("Ping", func() { e.wrap.Ping(ip, pingType, size, cb) }) + e.watchdog(Ping, func() { e.wrap.Ping(ip, pingType, size, cb) }) } + func (e *watchdogEngine) Close() { - e.watchdog("Close", e.wrap.Close) + e.watchdog(Close, e.wrap.Close) } + func (e *watchdogEngine) PeerForIP(ip netip.Addr) (ret PeerForIP, ok bool) { - e.watchdog("PeerForIP", func() { ret, ok = e.wrap.PeerForIP(ip) }) + e.watchdog(PeerForIPEvent, func() { ret, ok = e.wrap.PeerForIP(ip) }) return ret, ok } diff --git a/wgengine/watchdog_test.go b/wgengine/watchdog_test.go index 47f133373..803233957 100644 --- a/wgengine/watchdog_test.go +++ b/wgengine/watchdog_test.go @@ -1,10 +1,13 @@ // Copyright (c) Tailscale Inc & contributors // SPDX-License-Identifier: BSD-3-Clause +//go:build !js + package wgengine import ( "runtime" + "sync" "testing" "time" @@ -44,3 +47,95 @@ func TestWatchdog(t *testing.T) { e.Close() }) } + +func TestWatchdogMetrics(t *testing.T) { + tests := []struct { + name string + events []watchdogEvent + wantCounts map[watchdogEvent]int64 + }{ + { + name: "single event types", + events: []watchdogEvent{RequestStatus, PeerForIPEvent, Ping}, + wantCounts: map[watchdogEvent]int64{ + RequestStatus: 1, + PeerForIPEvent: 1, + Ping: 1, + }, + }, + { + name: "repeated events", + events: []watchdogEvent{RequestStatus, RequestStatus, Ping, RequestStatus}, + wantCounts: map[watchdogEvent]int64{ + RequestStatus: 3, + Ping: 1, + }, + }, + } + + // For swallowing fatalf calls and stack traces + logf := func(format string, args ...any) {} + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + clearMetrics(t) + bus := eventbustest.NewBus(t) + ht := health.NewTracker(bus) + reg := new(usermetric.Registry) + e, err := NewFakeUserspaceEngine(logf, 0, ht, reg, bus) + if err != nil { + t.Fatal(err) + } + e = NewWatchdog(e) + w := e.(*watchdogEngine) + w.maxWait = 1 * time.Microsecond + w.logf = logf + w.fatalf = logf + + var wg sync.WaitGroup + wg.Add(len(tt.events)) + + for _, ev := range tt.events { + blocked := make(chan struct{}) + w.watchdog(ev, func() { + defer wg.Done() + <-blocked + }) + close(blocked) + } + wg.Wait() + + // Check individual event counts + for ev, want := range tt.wantCounts { + m, ok := watchdogMetrics[ev] + if !ok { + t.Fatalf("no metric found for event %q", ev) + } + got := m.Value() + if got != want { + t.Errorf("got %d metric events for %q, want %d", got, ev, want) + } + } + + // Check total count for Any + m, ok := watchdogMetrics[Any] + if !ok { + t.Fatalf("no Any metric found") + } + got := m.Value() + if got != int64(len(tt.events)) { + t.Errorf("got %d metric events for Any, want %d", got, len(tt.events)) + } + }) + } +} + +func clearMetrics(t *testing.T) { + t.Helper() + if watchdogMetrics == nil { + return + } + for _, m := range watchdogMetrics { + m.Set(0) + } +}