// Patch summary: adds an optional "runtimemetrics" feature. A new control knob,
// EmitRuntimeMetrics (set from the "emit-runtime-metrics" node attribute), toggles a
// package-level poller that reads two runtime/metrics samples immediately and then every
// 15 seconds, storing them in clientmetric gauges. LocalBackend calls the hook from
// setNetMapLocked with the knob's value and with false from Shutdown. The feature's
// registration import is excluded when building with the ts_omit_runtimemetrics tag.
// NOTE(review): this diff has lost its original line breaks and indentation; restore them
// from the source commit before trying to apply it.
// NOTE(review): exportSamples indexes clientmetrics by sample position, so it relies on
// pollAndEmit building the samples slice in the same order; keep the two in sync.
// NOTE(review): the poller is package-global; if one process ever runs several
// LocalBackends, Shutdown of one presumably stops emission for all of them. Confirm intended.
diff --git a/cmd/tailscaled/depaware.txt b/cmd/tailscaled/depaware.txt index 88c433757..e59fcaf2a 100644 --- a/cmd/tailscaled/depaware.txt +++ b/cmd/tailscaled/depaware.txt @@ -304,6 +304,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de tailscale.com/feature/posture from tailscale.com/feature/condregister tailscale.com/feature/relayserver from tailscale.com/feature/condregister tailscale.com/feature/routecheck from tailscale.com/feature/condregister + tailscale.com/feature/runtimemetrics from tailscale.com/feature/condregister L tailscale.com/feature/sdnotify from tailscale.com/feature/condregister LD tailscale.com/feature/ssh from tailscale.com/cmd/tailscaled tailscale.com/feature/syspolicy from tailscale.com/feature/condregister+ @@ -756,6 +757,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de regexp/syntax from regexp runtime from archive/tar+ runtime/debug from github.com/aws/aws-sdk-go-v2/internal/sync/singleflight+ + runtime/metrics from tailscale.com/feature/runtimemetrics runtime/pprof from net/http/pprof+ runtime/trace from net/http/pprof slices from tailscale.com/appc+ diff --git a/control/controlknobs/controlknobs.go b/control/controlknobs/controlknobs.go index 77a496349..36e3b6d40 100644 --- a/control/controlknobs/controlknobs.go +++ b/control/controlknobs/controlknobs.go @@ -110,6 +110,10 @@ type Knobs struct { // See https://github.com/tailscale/tailscale/issues/15404. // TODO(bradfitz): remove this a few releases after 2026-02-16. ForceRegisterMagicDNSIPv4Only atomic.Bool + + // EmitRuntimeMetrics is whether the node should poll and emit [runtime/metrics] + // as [tailscale.com/util/clientmetric] metrics. 
+ EmitRuntimeMetrics atomic.Bool } // UpdateFromNodeAttributes updates k (if non-nil) based on the provided self @@ -139,6 +143,7 @@ func (k *Knobs) UpdateFromNodeAttributes(capMap tailcfg.NodeCapMap) { disableSkipStatusQueue = has(tailcfg.NodeAttrDisableSkipStatusQueue) disableHostsFileUpdates = has(tailcfg.NodeAttrDisableHostsFileUpdates) forceRegisterMagicDNSIPv4Only = has(tailcfg.NodeAttrForceRegisterMagicDNSIPv4Only) + emitRuntimeMetrics = has(tailcfg.NodeAttrEmitRuntimeMetrics) ) if has(tailcfg.NodeAttrOneCGNATEnable) { @@ -166,6 +171,7 @@ func (k *Knobs) UpdateFromNodeAttributes(capMap tailcfg.NodeCapMap) { k.DisableSkipStatusQueue.Store(disableSkipStatusQueue) k.DisableHostsFileUpdates.Store(disableHostsFileUpdates) k.ForceRegisterMagicDNSIPv4Only.Store(forceRegisterMagicDNSIPv4Only) + k.EmitRuntimeMetrics.Store(emitRuntimeMetrics) } // AsDebugJSON returns k as something that can be marshalled with json.Marshal diff --git a/feature/buildfeatures/feature_runtimemetrics_disabled.go b/feature/buildfeatures/feature_runtimemetrics_disabled.go new file mode 100644 index 000000000..9e582bdd8 --- /dev/null +++ b/feature/buildfeatures/feature_runtimemetrics_disabled.go @@ -0,0 +1,13 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +// Code generated by gen.go; DO NOT EDIT. + +//go:build ts_omit_runtimemetrics + +package buildfeatures + +// HasRuntimeMetrics is whether the binary was built with support for modular feature "Support emission of runtime/metrics as clientmetrics". +// Specifically, it's whether the binary was NOT built with the "ts_omit_runtimemetrics" build tag. +// It's a const so it can be used for dead code elimination. 
+const HasRuntimeMetrics = false diff --git a/feature/buildfeatures/feature_runtimemetrics_enabled.go b/feature/buildfeatures/feature_runtimemetrics_enabled.go new file mode 100644 index 000000000..c95bb83cb --- /dev/null +++ b/feature/buildfeatures/feature_runtimemetrics_enabled.go @@ -0,0 +1,13 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +// Code generated by gen.go; DO NOT EDIT. + +//go:build !ts_omit_runtimemetrics + +package buildfeatures + +// HasRuntimeMetrics is whether the binary was built with support for modular feature "Support emission of runtime/metrics as clientmetrics". +// Specifically, it's whether the binary was NOT built with the "ts_omit_runtimemetrics" build tag. +// It's a const so it can be used for dead code elimination. +const HasRuntimeMetrics = true diff --git a/feature/condregister/maybe_runtimemetrics.go b/feature/condregister/maybe_runtimemetrics.go new file mode 100644 index 000000000..7b0883ca2 --- /dev/null +++ b/feature/condregister/maybe_runtimemetrics.go @@ -0,0 +1,8 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +//go:build !ts_omit_runtimemetrics + +package condregister + +import _ "tailscale.com/feature/runtimemetrics" diff --git a/feature/featuretags/featuretags.go b/feature/featuretags/featuretags.go index 2c12b6960..0c2099d52 100644 --- a/feature/featuretags/featuretags.go +++ b/feature/featuretags/featuretags.go @@ -239,6 +239,10 @@ type FeatureMeta struct { Sym: "RouteCheck", Desc: "Support checking the reachability of overlapping routers, for choosing between multiple network paths to the same IP address", }, + "runtimemetrics": { + Sym: "RuntimeMetrics", + Desc: "Support emission of runtime/metrics as clientmetrics", + }, "sdnotify": { Sym: "SDNotify", Desc: "systemd notification support", diff --git a/feature/runtimemetrics/runtimemetrics.go b/feature/runtimemetrics/runtimemetrics.go new file mode 100644 index 
000000000..fd9b54ee9 --- /dev/null +++ b/feature/runtimemetrics/runtimemetrics.go @@ -0,0 +1,132 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +// Package runtimemetrics exports select runtime/metrics as [tailscale.com/util/clientmetric] metrics. +package runtimemetrics + +import ( + "runtime/metrics" + "sync" + "time" + + "tailscale.com/feature" + "tailscale.com/ipn/ipnlocal" + "tailscale.com/util/clientmetric" +) + +func init() { + feature.Register("runtimemetrics") + ipnlocal.HookSetRuntimeMetricsEnabled.Set(setEnabled) +} + +var ( + setEnabledMu sync.Mutex // guards runningPoller + runningPoller *poller // non-nil when running, otherwise nil +) + +func setEnabled(enabled bool) { + setEnabledMu.Lock() + defer setEnabledMu.Unlock() + if enabled { + if runningPoller != nil { + return + } + runningPoller = newPoller() + } else { + if runningPoller == nil { + return + } + runningPoller.close() + runningPoller = nil + } +} + +type poller struct { + closeOnce sync.Once + closeCh chan struct{} + wg sync.WaitGroup +} + +func newPoller() *poller { + p := &poller{ + closeCh: make(chan struct{}), + } + p.wg.Go(p.run) + return p +} + +func (p *poller) close() { + p.closeOnce.Do(func() { + close(p.closeCh) + p.wg.Wait() + }) +} + +const ( + // pollInterval is how frequently [poller] polls Go runtime metrics. Its + // value mirrors [tailscale.com/util/clientmetric.minMetricEncodeInterval], + // which is the minimum interval between clientmetrics emissions. + pollInterval = 15 * time.Second +) + +type sampleNameClientmetric struct { + sampleName string // [metrics.Sample.Name] + clientmetricName string // passed to clientmetric.New... + metric *clientmetric.Metric // registered lazily by exportSamples on the first poll +} + +var clientmetrics = []sampleNameClientmetric{ + { + // Memory occupied by live objects and dead objects that have not + // yet been marked free by the garbage collector. 
+ sampleName: "/memory/classes/heap/objects:bytes", + clientmetricName: "runtimemetrics_memory_heap_objects_bytes", + }, + { + // All memory mapped by the Go runtime into the current process + // as read-write. Note that this does not include memory mapped + // by code called via cgo or via the syscall package. Sum of all + // metrics in /memory/classes. + sampleName: "/memory/classes/total:bytes", + clientmetricName: "runtimemetrics_memory_total_bytes", + }, +} + +var registerClientmetricsOnce sync.Once + +func exportSamples(samples []metrics.Sample) { + registerClientmetricsOnce.Do(func() { + for i := range clientmetrics { + clientmetrics[i].metric = clientmetric.NewGauge(clientmetrics[i].clientmetricName) + } + }) + for i := range samples { + if samples[i].Value.Kind() != metrics.KindUint64 { + continue + } + clientmetrics[i].metric.Set(int64(samples[i].Value.Uint64())) + } +} + +func (p *poller) pollAndEmit() { + samples := make([]metrics.Sample, len(clientmetrics)) + for i := range clientmetrics { + samples[i].Name = clientmetrics[i].sampleName + } + metrics.Read(samples) + exportSamples(samples) +} + +func (p *poller) run() { + ticker := time.NewTicker(pollInterval) + defer ticker.Stop() + p.pollAndEmit() // pollAndEmit immediately + for { + select { + case <-p.closeCh: + return + case <-ticker.C: + p.pollAndEmit() + } + } +} diff --git a/feature/runtimemetrics/runtimemetrics_test.go b/feature/runtimemetrics/runtimemetrics_test.go new file mode 100644 index 000000000..c53b6db08 --- /dev/null +++ b/feature/runtimemetrics/runtimemetrics_test.go @@ -0,0 +1,117 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +package runtimemetrics + +import ( + "testing" + "testing/synctest" + "time" +) + +func TestSetEnabledEndToEnd(t *testing.T) { + synctest.Test(t, syncTestSetEnabledEndToEnd) +} + +func syncTestSetEnabledEndToEnd(t *testing.T) { + getPoller := func() *poller { + setEnabledMu.Lock() + defer setEnabledMu.Unlock() + 
return runningPoller + } + + if p := getPoller(); p != nil { + t.Fatalf("runningPoller not nil at test start: %p", p) + } + t.Cleanup(func() { setEnabled(false) }) + + // disabled -> enabled: starts a poller that immediately polls. + setEnabled(true) + p1 := getPoller() + if p1 == nil { + t.Fatal("runningPoller nil after setEnabled(true)") + } + + // Wait for the immediate pollAndEmit to finish and the goroutine to + // block on the ticker. + synctest.Wait() + + // Lazy metric registration must have happened and values must be set + // from a real runtime/metrics read. Both currently-tracked metrics + // (heap objects + total memory) are always >0 in a running Go process. + for i, cm := range clientmetrics { + if cm.metric == nil { + t.Fatalf("clientmetrics[%d] (%s) metric nil after first poll", i, cm.sampleName) + } + if got := cm.metric.Value(); got <= 0 { + t.Errorf("clientmetrics[%d] (%s) = %d after first poll, want > 0", + i, cm.clientmetricName, got) + } + } + + // setEnabled(true) while enabled is idempotent: same poller instance. + setEnabled(true) + if p := getPoller(); p != p1 { + t.Fatalf("setEnabled(true) replaced poller: got %p, want %p", p, p1) + } + + // Overwrite each gauge with a sentinel so we can prove the next tick + // re-reads runtime values. + const sentinel = int64(-1) + for _, cm := range clientmetrics { + cm.metric.Set(sentinel) + } + + // Advance virtual time one tick. The poller's ticker fires and pollAndEmit + // runs again. + time.Sleep(pollInterval) + synctest.Wait() + + for i, cm := range clientmetrics { + if got := cm.metric.Value(); got == sentinel { + t.Errorf("clientmetrics[%d] (%s) still sentinel %d after tick; ticker did not fire", + i, cm.clientmetricName, got) + } else if got <= 0 { + t.Errorf("clientmetrics[%d] (%s) = %d after tick, want > 0", + i, cm.clientmetricName, got) + } + } + + // enabled -> disabled: stops the poller; wg.Wait inside close() means + // the goroutine has exited by the time setEnabled returns. 
+ setEnabled(false) + if p := getPoller(); p != nil { + t.Fatalf("runningPoller %p still set after setEnabled(false)", p) + } + + // After disabling, nothing may write the gauges again: the sentinel stored + // below must survive ten poll intervals of virtual time. + for _, cm := range clientmetrics { + cm.metric.Set(sentinel) + } + time.Sleep(10 * pollInterval) + synctest.Wait() + for i, cm := range clientmetrics { + if got := cm.metric.Value(); got != sentinel { + t.Errorf("clientmetrics[%d] (%s) = %d after disabling; poller goroutine still running?", + i, cm.clientmetricName, got) + } + } + + // disabled -> disabled: still a no-op. + setEnabled(false) + if p := getPoller(); p != nil { + t.Fatalf("runningPoller %p set after second setEnabled(false)", p) + } + + // Re-enable creates a fresh poller, not the closed one. + setEnabled(true) + synctest.Wait() + p2 := getPoller() + if p2 == nil { + t.Fatal("runningPoller nil on re-enable") + } + if p2 == p1 { + t.Fatal("re-enable returned previously-closed poller") + } +} diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index 91f8b7f3c..a49e1296a 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -1286,6 +1286,14 @@ func (b *LocalBackend) Shutdown() { if cc != nil { cc.Shutdown() } + if buildfeatures.HasRuntimeMetrics { + // We disable runtime metrics _after_ the control client is shut down to + // ensure we don't leak the metrics polling goroutine in the case where + // netmap handling races [LocalBackend] shutdown. 
+ if f, ok := HookSetRuntimeMetricsEnabled.GetOk(); ok { + f(false) + } + } b.ctxCancel(errShutdown) b.currentNode().shutdown(errShutdown) b.extHost.Shutdown() @@ -6892,6 +6900,11 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) { b.MagicConn().SetSilentDisco(b.ControlKnobs().SilentDisco.Load()) b.MagicConn().SetProbeUDPLifetime(b.ControlKnobs().ProbeUDPLifetime.Load()) + if buildfeatures.HasRuntimeMetrics { + if f, ok := HookSetRuntimeMetricsEnabled.GetOk(); ok { + f(b.ControlKnobs().EmitRuntimeMetrics.Load()) + } + } if buildfeatures.HasDebug { b.setDebugLogsByCapabilityLocked(nm) @@ -6970,6 +6983,9 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) { } } +// HookSetRuntimeMetricsEnabled is an optional hook for the "runtimemetrics" feature. +var HookSetRuntimeMetricsEnabled feature.Hook[func(enabled bool)] + var hookSetNetMapLockedDrive feature.Hook[func(*LocalBackend, *netmap.NetworkMap)] // roundTraffic rounds bytes. This is used to preserve user privacy within logs. diff --git a/tailcfg/tailcfg.go b/tailcfg/tailcfg.go index 57c68fad6..0f3be5a01 100644 --- a/tailcfg/tailcfg.go +++ b/tailcfg/tailcfg.go @@ -185,7 +185,8 @@ // - 136: 2026-04-09: Client understands [NodeAttrDisableLinuxCGNATDropRule] // - 137: 2026-04-15: Client handles 429 responses to /machine/register. // - 138: 2026-03-31: can handle C2N /debug/tka. -const CurrentCapabilityVersion CapabilityVersion = 138 +// - 139: 2026-05-22: Client understands [NodeAttrEmitRuntimeMetrics] +const CurrentCapabilityVersion CapabilityVersion = 139 // ID is an integer ID for a user, node, or login allocated by the // control plane. @@ -2788,6 +2789,10 @@ func (p NodeCapabilityPrefix) ToAttribute(value string) NodeCapability { // that does not originate from the Tailscale network interface. // This enables access to off-tailnet endpoints within that IP range. 
NodeAttrDisableLinuxCGNATDropRule NodeCapability = "disable-linux-cgnat-drop-rule" + + // NodeAttrEmitRuntimeMetrics enables emission of [runtime/metrics] as + // [tailscale.com/util/clientmetric] metrics. + NodeAttrEmitRuntimeMetrics NodeCapability = "emit-runtime-metrics" ) const (