control/controlknobs,feature/*,ipn/ipnlocal,tailcfg: add runtimemetrics

Emit runtime metrics as clientmetrics when the
NodeAttrEmitRuntimeMetrics NodeCapability is present.

We start small with just two metrics: heap object bytes and the total memory mapped by the Go runtime.

Updates tailscale/corp#39434

Signed-off-by: Jordan Whited <jordan@tailscale.com>
This commit is contained in:
Jordan Whited
2026-05-22 14:19:42 -07:00
committed by Jordan Whited
parent 2eb45c2457
commit e5a8cf3b18
10 changed files with 317 additions and 1 deletions

View File

@@ -304,6 +304,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de
tailscale.com/feature/posture from tailscale.com/feature/condregister
tailscale.com/feature/relayserver from tailscale.com/feature/condregister
tailscale.com/feature/routecheck from tailscale.com/feature/condregister
tailscale.com/feature/runtimemetrics from tailscale.com/feature/condregister
L tailscale.com/feature/sdnotify from tailscale.com/feature/condregister
LD tailscale.com/feature/ssh from tailscale.com/cmd/tailscaled
tailscale.com/feature/syspolicy from tailscale.com/feature/condregister+
@@ -756,6 +757,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de
regexp/syntax from regexp
runtime from archive/tar+
runtime/debug from github.com/aws/aws-sdk-go-v2/internal/sync/singleflight+
runtime/metrics from tailscale.com/feature/runtimemetrics
runtime/pprof from net/http/pprof+
runtime/trace from net/http/pprof
slices from tailscale.com/appc+

View File

@@ -110,6 +110,10 @@ type Knobs struct {
// See https://github.com/tailscale/tailscale/issues/15404.
// TODO(bradfitz): remove this a few releases after 2026-02-16.
ForceRegisterMagicDNSIPv4Only atomic.Bool
// EmitRuntimeMetrics is whether the node should poll and emit [runtime/metrics]
// as [tailscale.com/util/clientmetric]'s.
EmitRuntimeMetrics atomic.Bool
}
// UpdateFromNodeAttributes updates k (if non-nil) based on the provided self
@@ -139,6 +143,7 @@ func (k *Knobs) UpdateFromNodeAttributes(capMap tailcfg.NodeCapMap) {
disableSkipStatusQueue = has(tailcfg.NodeAttrDisableSkipStatusQueue)
disableHostsFileUpdates = has(tailcfg.NodeAttrDisableHostsFileUpdates)
forceRegisterMagicDNSIPv4Only = has(tailcfg.NodeAttrForceRegisterMagicDNSIPv4Only)
emitRuntimeMetrics = has(tailcfg.NodeAttrEmitRuntimeMetrics)
)
if has(tailcfg.NodeAttrOneCGNATEnable) {
@@ -166,6 +171,7 @@ func (k *Knobs) UpdateFromNodeAttributes(capMap tailcfg.NodeCapMap) {
k.DisableSkipStatusQueue.Store(disableSkipStatusQueue)
k.DisableHostsFileUpdates.Store(disableHostsFileUpdates)
k.ForceRegisterMagicDNSIPv4Only.Store(forceRegisterMagicDNSIPv4Only)
k.EmitRuntimeMetrics.Store(emitRuntimeMetrics)
}
// AsDebugJSON returns k as something that can be marshalled with json.Marshal

View File

@@ -0,0 +1,13 @@
// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
// Code generated by gen.go; DO NOT EDIT.
//go:build ts_omit_runtimemetrics
package buildfeatures
// HasRuntimeMetrics is whether the binary was built with support for modular feature "Support emission of runtime/metrics as clientmetrics".
// Specifically, it's whether the binary was NOT built with the "ts_omit_runtimemetrics" build tag.
// It's a const so it can be used for dead code elimination.
const HasRuntimeMetrics = false

View File

@@ -0,0 +1,13 @@
// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
// Code generated by gen.go; DO NOT EDIT.
//go:build !ts_omit_runtimemetrics
package buildfeatures
// HasRuntimeMetrics is whether the binary was built with support for modular feature "Support emission of runtime/metrics as clientmetrics".
// Specifically, it's whether the binary was NOT built with the "ts_omit_runtimemetrics" build tag.
// It's a const so it can be used for dead code elimination.
const HasRuntimeMetrics = true

View File

@@ -0,0 +1,8 @@
// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
//go:build !ts_omit_runtimemetrics
package condregister
import _ "tailscale.com/feature/runtimemetrics"

View File

@@ -239,6 +239,10 @@ type FeatureMeta struct {
Sym: "RouteCheck",
Desc: "Support checking the reachability of overlapping routers, for choosing between multiple network paths to the same IP address",
},
"runtimemetrics": {
Sym: "RuntimeMetrics",
Desc: "Support emission of runtime/metrics as clientmetrics",
},
"sdnotify": {
Sym: "SDNotify",
Desc: "systemd notification support",

View File

@@ -0,0 +1,132 @@
// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
// Package runtimemetrics exports select runtime/metrics as [tailscale.com/util/clientmetric]'s.
package runtimemetrics
import (
"runtime/metrics"
"sync"
"time"
"tailscale.com/feature"
"tailscale.com/ipn/ipnlocal"
"tailscale.com/util/clientmetric"
)
// init registers the "runtimemetrics" feature and installs setEnabled as the
// ipnlocal hook through which runtime metrics polling is switched on and off.
func init() {
feature.Register("runtimemetrics")
ipnlocal.HookSetRuntimeMetricsEnabled.Set(setEnabled)
}
// Package-level poller state. setEnabled starts and stops the poller and
// always does so while holding setEnabledMu.
var (
setEnabledMu sync.Mutex // guards runningPoller
runningPoller *poller // non-nil when running, otherwise nil
)
// setEnabled starts or stops the package's runtime metrics poller so that its
// state matches enabled. It is idempotent: enabling while a poller is already
// running, or disabling while none is running, does nothing. When it stops a
// poller it does not return until that poller's goroutine has exited.
func setEnabled(enabled bool) {
	setEnabledMu.Lock()
	defer setEnabledMu.Unlock()

	switch {
	case enabled && runningPoller == nil:
		runningPoller = newPoller()
	case !enabled && runningPoller != nil:
		// close blocks until the polling goroutine has returned.
		runningPoller.close()
		runningPoller = nil
	}
}
// poller polls Go runtime metrics on a background goroutine (see run) and
// exports them as clientmetrics. Create one with newPoller and stop it with
// close.
type poller struct {
closeOnce sync.Once // ensures the shutdown in close happens only once
closeCh chan struct{} // closed by close to tell run to return
wg sync.WaitGroup // tracks the run goroutine so close can wait for it
}
// newPoller returns a poller whose run loop has already been started on its
// own goroutine. Call close on the result to stop it.
func newPoller() *poller {
	p := new(poller)
	p.closeCh = make(chan struct{})
	p.wg.Go(p.run)
	return p
}
// close signals the poller's goroutine to stop and waits for it to return.
// Only the first call has any effect.
func (p *poller) close() {
	stop := func() {
		close(p.closeCh)
		p.wg.Wait()
	}
	p.closeOnce.Do(stop)
}
// pollInterval is the period at which a [poller] reads Go runtime metrics.
// It is set to the same value as
// [tailscale.com/util/clientmetric.minMetricEncodeInterval], the minimum
// interval between clientmetrics emissions.
const pollInterval = 15 * time.Second
// sampleNameClientmetric pairs a runtime/metrics sample name with the
// clientmetric gauge its value is exported under.
type sampleNameClientmetric struct {
sampleName string // [metrics.Sample.Name] to read from the Go runtime
clientmetricName string // name passed to clientmetric.NewGauge
metric *clientmetric.Metric // nil until the first exportSamples call registers it
}
// clientmetrics is the table of runtime/metrics samples that get exported as
// clientmetric gauges. pollAndEmit builds its sample slice in this order and
// exportSamples indexes this slice by sample position, so the two must stay
// index-aligned.
var clientmetrics = []sampleNameClientmetric{
{
// Memory occupied by live objects and dead objects that have not
// yet been marked free by the garbage collector.
sampleName: "/memory/classes/heap/objects:bytes",
clientmetricName: "runtimemetrics_memory_heap_objects_bytes",
},
{
// All memory mapped by the Go runtime into the current process
// as read-write. Note that this does not include memory mapped
// by code called via cgo or via the syscall package. Sum of all
// metrics in /memory/classes.
sampleName: "/memory/classes/total:bytes",
clientmetricName: "runtimemetrics_memory_total_bytes",
},
}
// registerClientmetricsOnce guards the one-time creation of the gauges held
// in clientmetrics.
var registerClientmetricsOnce sync.Once

// exportSamples stores each uint64-valued sample into the gauge found at the
// same index of clientmetrics, creating all gauges on the first call. Samples
// of any other kind are skipped. samples is expected to be index-aligned with
// clientmetrics.
func exportSamples(samples []metrics.Sample) {
	registerClientmetricsOnce.Do(func() {
		for i := range clientmetrics {
			cm := &clientmetrics[i]
			cm.metric = clientmetric.NewGauge(cm.clientmetricName)
		}
	})
	for i, s := range samples {
		if s.Value.Kind() == metrics.KindUint64 {
			clientmetrics[i].metric.Set(int64(s.Value.Uint64()))
		}
	}
}
// pollAndEmit reads the current value of every runtime metric named in
// clientmetrics and passes the samples, in table order, to exportSamples.
func (p *poller) pollAndEmit() {
	samples := make([]metrics.Sample, 0, len(clientmetrics))
	for i := range clientmetrics {
		samples = append(samples, metrics.Sample{Name: clientmetrics[i].sampleName})
	}
	metrics.Read(samples)
	exportSamples(samples)
}
// run is the body of the poller's goroutine. It polls once immediately and
// then once per pollInterval tick, returning as soon as closeCh is closed.
func (p *poller) run() {
	ticker := time.NewTicker(pollInterval)
	defer ticker.Stop()

	for {
		p.pollAndEmit()
		select {
		case <-ticker.C:
			// Fall through to the next poll at the top of the loop.
		case <-p.closeCh:
			return
		}
	}
}

View File

@@ -0,0 +1,117 @@
// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
package runtimemetrics
import (
"testing"
"testing/synctest"
"time"
)
// TestSetEnabledEndToEnd runs syncTestSetEnabledEndToEnd via synctest.Test so
// that the poller's ticker can be advanced with virtual time rather than real
// sleeps.
func TestSetEnabledEndToEnd(t *testing.T) {
synctest.Test(t, syncTestSetEnabledEndToEnd)
}
// syncTestSetEnabledEndToEnd walks setEnabled through its state transitions
// (disabled -> enabled -> enabled -> disabled -> disabled -> enabled) and
// checks, at each step, both the package-level runningPoller and the values
// of the gauges in clientmetrics. It is meant to be run by synctest.Test; the
// steps depend on each other and must stay in this order.
func syncTestSetEnabledEndToEnd(t *testing.T) {
// getPoller reads runningPoller under its mutex.
getPoller := func() *poller {
setEnabledMu.Lock()
defer setEnabledMu.Unlock()
return runningPoller
}
if p := getPoller(); p != nil {
t.Fatalf("runningPoller not nil at test start: %p", p)
}
// Make sure no poller is left running however the test exits.
t.Cleanup(func() { setEnabled(false) })
// disabled -> enabled: starts a poller that immediately polls.
setEnabled(true)
p1 := getPoller()
if p1 == nil {
t.Fatal("runningPoller nil after setEnabled(true)")
}
// Wait for the immediate pollAndEmit to finish and the goroutine to
// block on the ticker.
synctest.Wait()
// Lazy metric registration must have happened and values must be set
// from a real runtime/metrics read. Both currently-tracked metrics
// (heap objects + total memory) are always >0 in a running Go process.
for i, cm := range clientmetrics {
if cm.metric == nil {
t.Fatalf("clientmetrics[%d] (%s) metric nil after first poll", i, cm.sampleName)
}
if got := cm.metric.Value(); got <= 0 {
t.Errorf("clientmetrics[%d] (%s) = %d after first poll, want > 0",
i, cm.clientmetricName, got)
}
}
// setEnabled(true) while enabled is idempotent: same poller instance.
setEnabled(true)
if p := getPoller(); p != p1 {
t.Fatalf("setEnabled(true) replaced poller: got %p, want %p", p, p1)
}
// Overwrite each gauge with a sentinel so we can prove the next tick
// re-reads runtime values.
const sentinel = int64(-1)
for _, cm := range clientmetrics {
cm.metric.Set(sentinel)
}
// Advance virtual time one tick. The poller's ticker fires and pollAndEmit
// runs again.
time.Sleep(pollInterval)
synctest.Wait()
for i, cm := range clientmetrics {
if got := cm.metric.Value(); got == sentinel {
t.Errorf("clientmetrics[%d] (%s) still sentinel %d after tick; ticker did not fire",
i, cm.clientmetricName, got)
} else if got <= 0 {
t.Errorf("clientmetrics[%d] (%s) = %d after tick, want > 0",
i, cm.clientmetricName, got)
}
}
// enabled -> disabled: stops the poller; wg.Wait inside close() means
// the goroutine has exited by the time setEnabled returns.
setEnabled(false)
if p := getPoller(); p != nil {
t.Fatalf("runningPoller %p still set after setEnabled(false)", p)
}
// After disabling, nothing should write to the gauges any more. Store the
// sentinel in each gauge, let many poll intervals elapse, and verify the
// sentinel is still there (no further ticks overwrote it).
for _, cm := range clientmetrics {
cm.metric.Set(sentinel)
}
time.Sleep(10 * pollInterval)
synctest.Wait()
for i, cm := range clientmetrics {
if got := cm.metric.Value(); got != sentinel {
t.Errorf("clientmetrics[%d] (%s) = %d after disabling; poller goroutine still running?",
i, cm.clientmetricName, got)
}
}
// disabled -> disabled: still a no-op.
setEnabled(false)
if p := getPoller(); p != nil {
t.Fatalf("runningPoller %p set after second setEnabled(false)", p)
}
// Re-enable creates a fresh poller, not the closed one.
setEnabled(true)
synctest.Wait()
p2 := getPoller()
if p2 == nil {
t.Fatal("runningPoller nil on re-enable")
}
if p2 == p1 {
t.Fatal("re-enable returned previously-closed poller")
}
}

View File

@@ -1286,6 +1286,14 @@ func (b *LocalBackend) Shutdown() {
if cc != nil {
cc.Shutdown()
}
if buildfeatures.HasRuntimeMetrics {
// We disable runtime metrics _after_ the control client is shutdown to
// ensure we don't leak the metrics polling goroutine in the case where
// netmap handling races [LocalBackend] shutdown.
if f, ok := HookSetRuntimeMetricsEnabled.GetOk(); ok {
f(false)
}
}
b.ctxCancel(errShutdown)
b.currentNode().shutdown(errShutdown)
b.extHost.Shutdown()
@@ -6892,6 +6900,11 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) {
b.MagicConn().SetSilentDisco(b.ControlKnobs().SilentDisco.Load())
b.MagicConn().SetProbeUDPLifetime(b.ControlKnobs().ProbeUDPLifetime.Load())
if buildfeatures.HasRuntimeMetrics {
if f, ok := HookSetRuntimeMetricsEnabled.GetOk(); ok {
f(b.ControlKnobs().EmitRuntimeMetrics.Load())
}
}
if buildfeatures.HasDebug {
b.setDebugLogsByCapabilityLocked(nm)
@@ -6970,6 +6983,9 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) {
}
}
// HookSetRuntimeMetricsEnabled is an optional hook for the "runtimemetrics" feature.
// When set, [LocalBackend] calls it from setNetMapLocked with the current value
// of the EmitRuntimeMetrics control knob, and from Shutdown with false.
var HookSetRuntimeMetricsEnabled feature.Hook[func(enabled bool)]
var hookSetNetMapLockedDrive feature.Hook[func(*LocalBackend, *netmap.NetworkMap)]
// roundTraffic rounds bytes. This is used to preserve user privacy within logs.

View File

@@ -185,7 +185,8 @@
// - 136: 2026-04-09: Client understands [NodeAttrDisableLinuxCGNATDropRule]
// - 137: 2026-04-15: Client handles 429 responses to /machine/register.
// - 138: 2026-03-31: can handle C2N /debug/tka.
const CurrentCapabilityVersion CapabilityVersion = 138
// - 139: 2026-05-22: Client understands [NodeAttrEmitRuntimeMetrics]
const CurrentCapabilityVersion CapabilityVersion = 139
// ID is an integer ID for a user, node, or login allocated by the
// control plane.
@@ -2788,6 +2789,10 @@ func (p NodeCapabilityPrefix) ToAttribute(value string) NodeCapability {
// that does not originate from the Tailscale network interface.
// This enables access to off-tailnet endpoints within that IP range.
NodeAttrDisableLinuxCGNATDropRule NodeCapability = "disable-linux-cgnat-drop-rule"
// NodeAttrEmitRuntimeMetrics enables emission of [runtime/metrics] as
// [tailscale.com/util/clientmetric]'s.
NodeAttrEmitRuntimeMetrics NodeCapability = "emit-runtime-metrics"
)
const (