mirror of
https://github.com/tailscale/tailscale.git
synced 2026-05-29 11:11:31 -04:00
control/controlknobs,feature/*,ipn/ipnlocal,tailcfg: add runtimemetrics
Emit runtime metrics as clientmetrics when the NodeAttrEmitRuntimeMetrics NodeCapability is present. We start small with just 2 metrics: heap bytes and total process memory. Updates tailscale/corp#39434 Signed-off-by: Jordan Whited <jordan@tailscale.com>
This commit is contained in:
committed by
Jordan Whited
parent
2eb45c2457
commit
e5a8cf3b18
@@ -304,6 +304,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de
|
||||
tailscale.com/feature/posture from tailscale.com/feature/condregister
|
||||
tailscale.com/feature/relayserver from tailscale.com/feature/condregister
|
||||
tailscale.com/feature/routecheck from tailscale.com/feature/condregister
|
||||
tailscale.com/feature/runtimemetrics from tailscale.com/feature/condregister
|
||||
L tailscale.com/feature/sdnotify from tailscale.com/feature/condregister
|
||||
LD tailscale.com/feature/ssh from tailscale.com/cmd/tailscaled
|
||||
tailscale.com/feature/syspolicy from tailscale.com/feature/condregister+
|
||||
@@ -756,6 +757,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de
|
||||
regexp/syntax from regexp
|
||||
runtime from archive/tar+
|
||||
runtime/debug from github.com/aws/aws-sdk-go-v2/internal/sync/singleflight+
|
||||
runtime/metrics from tailscale.com/feature/runtimemetrics
|
||||
runtime/pprof from net/http/pprof+
|
||||
runtime/trace from net/http/pprof
|
||||
slices from tailscale.com/appc+
|
||||
|
||||
@@ -110,6 +110,10 @@ type Knobs struct {
|
||||
// See https://github.com/tailscale/tailscale/issues/15404.
|
||||
// TODO(bradfitz): remove this a few releases after 2026-02-16.
|
||||
ForceRegisterMagicDNSIPv4Only atomic.Bool
|
||||
|
||||
// EmitRuntimeMetrics is whether the node should poll and emit [runtime/metrics]
|
||||
// as [tailscale.com/util/clientmetric] metrics.
|
||||
EmitRuntimeMetrics atomic.Bool
|
||||
}
|
||||
|
||||
// UpdateFromNodeAttributes updates k (if non-nil) based on the provided self
|
||||
@@ -139,6 +143,7 @@ func (k *Knobs) UpdateFromNodeAttributes(capMap tailcfg.NodeCapMap) {
|
||||
disableSkipStatusQueue = has(tailcfg.NodeAttrDisableSkipStatusQueue)
|
||||
disableHostsFileUpdates = has(tailcfg.NodeAttrDisableHostsFileUpdates)
|
||||
forceRegisterMagicDNSIPv4Only = has(tailcfg.NodeAttrForceRegisterMagicDNSIPv4Only)
|
||||
emitRuntimeMetrics = has(tailcfg.NodeAttrEmitRuntimeMetrics)
|
||||
)
|
||||
|
||||
if has(tailcfg.NodeAttrOneCGNATEnable) {
|
||||
@@ -166,6 +171,7 @@ func (k *Knobs) UpdateFromNodeAttributes(capMap tailcfg.NodeCapMap) {
|
||||
k.DisableSkipStatusQueue.Store(disableSkipStatusQueue)
|
||||
k.DisableHostsFileUpdates.Store(disableHostsFileUpdates)
|
||||
k.ForceRegisterMagicDNSIPv4Only.Store(forceRegisterMagicDNSIPv4Only)
|
||||
k.EmitRuntimeMetrics.Store(emitRuntimeMetrics)
|
||||
}
|
||||
|
||||
// AsDebugJSON returns k as something that can be marshalled with json.Marshal
|
||||
|
||||
13
feature/buildfeatures/feature_runtimemetrics_disabled.go
Normal file
13
feature/buildfeatures/feature_runtimemetrics_disabled.go
Normal file
@@ -0,0 +1,13 @@
|
||||
// Copyright (c) Tailscale Inc & contributors
|
||||
// SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
// Code generated by gen.go; DO NOT EDIT.
|
||||
|
||||
//go:build ts_omit_runtimemetrics
|
||||
|
||||
package buildfeatures
|
||||
|
||||
// HasRuntimeMetrics is whether the binary was built with support for modular feature "Support emission of runtime/metrics as clientmetrics".
|
||||
// Specifically, it's whether the binary was NOT built with the "ts_omit_runtimemetrics" build tag.
|
||||
// It's a const so it can be used for dead code elimination.
|
||||
const HasRuntimeMetrics = false
|
||||
13
feature/buildfeatures/feature_runtimemetrics_enabled.go
Normal file
13
feature/buildfeatures/feature_runtimemetrics_enabled.go
Normal file
@@ -0,0 +1,13 @@
|
||||
// Copyright (c) Tailscale Inc & contributors
|
||||
// SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
// Code generated by gen.go; DO NOT EDIT.
|
||||
|
||||
//go:build !ts_omit_runtimemetrics
|
||||
|
||||
package buildfeatures
|
||||
|
||||
// HasRuntimeMetrics is whether the binary was built with support for modular feature "Support emission of runtime/metrics as clientmetrics".
|
||||
// Specifically, it's whether the binary was NOT built with the "ts_omit_runtimemetrics" build tag.
|
||||
// It's a const so it can be used for dead code elimination.
|
||||
const HasRuntimeMetrics = true
|
||||
8
feature/condregister/maybe_runtimemetrics.go
Normal file
8
feature/condregister/maybe_runtimemetrics.go
Normal file
@@ -0,0 +1,8 @@
|
||||
// Copyright (c) Tailscale Inc & contributors
|
||||
// SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
//go:build !ts_omit_runtimemetrics
|
||||
|
||||
package condregister
|
||||
|
||||
import _ "tailscale.com/feature/runtimemetrics"
|
||||
@@ -239,6 +239,10 @@ type FeatureMeta struct {
|
||||
Sym: "RouteCheck",
|
||||
Desc: "Support checking the reachability of overlapping routers, for choosing between multiple network paths to the same IP address",
|
||||
},
|
||||
"runtimemetrics": {
|
||||
Sym: "RuntimeMetrics",
|
||||
Desc: "Support emission of runtime/metrics as clientmetrics",
|
||||
},
|
||||
"sdnotify": {
|
||||
Sym: "SDNotify",
|
||||
Desc: "systemd notification support",
|
||||
|
||||
132
feature/runtimemetrics/runtimemetrics.go
Normal file
132
feature/runtimemetrics/runtimemetrics.go
Normal file
@@ -0,0 +1,132 @@
|
||||
// Copyright (c) Tailscale Inc & contributors
|
||||
// SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
// Package runtimemetrics exports select runtime/metrics samples as [tailscale.com/util/clientmetric] metrics.
|
||||
package runtimemetrics
|
||||
|
||||
import (
|
||||
"runtime/metrics"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"tailscale.com/feature"
|
||||
"tailscale.com/ipn/ipnlocal"
|
||||
"tailscale.com/util/clientmetric"
|
||||
)
|
||||
|
||||
func init() {
|
||||
feature.Register("runtimemetrics")
|
||||
ipnlocal.HookSetRuntimeMetricsEnabled.Set(setEnabled)
|
||||
}
|
||||
|
||||
var (
|
||||
setEnabledMu sync.Mutex // guards runningPoller
|
||||
runningPoller *poller // non-nil when running, otherwise nil
|
||||
)
|
||||
|
||||
func setEnabled(enabled bool) {
|
||||
setEnabledMu.Lock()
|
||||
defer setEnabledMu.Unlock()
|
||||
if enabled {
|
||||
if runningPoller != nil {
|
||||
return
|
||||
}
|
||||
runningPoller = newPoller()
|
||||
} else {
|
||||
if runningPoller == nil {
|
||||
return
|
||||
}
|
||||
runningPoller.close()
|
||||
runningPoller = nil
|
||||
}
|
||||
}
|
||||
|
||||
// poller owns the background goroutine that periodically reads Go runtime
// metrics. It is created by newPoller and stopped by close.
type poller struct {
	// closeOnce ensures the shutdown sequence in close runs at most once.
	closeOnce sync.Once
	// closeCh is closed by close to tell run to return.
	closeCh chan struct{}
	// wg tracks the run goroutine so that close can wait for it to exit.
	wg sync.WaitGroup
}
|
||||
|
||||
func newPoller() *poller {
|
||||
p := &poller{
|
||||
closeCh: make(chan struct{}),
|
||||
}
|
||||
p.wg.Go(p.run)
|
||||
return p
|
||||
}
|
||||
|
||||
func (p *poller) close() {
|
||||
p.closeOnce.Do(func() {
|
||||
close(p.closeCh)
|
||||
p.wg.Wait()
|
||||
})
|
||||
}
|
||||
|
||||
// pollInterval is the period at which [poller] reads Go runtime metrics. It
// is set to the same value as
// [tailscale.com/util/clientmetric.minMetricEncodeInterval], the minimum
// interval between clientmetrics emissions.
const pollInterval = 15 * time.Second
|
||||
|
||||
type sampleNameClientmetric struct {
|
||||
sampleName string // [metrics.Sample.Name]
|
||||
clientmetricName string // passed to clientmetric.New...
|
||||
metric *clientmetric.Metric // lazy init on first pollAndEmit
|
||||
}
|
||||
|
||||
var clientmetrics = []sampleNameClientmetric{
|
||||
{
|
||||
// Memory occupied by live objects and dead objects that have not
|
||||
// yet been marked free by the garbage collector.
|
||||
sampleName: "/memory/classes/heap/objects:bytes",
|
||||
clientmetricName: "runtimemetrics_memory_heap_objects_bytes",
|
||||
},
|
||||
{
|
||||
// All memory mapped by the Go runtime into the current process
|
||||
// as read-write. Note that this does not include memory mapped
|
||||
// by code called via cgo or via the syscall package. Sum of all
|
||||
// metrics in /memory/classes.
|
||||
sampleName: "/memory/classes/total:bytes",
|
||||
clientmetricName: "runtimemetrics_memory_total_bytes",
|
||||
},
|
||||
}
|
||||
|
||||
var registerClientmetricsOnce sync.Once
|
||||
|
||||
func exportSamples(samples []metrics.Sample) {
|
||||
registerClientmetricsOnce.Do(func() {
|
||||
for i := range clientmetrics {
|
||||
clientmetrics[i].metric = clientmetric.NewGauge(clientmetrics[i].clientmetricName)
|
||||
}
|
||||
})
|
||||
for i := range samples {
|
||||
if samples[i].Value.Kind() != metrics.KindUint64 {
|
||||
continue
|
||||
}
|
||||
clientmetrics[i].metric.Set(int64(samples[i].Value.Uint64()))
|
||||
}
|
||||
}
|
||||
|
||||
func (p *poller) pollAndEmit() {
|
||||
samples := make([]metrics.Sample, len(clientmetrics))
|
||||
for i := range clientmetrics {
|
||||
samples[i].Name = clientmetrics[i].sampleName
|
||||
}
|
||||
metrics.Read(samples)
|
||||
exportSamples(samples)
|
||||
}
|
||||
|
||||
func (p *poller) run() {
|
||||
ticker := time.NewTicker(pollInterval)
|
||||
defer ticker.Stop()
|
||||
p.pollAndEmit() // pollAndEmit immediately
|
||||
for {
|
||||
select {
|
||||
case <-p.closeCh:
|
||||
return
|
||||
case <-ticker.C:
|
||||
p.pollAndEmit()
|
||||
}
|
||||
}
|
||||
}
|
||||
117
feature/runtimemetrics/runtimemetrics_test.go
Normal file
117
feature/runtimemetrics/runtimemetrics_test.go
Normal file
@@ -0,0 +1,117 @@
|
||||
// Copyright (c) Tailscale Inc & contributors
|
||||
// SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
package runtimemetrics
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"testing/synctest"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestSetEnabledEndToEnd(t *testing.T) {
|
||||
synctest.Test(t, syncTestSetEnabledEndToEnd)
|
||||
}
|
||||
|
||||
func syncTestSetEnabledEndToEnd(t *testing.T) {
|
||||
getPoller := func() *poller {
|
||||
setEnabledMu.Lock()
|
||||
defer setEnabledMu.Unlock()
|
||||
return runningPoller
|
||||
}
|
||||
|
||||
if p := getPoller(); p != nil {
|
||||
t.Fatalf("runningPoller not nil at test start: %p", p)
|
||||
}
|
||||
t.Cleanup(func() { setEnabled(false) })
|
||||
|
||||
// disabled -> enabled: starts a poller that immediately polls.
|
||||
setEnabled(true)
|
||||
p1 := getPoller()
|
||||
if p1 == nil {
|
||||
t.Fatal("runningPoller nil after setEnabled(true)")
|
||||
}
|
||||
|
||||
// Wait for the immediate pollAndEmit to finish and the goroutine to
|
||||
// block on the ticker.
|
||||
synctest.Wait()
|
||||
|
||||
// Lazy metric registration must have happened and values must be set
|
||||
// from a real runtime/metrics read. Both currently-tracked metrics
|
||||
// (heap objects + total memory) are always >0 in a running Go process.
|
||||
for i, cm := range clientmetrics {
|
||||
if cm.metric == nil {
|
||||
t.Fatalf("clientmetrics[%d] (%s) metric nil after first poll", i, cm.sampleName)
|
||||
}
|
||||
if got := cm.metric.Value(); got <= 0 {
|
||||
t.Errorf("clientmetrics[%d] (%s) = %d after first poll, want > 0",
|
||||
i, cm.clientmetricName, got)
|
||||
}
|
||||
}
|
||||
|
||||
// setEnabled(true) while enabled is idempotent: same poller instance.
|
||||
setEnabled(true)
|
||||
if p := getPoller(); p != p1 {
|
||||
t.Fatalf("setEnabled(true) replaced poller: got %p, want %p", p, p1)
|
||||
}
|
||||
|
||||
// Overwrite each gauge with a sentinel so we can prove the next tick
|
||||
// re-reads runtime values.
|
||||
const sentinel = int64(-1)
|
||||
for _, cm := range clientmetrics {
|
||||
cm.metric.Set(sentinel)
|
||||
}
|
||||
|
||||
// Advance virtual time one tick. The poller's ticker fires and pollAndEmit
|
||||
// runs again.
|
||||
time.Sleep(pollInterval)
|
||||
synctest.Wait()
|
||||
|
||||
for i, cm := range clientmetrics {
|
||||
if got := cm.metric.Value(); got == sentinel {
|
||||
t.Errorf("clientmetrics[%d] (%s) still sentinel %d after tick; ticker did not fire",
|
||||
i, cm.clientmetricName, got)
|
||||
} else if got <= 0 {
|
||||
t.Errorf("clientmetrics[%d] (%s) = %d after tick, want > 0",
|
||||
i, cm.clientmetricName, got)
|
||||
}
|
||||
}
|
||||
|
||||
// enabled -> disabled: stops the poller; wg.Wait inside close() means
|
||||
// the goroutine has exited by the time setEnabled returns.
|
||||
setEnabled(false)
|
||||
if p := getPoller(); p != nil {
|
||||
t.Fatalf("runningPoller %p still set after setEnabled(false)", p)
|
||||
}
|
||||
|
||||
// After disabling, gauges must remain at their last polled values
|
||||
// indefinitely (no further ticks should overwrite them).
|
||||
for _, cm := range clientmetrics {
|
||||
cm.metric.Set(sentinel)
|
||||
}
|
||||
time.Sleep(10 * pollInterval)
|
||||
synctest.Wait()
|
||||
for i, cm := range clientmetrics {
|
||||
if got := cm.metric.Value(); got != sentinel {
|
||||
t.Errorf("clientmetrics[%d] (%s) = %d after disabling; poller goroutine still running?",
|
||||
i, cm.clientmetricName, got)
|
||||
}
|
||||
}
|
||||
|
||||
// disabled -> disabled: still a no-op.
|
||||
setEnabled(false)
|
||||
if p := getPoller(); p != nil {
|
||||
t.Fatalf("runningPoller %p set after second setEnabled(false)", p)
|
||||
}
|
||||
|
||||
// Re-enable creates a fresh poller, not the closed one.
|
||||
setEnabled(true)
|
||||
synctest.Wait()
|
||||
p2 := getPoller()
|
||||
if p2 == nil {
|
||||
t.Fatal("runningPoller nil on re-enable")
|
||||
}
|
||||
if p2 == p1 {
|
||||
t.Fatal("re-enable returned previously-closed poller")
|
||||
}
|
||||
}
|
||||
@@ -1286,6 +1286,14 @@ func (b *LocalBackend) Shutdown() {
|
||||
if cc != nil {
|
||||
cc.Shutdown()
|
||||
}
|
||||
if buildfeatures.HasRuntimeMetrics {
|
||||
// We disable runtime metrics _after_ the control client is shutdown to
|
||||
// ensure we don't leak the metrics polling goroutine in the case where
|
||||
// netmap handling races [LocalBackend] shutdown.
|
||||
if f, ok := HookSetRuntimeMetricsEnabled.GetOk(); ok {
|
||||
f(false)
|
||||
}
|
||||
}
|
||||
b.ctxCancel(errShutdown)
|
||||
b.currentNode().shutdown(errShutdown)
|
||||
b.extHost.Shutdown()
|
||||
@@ -6892,6 +6900,11 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) {
|
||||
|
||||
b.MagicConn().SetSilentDisco(b.ControlKnobs().SilentDisco.Load())
|
||||
b.MagicConn().SetProbeUDPLifetime(b.ControlKnobs().ProbeUDPLifetime.Load())
|
||||
if buildfeatures.HasRuntimeMetrics {
|
||||
if f, ok := HookSetRuntimeMetricsEnabled.GetOk(); ok {
|
||||
f(b.ControlKnobs().EmitRuntimeMetrics.Load())
|
||||
}
|
||||
}
|
||||
|
||||
if buildfeatures.HasDebug {
|
||||
b.setDebugLogsByCapabilityLocked(nm)
|
||||
@@ -6970,6 +6983,9 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) {
|
||||
}
|
||||
}
|
||||
|
||||
// HookSetRuntimeMetricsEnabled is an optional hook for the "runtimemetrics" feature.
|
||||
var HookSetRuntimeMetricsEnabled feature.Hook[func(enabled bool)]
|
||||
|
||||
var hookSetNetMapLockedDrive feature.Hook[func(*LocalBackend, *netmap.NetworkMap)]
|
||||
|
||||
// roundTraffic rounds bytes. This is used to preserve user privacy within logs.
|
||||
|
||||
@@ -185,7 +185,8 @@
|
||||
// - 136: 2026-04-09: Client understands [NodeAttrDisableLinuxCGNATDropRule]
|
||||
// - 137: 2026-04-15: Client handles 429 responses to /machine/register.
|
||||
// - 138: 2026-03-31: can handle C2N /debug/tka.
|
||||
const CurrentCapabilityVersion CapabilityVersion = 138
|
||||
// - 139: 2026-05-22: Client understands [NodeAttrEmitRuntimeMetrics]
|
||||
const CurrentCapabilityVersion CapabilityVersion = 139
|
||||
|
||||
// ID is an integer ID for a user, node, or login allocated by the
|
||||
// control plane.
|
||||
@@ -2788,6 +2789,10 @@ func (p NodeCapabilityPrefix) ToAttribute(value string) NodeCapability {
|
||||
// that does not originate from the Tailscale network interface.
|
||||
// This enables access to off-tailnet endpoints within that IP range.
|
||||
NodeAttrDisableLinuxCGNATDropRule NodeCapability = "disable-linux-cgnat-drop-rule"
|
||||
|
||||
// NodeAttrEmitRuntimeMetrics enables emission of [runtime/metrics] as
|
||||
// [tailscale.com/util/clientmetric] metrics.
|
||||
NodeAttrEmitRuntimeMetrics NodeCapability = "emit-runtime-metrics"
|
||||
)
|
||||
|
||||
const (
|
||||
|
||||
Reference in New Issue
Block a user