ipn/ipnlocal,net/netmon: make frequent darkwake more efficient

Investigating battery costs on a busy tailnet I noticed a large number
of nodes regularly reconnecting to control and DERP. In one case I was
able to analyze closely `pmset` reported the every-minute wake-ups being
triggered by bluetooth. The node was by side effect reconnecting to
control constantly, and this was at times visible to peers as well.

Three changes here improve the situation:
- Short time jumps (less than 10 minutes) no longer produce "major
  network change" events, and so do not trigger full rebind/reconnect.
- Many "incidental" fields on interfaces are ignored, like MTU, flags
  and so on - if the route is still good, the rest should be manageable.
- Additional log output will provide more detail about the cause of
  major network change events.

Updates #3363

Signed-off-by: James Tucker <james@tailscale.com>
This commit is contained in:
James Tucker
2026-04-02 17:02:03 -07:00
parent eaa5d9df4b
commit 304323708c
5 changed files with 410 additions and 35 deletions

View File

@@ -582,7 +582,7 @@ func NewLocalBackend(logf logger.Logf, logID logid.PublicID, sys *tsd.System, lo
// Call our linkChange code once with the current state.
// Following changes are triggered via the eventbus.
cd, err := netmon.NewChangeDelta(nil, b.interfaceState, false, false)
cd, err := netmon.NewChangeDelta(nil, b.interfaceState, 0, false)
if err != nil {
b.logf("[unexpected] setting initial netmon state failed: %v", err)
} else {

View File

@@ -64,7 +64,7 @@ func syncTestLinkChangeLogLimiter(t *testing.T) {
// InjectEvent doesn't work because it's not a major event, so we
// instead inject the event ourselves.
injector := eventbustest.NewInjector(t, bus)
cd, err := NewChangeDelta(nil, &State{}, true, true)
cd, err := NewChangeDelta(nil, &State{}, 0, true)
if err != nil {
t.Fatal(err)
}

View File

@@ -13,6 +13,7 @@
"net/netip"
"runtime"
"slices"
"strings"
"sync"
"time"
@@ -31,6 +32,15 @@
// us check the wall time sooner than this.
const pollWallTimeInterval = 15 * time.Second
// majorTimeJumpThreshold is the minimum sleep duration that warrants
// treating a time jump as a major event requiring socket rebinding,
// even if the interface state appears unchanged. After a long sleep,
// NAT mappings are likely stale and DHCP leases may have expired
// (the renewal happens after wake, so local state may not yet reflect it).
// Short sleeps (e.g., macOS DarkWake maintenance cycles of ~55s) should
// not trigger rebinding if the network state is unchanged.
const majorTimeJumpThreshold = 10 * time.Minute
// message represents a message returned from an osMon.
type message interface {
// Ignore is whether we should ignore this message.
@@ -67,18 +77,18 @@ type Monitor struct {
stop chan struct{} // closed on Stop
static bool // static Monitor that doesn't actually monitor
mu syncs.Mutex // guards all following fields
cbs set.HandleSet[ChangeFunc]
ifState *State
gwValid bool // whether gw and gwSelfIP are valid
gw netip.Addr // our gateway's IP
gwSelfIP netip.Addr // our own IP address (that corresponds to gw)
started bool
closed bool
goroutines sync.WaitGroup
wallTimer *time.Timer // nil until Started; re-armed AfterFunc per tick
lastWall time.Time
timeJumped bool // whether we need to send a changed=true after a big time jump
mu syncs.Mutex // guards all following fields
cbs set.HandleSet[ChangeFunc]
ifState *State
gwValid bool // whether gw and gwSelfIP are valid
gw netip.Addr // our gateway's IP
gwSelfIP netip.Addr // our own IP address (that corresponds to gw)
started bool
closed bool
goroutines sync.WaitGroup
wallTimer *time.Timer // nil until Started; re-armed AfterFunc per tick
lastWall time.Time
jumpDuration time.Duration // wall-clock time elapsed during detected time jump; 0 if no time jump observed since reset
}
// ChangeFunc is a callback function registered with Monitor that's called when the
@@ -97,10 +107,12 @@ type ChangeDelta struct {
// It is always non-nil.
new *State
// TimeJumped is whether there was a big jump in wall time since the last
// time we checked. This is a hint that a sleeping device might have
// come out of sleep.
TimeJumped bool
// JumpDuration is non-zero when a wall-clock time jump was detected,
// indicating the machine likely just woke from sleep. It is approximately
// how long the machine was asleep (the wall-clock delta since the last
// check, not an exact sleep measurement). Use TimeJumped() to check
// whether a time jump occurred.
JumpDuration time.Duration
DefaultRouteInterface string
@@ -121,19 +133,28 @@ type ChangeDelta struct {
RebindLikelyRequired bool
}
// TimeJumped reports whether a wall-clock time jump was detected,
// indicating the machine likely just woke from sleep. When true,
// JumpDuration contains the approximate duration.
func (cd *ChangeDelta) TimeJumped() bool {
return cd.JumpDuration > 0
}
// CurrentState returns the current (new) state after the change.
func (cd *ChangeDelta) CurrentState() *State {
return cd.new
}
// NewChangeDelta builds a ChangeDelta and eagerly computes the cached fields.
// jumpDuration, if non-zero, indicates a wall-clock time jump was detected
// (the machine likely woke from sleep) and is the approximate duration of the jump.
// forceViability, if true, forces DefaultInterfaceMaybeViable to be true regardless of the
// actual state of the default interface. This is useful in testing.
func NewChangeDelta(old, new *State, timeJumped bool, forceViability bool) (*ChangeDelta, error) {
func NewChangeDelta(old, new *State, jumpDuration time.Duration, forceViability bool) (*ChangeDelta, error) {
cd := ChangeDelta{
old: old,
new: new,
TimeJumped: timeJumped,
old: old,
new: new,
JumpDuration: jumpDuration,
}
if cd.new == nil {
@@ -165,10 +186,18 @@ func NewChangeDelta(old, new *State, timeJumped bool, forceViability bool) (*Cha
cd.DefaultInterfaceMaybeViable = true
}
// Compute rebind requirement. The default interface needs to be viable and
// Compute rebind requirement. The default interface needs to be viable and
// one of the other conditions needs to be true.
//
// Short time jumps (e.g., macOS DarkWake maintenance cycles of ~55s) are
// excluded — if the network state is unchanged after a brief sleep, there's
// no reason to rebind. However, a major time jump (over majorTimeJumpThreshold)
// warrants a rebind even if the local state looks the same, because NAT
// mappings are likely stale and DHCP leases may have changed (the renewal
// happens after wake, so local state may not yet reflect it).
majorTimeJump := cd.JumpDuration >= majorTimeJumpThreshold
cd.RebindLikelyRequired = (cd.old == nil ||
cd.TimeJumped ||
majorTimeJump ||
cd.DefaultInterfaceChanged ||
cd.InterfaceIPsChanged ||
cd.IsLessExpensive ||
@@ -181,7 +210,39 @@ func NewChangeDelta(old, new *State, timeJumped bool, forceViability bool) (*Cha
// StateDesc returns a description of the old and new states for logging.
func (cd *ChangeDelta) StateDesc() string {
return fmt.Sprintf("old: %v new: %v", cd.old, cd.new)
var sb strings.Builder
fmt.Fprintf(&sb, "old: %v new: %v", cd.old, cd.new)
if cd.old != nil && cd.new != nil {
if diff := cd.old.InterfaceDiff(cd.new); diff != "" {
fmt.Fprintf(&sb, " diff: %s", diff)
}
}
if cd.RebindLikelyRequired {
var reasons []string
if cd.old == nil {
reasons = append(reasons, "initial-state")
}
if cd.TimeJumped() {
reasons = append(reasons, fmt.Sprintf("time-jumped(%v)", cd.JumpDuration.Round(time.Second)))
}
if cd.DefaultInterfaceChanged {
reasons = append(reasons, "default-if-changed")
}
if cd.InterfaceIPsChanged {
reasons = append(reasons, "ips-changed")
}
if cd.IsLessExpensive {
reasons = append(reasons, "less-expensive")
}
if cd.HasPACOrProxyConfigChanged {
reasons = append(reasons, "pac-proxy-changed")
}
if cd.AvailableProtocolsChanged {
reasons = append(reasons, "protocols-changed")
}
fmt.Fprintf(&sb, " rebind-reason=[%s]", strings.Join(reasons, ","))
}
return sb.String()
}
// InterfaceIPDisappeared reports whether the given IP address exists on any interface
@@ -245,7 +306,12 @@ func (cd *ChangeDelta) isInterestingInterfaceChange() bool {
}
newIps = filterRoutableIPs(newIps)
if !oldInterface.Equal(newInterface) || !prefixesEqual(oldIps, newIps) {
// Only consider routable IP changes and up/down state transitions
// as interesting. Transient metadata changes (Flags like FlagRunning,
// MTU, etc.) should not trigger a major link change, as they cause
// false "major" events on macOS and Windows when the OS notifies us
// of interface changes that don't affect connectivity.
if oldInterface.IsUp() != newInterface.IsUp() || !prefixesEqual(oldIps, newIps) {
return true
}
}
@@ -277,8 +343,8 @@ func (cd *ChangeDelta) isInterestingInterfaceChange() bool {
}
oldIps = filterRoutableIPs(oldIps)
// The interface's IPs, Name, MTU, etc have changed. This is definitely interesting.
if !newInterface.Equal(oldInterface) || !prefixesEqual(oldIps, newIps) {
// Only consider routable IP changes and up/down state transitions.
if newInterface.IsUp() != oldInterface.IsUp() || !prefixesEqual(oldIps, newIps) {
return true
}
}
@@ -574,7 +640,8 @@ func (m *Monitor) handlePotentialChange(newState *State, forceCallbacks bool) {
return
}
delta, err := NewChangeDelta(oldState, newState, timeJumped, false)
jumpDuration := m.jumpDuration
delta, err := NewChangeDelta(oldState, newState, jumpDuration, false)
if err != nil {
m.logf("[unexpected] error creating ChangeDelta: %v", err)
return
@@ -587,12 +654,13 @@ func (m *Monitor) handlePotentialChange(newState *State, forceCallbacks bool) {
// See if we have a queued or new time jump signal.
if timeJumped {
m.resetTimeJumpedLocked()
m.logf("time jump detected (slept %v), probably wake from sleep", jumpDuration.Round(time.Second))
}
metricChange.Add(1)
if delta.RebindLikelyRequired {
metricChangeMajor.Add(1)
}
if delta.TimeJumped {
if delta.TimeJumped() {
metricChangeTimeJump.Add(1)
}
m.changed.Publish(*delta)
@@ -654,14 +722,14 @@ func (m *Monitor) checkWallTimeAdvanceLocked() bool {
panic("unreachable") // if callers are correct
}
now := wallTime()
if now.Sub(m.lastWall) > pollWallTimeInterval*3/2 {
m.timeJumped = true // it is reset by debounce.
if elapsed := now.Sub(m.lastWall); elapsed > pollWallTimeInterval*3/2 {
m.jumpDuration = elapsed
}
m.lastWall = now
return m.timeJumped
return m.jumpDuration != 0
}
// resetTimeJumpedLocked consumes the signal set by checkWallTimeAdvanceLocked.
func (m *Monitor) resetTimeJumpedLocked() {
m.timeJumped = false
m.jumpDuration = 0
}

View File

@@ -473,6 +473,98 @@ func TestRebindRequired(t *testing.T) {
},
want: false,
},
{
name: "interface-flags-changed-no-ip-change",
s1: &State{
DefaultRouteInterface: "en0",
Interface: map[string]Interface{
"en0": {Interface: &net.Interface{
Name: "en0",
Flags: net.FlagUp | net.FlagBroadcast | net.FlagMulticast | net.FlagRunning,
}},
},
InterfaceIPs: map[string][]netip.Prefix{
"en0": {netip.MustParsePrefix("10.0.0.12/24")},
},
HaveV4: true,
},
s2: &State{
DefaultRouteInterface: "en0",
Interface: map[string]Interface{
"en0": {Interface: &net.Interface{
Name: "en0",
Flags: net.FlagUp | net.FlagBroadcast | net.FlagMulticast, // FlagRunning removed
}},
},
InterfaceIPs: map[string][]netip.Prefix{
"en0": {netip.MustParsePrefix("10.0.0.12/24")},
},
HaveV4: true,
},
want: false,
},
{
name: "interface-mtu-changed-no-ip-change",
s1: &State{
DefaultRouteInterface: "en0",
Interface: map[string]Interface{
"en0": {Interface: &net.Interface{
Name: "en0",
Flags: net.FlagUp | net.FlagBroadcast | net.FlagMulticast | net.FlagRunning,
MTU: 1500,
}},
},
InterfaceIPs: map[string][]netip.Prefix{
"en0": {netip.MustParsePrefix("10.0.0.12/24")},
},
HaveV4: true,
},
s2: &State{
DefaultRouteInterface: "en0",
Interface: map[string]Interface{
"en0": {Interface: &net.Interface{
Name: "en0",
Flags: net.FlagUp | net.FlagBroadcast | net.FlagMulticast | net.FlagRunning,
MTU: 9000,
}},
},
InterfaceIPs: map[string][]netip.Prefix{
"en0": {netip.MustParsePrefix("10.0.0.12/24")},
},
HaveV4: true,
},
want: false,
},
{
name: "interface-went-down",
s1: &State{
DefaultRouteInterface: "en0",
Interface: map[string]Interface{
"en0": {Interface: &net.Interface{
Name: "en0",
Flags: net.FlagUp | net.FlagBroadcast | net.FlagMulticast | net.FlagRunning,
}},
},
InterfaceIPs: map[string][]netip.Prefix{
"en0": {netip.MustParsePrefix("10.0.0.12/24")},
},
HaveV4: true,
},
s2: &State{
DefaultRouteInterface: "en0",
Interface: map[string]Interface{
"en0": {Interface: &net.Interface{
Name: "en0",
Flags: net.FlagBroadcast | net.FlagMulticast, // FlagUp removed
}},
},
InterfaceIPs: map[string][]netip.Prefix{
"en0": {netip.MustParsePrefix("10.0.0.12/24")},
},
HaveV4: true,
},
want: true,
},
}
withIsInterestingInterface(t, func(ni Interface, pfxs []netip.Prefix) bool {
@@ -498,7 +590,7 @@ func TestRebindRequired(t *testing.T) {
}
SetTailscaleInterfaceProps(tt.tsIfName, 1)
cd, err := NewChangeDelta(tt.s1, tt.s2, false, true)
cd, err := NewChangeDelta(tt.s1, tt.s2, 0, true)
if err != nil {
t.Fatalf("NewChangeDelta error: %v", err)
}
@@ -510,6 +602,71 @@ func TestRebindRequired(t *testing.T) {
}
}
func TestTimeJumpedDoesNotTriggerRebind(t *testing.T) {
s := &State{
DefaultRouteInterface: "en0",
Interface: map[string]Interface{
"en0": {Interface: &net.Interface{
Name: "en0",
Flags: net.FlagUp | net.FlagBroadcast | net.FlagMulticast | net.FlagRunning,
}},
},
InterfaceIPs: map[string][]netip.Prefix{
"en0": {netip.MustParsePrefix("10.0.0.12/24")},
},
HaveV4: true,
}
// A short time jump (e.g., macOS DarkWake maintenance cycle ~55s)
// with unchanged network state should NOT trigger rebind.
cd, err := NewChangeDelta(s, s, 55*time.Second, true)
if err != nil {
t.Fatalf("NewChangeDelta error: %v", err)
}
if cd.RebindLikelyRequired {
t.Error("RebindLikelyRequired = true for short time jump with unchanged state; want false")
}
if !cd.TimeJumped() {
t.Error("TimeJumped = false; want true")
}
// A major time jump (>10m) with unchanged state SHOULD trigger rebind,
// because NAT mappings are likely stale.
cd2, err := NewChangeDelta(s, s, 2*time.Hour, true)
if err != nil {
t.Fatalf("NewChangeDelta error: %v", err)
}
if !cd2.RebindLikelyRequired {
t.Error("RebindLikelyRequired = false for major time jump (2h); want true")
}
// A short time jump with changed state SHOULD trigger rebind.
s2 := &State{
DefaultRouteInterface: "en0",
Interface: map[string]Interface{
"en0": {Interface: &net.Interface{
Name: "en0",
Flags: net.FlagUp | net.FlagBroadcast | net.FlagMulticast | net.FlagRunning,
}},
},
InterfaceIPs: map[string][]netip.Prefix{
"en0": {netip.MustParsePrefix("10.0.0.99/24")}, // IP changed
},
HaveV4: true,
}
saveAndRestoreTailscaleIfaceProps(t)
SetTailscaleInterfaceProps("", 0)
cd3, err := NewChangeDelta(s, s2, 55*time.Second, true)
if err != nil {
t.Fatalf("NewChangeDelta error: %v", err)
}
if !cd3.RebindLikelyRequired {
t.Error("RebindLikelyRequired = false for time jump with changed IP; want true")
}
}
func saveAndRestoreTailscaleIfaceProps(t *testing.T) {
t.Helper()
index, _ := TailscaleInterfaceIndex()
@@ -612,6 +769,71 @@ func TestPrefixesEqual(t *testing.T) {
}
}
func TestInterfaceDiff(t *testing.T) {
tests := []struct {
name string
s1, s2 *State
wantDiff string // substring expected in diff output; "" means no diff
}{
{
name: "equal",
s1: &State{HaveV4: true, DefaultRouteInterface: "en0"},
s2: &State{HaveV4: true, DefaultRouteInterface: "en0"},
wantDiff: "",
},
{
name: "flags-changed",
s1: &State{
DefaultRouteInterface: "en0",
Interface: map[string]Interface{
"en0": {Interface: &net.Interface{
Name: "en0",
Flags: net.FlagUp | net.FlagRunning,
}},
},
},
s2: &State{
DefaultRouteInterface: "en0",
Interface: map[string]Interface{
"en0": {Interface: &net.Interface{
Name: "en0",
Flags: net.FlagUp,
}},
},
},
wantDiff: "flags",
},
{
name: "mtu-changed",
s1: &State{
Interface: map[string]Interface{
"en0": {Interface: &net.Interface{Name: "en0", MTU: 1500}},
},
},
s2: &State{
Interface: map[string]Interface{
"en0": {Interface: &net.Interface{Name: "en0", MTU: 9000}},
},
},
wantDiff: "MTU",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := tt.s1.InterfaceDiff(tt.s2)
if tt.wantDiff == "" {
if got != "" {
t.Errorf("InterfaceDiff = %q; want empty", got)
}
} else {
if !strings.Contains(got, tt.wantDiff) {
t.Errorf("InterfaceDiff = %q; want substring %q", got, tt.wantDiff)
}
}
})
}
}
func TestForeachInterface(t *testing.T) {
tests := []struct {
name string

View File

@@ -443,6 +443,91 @@ func (a Interface) Equal(b Interface) bool {
return true
}
// InterfaceDiff returns a human-readable summary of the differences between s
// and s2 that would cause Equal to return false. It returns "" if the states
// are equal. This is useful for debugging false link change events where the
// State.String() output looks identical but Equal() returns false because it
// checks fields not shown in String() (like interface Flags, MTU, HardwareAddr).
func (s *State) InterfaceDiff(s2 *State) string {
if s == nil && s2 == nil {
return ""
}
if s == nil {
return "old=nil"
}
if s2 == nil {
return "new=nil"
}
var diffs []string
if s.HaveV6 != s2.HaveV6 {
diffs = append(diffs, fmt.Sprintf("HaveV6: %v->%v", s.HaveV6, s2.HaveV6))
}
if s.HaveV4 != s2.HaveV4 {
diffs = append(diffs, fmt.Sprintf("HaveV4: %v->%v", s.HaveV4, s2.HaveV4))
}
if s.IsExpensive != s2.IsExpensive {
diffs = append(diffs, fmt.Sprintf("IsExpensive: %v->%v", s.IsExpensive, s2.IsExpensive))
}
if s.DefaultRouteInterface != s2.DefaultRouteInterface {
diffs = append(diffs, fmt.Sprintf("DefaultRoute: %q->%q", s.DefaultRouteInterface, s2.DefaultRouteInterface))
}
if s.HTTPProxy != s2.HTTPProxy {
diffs = append(diffs, fmt.Sprintf("HTTPProxy: %q->%q", s.HTTPProxy, s2.HTTPProxy))
}
if s.PAC != s2.PAC {
diffs = append(diffs, fmt.Sprintf("PAC: %q->%q", s.PAC, s2.PAC))
}
if len(s.Interface) != len(s2.Interface) {
diffs = append(diffs, fmt.Sprintf("numInterfaces: %d->%d", len(s.Interface), len(s2.Interface)))
}
if len(s.InterfaceIPs) != len(s2.InterfaceIPs) {
diffs = append(diffs, fmt.Sprintf("numInterfaceIPs: %d->%d", len(s.InterfaceIPs), len(s2.InterfaceIPs)))
}
for iname, i := range s.Interface {
i2, ok := s2.Interface[iname]
if !ok {
diffs = append(diffs, fmt.Sprintf("if %s: removed", iname))
continue
}
if !i.Equal(i2) {
if i.Interface != nil && i2.Interface != nil {
if i.Flags != i2.Flags {
diffs = append(diffs, fmt.Sprintf("if %s flags: %v->%v", iname, i.Flags, i2.Flags))
}
if i.MTU != i2.MTU {
diffs = append(diffs, fmt.Sprintf("if %s MTU: %d->%d", iname, i.MTU, i2.MTU))
}
if i.Index != i2.Index {
diffs = append(diffs, fmt.Sprintf("if %s index: %d->%d", iname, i.Index, i2.Index))
}
if !bytes.Equal([]byte(i.HardwareAddr), []byte(i2.HardwareAddr)) {
diffs = append(diffs, fmt.Sprintf("if %s hwaddr: %v->%v", iname, i.HardwareAddr, i2.HardwareAddr))
}
}
if i.Desc != i2.Desc {
diffs = append(diffs, fmt.Sprintf("if %s desc: %q->%q", iname, i.Desc, i2.Desc))
}
}
}
for iname := range s2.Interface {
if _, ok := s.Interface[iname]; !ok {
diffs = append(diffs, fmt.Sprintf("if %s: added", iname))
}
}
for iname, vv := range s.InterfaceIPs {
vv2 := s2.InterfaceIPs[iname]
if !slices.Equal(vv, vv2) {
diffs = append(diffs, fmt.Sprintf("ips %s: %v->%v", iname, vv, vv2))
}
}
for iname := range s2.InterfaceIPs {
if _, ok := s.InterfaceIPs[iname]; !ok {
diffs = append(diffs, fmt.Sprintf("ips %s: added %v", iname, s2.InterfaceIPs[iname]))
}
}
return strings.Join(diffs, "; ")
}
func (s *State) HasPAC() bool { return s != nil && s.PAC != "" }
// AnyInterfaceUp reports whether any interface seems like it has Internet access.