diff --git a/ipn/ipnlocal/diskcache.go b/ipn/ipnlocal/diskcache.go index 0b1b7b448..4de47cca2 100644 --- a/ipn/ipnlocal/diskcache.go +++ b/ipn/ipnlocal/diskcache.go @@ -54,3 +54,31 @@ func (b *LocalBackend) loadDiskCacheLocked() (om *netmap.NetworkMap, ok bool) { } return nm, true } + +// discardDiskCacheLocked removes a cached network map for the current node, if +// one exists, and disables the cache. +func (b *LocalBackend) discardDiskCacheLocked() { + if !buildfeatures.HasCacheNetMap { + return + } + if b.diskCache.cache == nil { + return // nothing to do, we do not have a cache + } + + // Reaching here, we have a cache directory that needs to be purged. + // Log errors but do not fail for them. + store := netmapcache.FileStore(b.diskCache.dir) + ctx := b.currentNode().Context() + for key, err := range store.List(ctx, "") { + if err != nil { + b.logf("listing cache contents: %v", err) + break + } + if err := store.Remove(ctx, key); err != nil { + b.logf("discarding cache key %q: %v", key, err) + } + } + + b.diskCache.cache = nil // drop reference + b.diskCache.dir = "" +} diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index edeb2967b..5d3bbd36e 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -2609,7 +2609,21 @@ func (b *LocalBackend) startLocked(opts ipn.Options) error { persistv = new(persist.Persist) } - if envknob.Bool("TS_USE_CACHED_NETMAP") { + // At this point we do not yet know whether we are meant to cache netmaps by + // policy (as we have not yet spoken to the control plane). + // + // However, since we do not create or update a netmap cache unless we observe the + // [tailcfg.NodeAttrCacheNetworkMaps] capability, we can use the presence + // of the cached netmap as a signal that we were expected to do so as of the + // last time we updated the cache. + // + // If the policy has (since) changed, a subsequent network map from the control + // plane may remove the attribute, at which point we will drop the cache. 
+ // + // As of 2026-03-25 we require the envknob set to read a cached netmap, with + // the envknob defaulted to true so we can use it as a safety override + // during rollout. + if envknob.BoolDefaultTrue("TS_USE_CACHED_NETMAP") { if nm, ok := b.loadDiskCacheLocked(); ok { logf("loaded netmap from disk cache; %d peers", len(nm.Peers)) b.setControlClientStatusLocked(nil, controlclient.Status{ @@ -6336,11 +6350,6 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) { var login string if nm != nil { login = cmp.Or(profileFromView(nm.UserProfiles[nm.User()]).LoginName, "") - if envknob.Bool("TS_USE_CACHED_NETMAP") { - if err := b.writeNetmapToDiskLocked(nm); err != nil { - b.logf("write netmap to cache: %v", err) - } - } } b.currentNode().SetNetMap(nm) if ms, ok := b.sys.MagicSock.GetOK(); ok { @@ -6434,6 +6443,29 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) { f(b, nm) } } + + // Reaching here, we have successfully applied a new network map, and must + // now (if configured) update the cache. We do this after application to + // reduce the chance we will cache a QoD netmap. + // + // As of 2026-03-25 we require the envknob AND the node attribute to use + // a netmap cache, with the envknob defaulted to true so we can use it as + // a safety override during rollout. + // + // We treat the envknob being false as identical to disabling the feature + // by policy, and clean up the cache on that basis. That ensures we will + // not wind up in a situation where we have a stale cached netmap that is + // not being updated (because of the envknob) and could be read back when + // the node starts up. 
+ if nm != nil { + if b.currentNode().SelfHasCap(tailcfg.NodeAttrCacheNetworkMaps) && envknob.BoolDefaultTrue("TS_USE_CACHED_NETMAP") { + if err := b.writeNetmapToDiskLocked(nm); err != nil { + b.logf("write netmap to cache: %v", err) + } + } else { + b.discardDiskCacheLocked() + } + } } var hookSetNetMapLockedDrive feature.Hook[func(*LocalBackend, *netmap.NetworkMap)] diff --git a/ipn/ipnlocal/local_test.go b/ipn/ipnlocal/local_test.go index bac9e0418..7ca8d8ba0 100644 --- a/ipn/ipnlocal/local_test.go +++ b/ipn/ipnlocal/local_test.go @@ -710,6 +710,118 @@ func TestLoadCachedNetMap(t *testing.T) { } } +func TestUpdateNetMapCache(t *testing.T) { + t.Setenv("TS_USE_CACHED_NETMAP", "1") + + // Set up a cache directory so we can check what happens to it, in response + // to netmap updates. + varRoot := t.TempDir() + cacheDir := filepath.Join(varRoot, "profile-data", "id0", "netmap-cache") + + testMap := &netmap.NetworkMap{ + SelfNode: (&tailcfg.Node{ + Name: "example.ts.net", + User: tailcfg.UserID(1), + Addresses: []netip.Prefix{ + netip.MustParsePrefix("100.2.3.4/32"), + }, + }).View(), + UserProfiles: map[tailcfg.UserID]tailcfg.UserProfileView{ + tailcfg.UserID(1): (&tailcfg.UserProfile{ + ID: 1, + LoginName: "amelie@example.com", + DisplayName: "Amelie du Pangoline", + }).View(), + }, + Peers: []tailcfg.NodeView{ + (&tailcfg.Node{ + ID: 601, + StableID: "n601FAKE", + ComputedName: "some-peer", + User: tailcfg.UserID(1), + Key: makeNodeKeyFromID(601), + Addresses: []netip.Prefix{ + netip.MustParsePrefix("100.2.3.5/32"), + }, + }).View(), + }, + } + + // Make a new backend to which we can send network maps to test that + // netmap caching decisions are made appropriately. 
+ sys := tsd.NewSystem() + e, err := wgengine.NewFakeUserspaceEngine(logger.Discard, + sys.Set, + sys.HealthTracker.Get(), + sys.UserMetricsRegistry(), + sys.Bus.Get(), + ) + if err != nil { + t.Fatalf("Make userspace engine: %v", err) + } + t.Cleanup(e.Close) + sys.Set(e) + sys.Set(new(mem.Store)) + + logf := tstest.WhileTestRunningLogger(t) + clb, err := NewLocalBackend(logf, logid.PublicID{}, sys, 0) + if err != nil { + t.Fatalf("Make local backend: %v", err) + } + t.Cleanup(clb.Shutdown) + clb.SetVarRoot(varRoot) + + pm := must.Get(newProfileManager(new(mem.Store), logf, health.NewTracker(sys.Bus.Get()))) + pm.currentProfile = (&ipn.LoginProfile{ID: "id0"}).View() + clb.pm = pm + if err := clb.Start(ipn.Options{}); err != nil { + t.Fatalf("Start local backend: %v", err) + } + + wantCacheEmpty := func() { + // The cache directory should be empty, as caching is not enabled. + if des, err := os.ReadDir(cacheDir); err != nil { + t.Errorf("List cache directory: %v", err) + } else if len(des) != 0 { + t.Errorf("Cache directory has %d items, want 0\n%+v", len(des), des) + } + } + + // Send the initial network map to the backend. Because the map does not + // include the cache attribute, no cache should be written. + clb.mu.Lock() + clb.setNetMapLocked(testMap) + clb.mu.Unlock() + + wantCacheEmpty() + + // Now enable the netmap caching attribute, and send another update. + // After doing so, the cache should have real data in it. + testMap.AllCaps = set.Of(tailcfg.NodeAttrCacheNetworkMaps) + + clb.mu.Lock() + clb.setNetMapLocked(testMap) + clb.mu.Unlock() + + if des, err := os.ReadDir(cacheDir); err != nil { + t.Errorf("List cache directory: %v", err) + } else if len(des) == 0 { + t.Error("Cache is unexpectedly empty") + } else { + t.Logf("Cache directory has %d entries (OK)", len(des)) + } + + // Now disable the node attribute again, send another update, and verify + // that the cache got cleaned up. 
+ testMap.AllCaps = nil + + clb.mu.Lock() + clb.setNetMapLocked(testMap) + clb.mu.Unlock() + + wantCacheEmpty() +} + func TestConfigureExitNode(t *testing.T) { controlURL := "https://localhost:1/" exitNode1 := makeExitNode(1, withName("node-1"), withDERP(1), withAddresses(netip.MustParsePrefix("100.64.1.1/32"))) diff --git a/net/tstun/wrap.go b/net/tstun/wrap.go index d67a25eeb..1b28eb157 100644 --- a/net/tstun/wrap.go +++ b/net/tstun/wrap.go @@ -1177,7 +1177,7 @@ func (t *Wrapper) filterPacketInboundFromWireGuard(p *packet.Parsed, captHook pa t.injectOutboundPong(p, pingReq) return filter.DropSilently, gro } else if discoKeyAdvert, ok := p.AsTSMPDiscoAdvertisement(); ok { - if buildfeatures.HasCacheNetMap && envknob.Bool("TS_USE_CACHED_NETMAP") { + if buildfeatures.HasCacheNetMap && envknob.BoolDefaultTrue("TS_USE_CACHED_NETMAP") { t.discoKeyAdvertisementPub.Publish(events.DiscoKeyAdvertisement{ Src: discoKeyAdvert.Src, Key: discoKeyAdvert.Key, diff --git a/tailcfg/tailcfg.go b/tailcfg/tailcfg.go index 04389faba..b976dcc47 100644 --- a/tailcfg/tailcfg.go +++ b/tailcfg/tailcfg.go @@ -181,7 +181,8 @@ // - 132: 2026-02-13: client respects [NodeAttrDisableHostsFileUpdates] // - 133: 2026-02-17: client understands [NodeAttrForceRegisterMagicDNSIPv4Only]; MagicDNS IPv6 registered w/ OS by default // - 134: 2026-03-09: Client understands [NodeAttrDisableAndroidBindToActiveNetwork] -const CurrentCapabilityVersion CapabilityVersion = 134 +// - 135: 2026-03-30: Client understands [NodeAttrCacheNetworkMaps] +const CurrentCapabilityVersion CapabilityVersion = 135 // ID is an integer ID for a user, node, or login allocated by the // control plane. @@ -2770,6 +2771,13 @@ type Oauth2Token struct { // See https://github.com/tailscale/tailscale/issues/15404. // TODO(bradfitz): remove this a few releases after 2026-02-16. 
NodeAttrForceRegisterMagicDNSIPv4Only NodeCapability = "force-register-magicdns-ipv4-only" + + // NodeAttrCacheNetworkMaps instructs the node to persistently cache network + // maps and use them to establish peer connectivity on start, if doing so is + // supported by the client and storage is available. When this attribute is + // absent (or removed), a node that supports netmap caching will ignore and + // discard existing cached maps, and will not store any. + NodeAttrCacheNetworkMaps NodeCapability = "cache-network-maps" ) // SetDNSRequest is a request to add a DNS record. diff --git a/wgengine/magicsock/magicsock.go b/wgengine/magicsock/magicsock.go index 5938a3096..6a2e9c39c 100644 --- a/wgengine/magicsock/magicsock.go +++ b/wgengine/magicsock/magicsock.go @@ -4329,7 +4329,7 @@ type NewDiscoKeyAvailable struct { // // We do not need the Conn to be locked, but the endpoint should be. func (c *Conn) maybeSendTSMPDiscoAdvert(de *endpoint) { - if !buildfeatures.HasCacheNetMap || !envknob.Bool("TS_USE_CACHED_NETMAP") { + if !buildfeatures.HasCacheNetMap || !envknob.BoolDefaultTrue("TS_USE_CACHED_NETMAP") { return }