mirror of
https://github.com/tailscale/tailscale.git
synced 2026-04-03 14:13:12 -04:00
tailcfg,ipn/ipnlocal: regulate netmap caching via a node attribute (#19117)
Add a new tailcfg.NodeCapability (NodeAttrCacheNetworkMaps) to control whether a node with support for caching network maps will attempt to do so. Update the capability version to reflect this change (mainly as a safety measure, as the control plane does not currently need to know about it). Use the presence (or absence) of the node attribute to decide whether to create and update a netmap cache for each profile. If caching is disabled, discard the cached data; this allows us to use the presence of a cached netmap as an indicator it should be used (unless explicitly overridden). Add a test that verifies the attribute is respected. Reverse the sense of the environment knob to be true by default, with an override to disable caching at the client regardless what the node attribute says. Move the creation/update of the netmap cache (when enabled) until after successfully applying the network map, to reduce the possibility that we will cache (and thus reuse after a restart) a network map that fails to correctly configure the client. Updates #12639 Change-Id: I1df4dd791fdb485c6472a9f741037db6ed20c47e Signed-off-by: M. J. Fromberger <fromberger@tailscale.com>
This commit is contained in:
@@ -54,3 +54,31 @@ func (b *LocalBackend) loadDiskCacheLocked() (om *netmap.NetworkMap, ok bool) {
|
||||
}
|
||||
return nm, true
|
||||
}
|
||||
|
||||
// discardDiskCacheLocked removes a cached network map for the current node, if
|
||||
// one exists, and disables the cache.
|
||||
func (b *LocalBackend) discardDiskCacheLocked() {
|
||||
if !buildfeatures.HasCacheNetMap {
|
||||
return
|
||||
}
|
||||
if b.diskCache.cache == nil {
|
||||
return // nothing to do, we do not have a cache
|
||||
}
|
||||
|
||||
// Reaching here, we have a cache directory that needs to be purged.
|
||||
// Log errors but do not fail for them.
|
||||
store := netmapcache.FileStore(b.diskCache.dir)
|
||||
ctx := b.currentNode().Context()
|
||||
for key, err := range store.List(ctx, "") {
|
||||
if err != nil {
|
||||
b.logf("listing cache contents: %v", err)
|
||||
break
|
||||
}
|
||||
if err := store.Remove(ctx, key); err != nil {
|
||||
b.logf("discarding cache key %q: %v", key, err)
|
||||
}
|
||||
}
|
||||
|
||||
b.diskCache.cache = nil // drop reference
|
||||
b.diskCache.dir = ""
|
||||
}
|
||||
|
||||
@@ -2609,7 +2609,21 @@ func (b *LocalBackend) startLocked(opts ipn.Options) error {
|
||||
persistv = new(persist.Persist)
|
||||
}
|
||||
|
||||
if envknob.Bool("TS_USE_CACHED_NETMAP") {
|
||||
// At this point we do not yet know whether we are meant to cache netmaps by
|
||||
// policy (as we have not yet spoken to the control plane).
|
||||
//
|
||||
// However, since we do not create or update a netmap cache unless we observe the
|
||||
// [tailcfg.NodeAttrCacheNetworkMaps] capability, we can use the presence
|
||||
// of the cached netmap as a signal that we were expected to do so as of the
|
||||
// last time we updated the cache.
|
||||
//
|
||||
// If the policy has (since) changed, a subsequent network map from the control
|
||||
// plane may remove the attribute, at which point we will drop the cache.
|
||||
//
|
||||
// As of 2026-03-25 we require the envknob set to read a cached netmap, with
|
||||
// the envknob defaulted to true so we can use it as a safety override
|
||||
// during rollout.
|
||||
if envknob.BoolDefaultTrue("TS_USE_CACHED_NETMAP") {
|
||||
if nm, ok := b.loadDiskCacheLocked(); ok {
|
||||
logf("loaded netmap from disk cache; %d peers", len(nm.Peers))
|
||||
b.setControlClientStatusLocked(nil, controlclient.Status{
|
||||
@@ -6336,11 +6350,6 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) {
|
||||
var login string
|
||||
if nm != nil {
|
||||
login = cmp.Or(profileFromView(nm.UserProfiles[nm.User()]).LoginName, "<missing-profile>")
|
||||
if envknob.Bool("TS_USE_CACHED_NETMAP") {
|
||||
if err := b.writeNetmapToDiskLocked(nm); err != nil {
|
||||
b.logf("write netmap to cache: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
b.currentNode().SetNetMap(nm)
|
||||
if ms, ok := b.sys.MagicSock.GetOK(); ok {
|
||||
@@ -6434,6 +6443,29 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) {
|
||||
f(b, nm)
|
||||
}
|
||||
}
|
||||
|
||||
// Reaching here, we have successfully applied a new network map, and must
|
||||
// now (if configured) update the cache. We do this after application to
|
||||
// reduce the chance we will cache a QoD netmap.
|
||||
//
|
||||
// As of 2026-03-25 we require the envknob AND the node attribute to use
|
||||
// a netmap cache, with the envknob defaulted to true so we can use it as
|
||||
// a safety override during rollout.
|
||||
//
|
||||
// We treat the envknob being false as identical to disabling the feature
|
||||
// by policy, and clean up the cache on that basis. That ensures we will
|
||||
// not wind up in a situation where we have a stale cached netmap that is
|
||||
// not being updated (because of the envknob) and could be read back when
|
||||
// the node starts up.
|
||||
if nm != nil {
|
||||
if b.currentNode().SelfHasCap(tailcfg.NodeAttrCacheNetworkMaps) && envknob.BoolDefaultTrue("TS_USE_CACHED_NETMAP") {
|
||||
if err := b.writeNetmapToDiskLocked(nm); err != nil {
|
||||
b.logf("write netmap to cache: %v", err)
|
||||
}
|
||||
} else {
|
||||
b.discardDiskCacheLocked()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var hookSetNetMapLockedDrive feature.Hook[func(*LocalBackend, *netmap.NetworkMap)]
|
||||
|
||||
@@ -710,6 +710,118 @@ func TestLoadCachedNetMap(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateNetMapCache(t *testing.T) {
|
||||
t.Setenv("TS_USE_CACHED_NETMAP", "1")
|
||||
|
||||
// Set up a cache directory so we can check what happens to it, in response
|
||||
// to netmap updates.
|
||||
varRoot := t.TempDir()
|
||||
cacheDir := filepath.Join(varRoot, "profile-data", "id0", "netmap-cache")
|
||||
|
||||
testMap := &netmap.NetworkMap{
|
||||
SelfNode: (&tailcfg.Node{
|
||||
Name: "example.ts.net",
|
||||
User: tailcfg.UserID(1),
|
||||
Addresses: []netip.Prefix{
|
||||
netip.MustParsePrefix("100.2.3.4/32"),
|
||||
},
|
||||
}).View(),
|
||||
UserProfiles: map[tailcfg.UserID]tailcfg.UserProfileView{
|
||||
tailcfg.UserID(1): (&tailcfg.UserProfile{
|
||||
ID: 1,
|
||||
LoginName: "amelie@example.com",
|
||||
DisplayName: "Amelie du Pangoline",
|
||||
}).View(),
|
||||
},
|
||||
Peers: []tailcfg.NodeView{
|
||||
(&tailcfg.Node{
|
||||
ID: 601,
|
||||
StableID: "n601FAKE",
|
||||
ComputedName: "some-peer",
|
||||
User: tailcfg.UserID(1),
|
||||
Key: makeNodeKeyFromID(601),
|
||||
Addresses: []netip.Prefix{
|
||||
netip.MustParsePrefix("100.2.3.5/32"),
|
||||
},
|
||||
}).View(),
|
||||
},
|
||||
}
|
||||
|
||||
// Make a new backend to which we can send network maps to test that
|
||||
// netmap caching decisions are made appropriately.
|
||||
sys := tsd.NewSystem()
|
||||
e, err := wgengine.NewFakeUserspaceEngine(logger.Discard,
|
||||
sys.Set,
|
||||
sys.HealthTracker.Get(),
|
||||
sys.UserMetricsRegistry(),
|
||||
sys.Bus.Get(),
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("Make userspace engine: %v", err)
|
||||
}
|
||||
t.Cleanup(e.Close)
|
||||
sys.Set(e)
|
||||
sys.Set(new(mem.Store))
|
||||
|
||||
logf := tstest.WhileTestRunningLogger(t)
|
||||
clb, err := NewLocalBackend(logf, logid.PublicID{}, sys, 0)
|
||||
if err != nil {
|
||||
t.Fatalf("Make local backend: %v", err)
|
||||
}
|
||||
t.Cleanup(clb.Shutdown)
|
||||
clb.SetVarRoot(varRoot)
|
||||
|
||||
pm := must.Get(newProfileManager(new(mem.Store), logf, health.NewTracker(sys.Bus.Get())))
|
||||
pm.currentProfile = (&ipn.LoginProfile{ID: "id0"}).View()
|
||||
clb.pm = pm
|
||||
if err := clb.Start(ipn.Options{}); err != nil {
|
||||
t.Fatalf("Start local backend: %v", err)
|
||||
}
|
||||
|
||||
wantCacheEmpty := func() {
|
||||
// The cache directory should be empty, as caching is not enabled.
|
||||
if des, err := os.ReadDir(cacheDir); err != nil {
|
||||
t.Errorf("List cache directory: %v", err)
|
||||
} else if len(des) != 0 {
|
||||
t.Errorf("Cache directory has %d items, want 0\n%+v", len(des), des)
|
||||
}
|
||||
}
|
||||
|
||||
// Send the initial network map to the backend. Because the map does not
|
||||
// include the cache attribute, no cache should be written.
|
||||
clb.mu.Lock()
|
||||
clb.setNetMapLocked(testMap)
|
||||
clb.mu.Unlock()
|
||||
|
||||
wantCacheEmpty()
|
||||
|
||||
// Now enable the netmap caching attribute, and send another update.
|
||||
// After doing so, the cache should have real data in it.
|
||||
testMap.AllCaps = set.Of(tailcfg.NodeAttrCacheNetworkMaps)
|
||||
|
||||
clb.mu.Lock()
|
||||
clb.setNetMapLocked(testMap)
|
||||
clb.mu.Unlock()
|
||||
|
||||
if des, err := os.ReadDir(cacheDir); err != nil {
|
||||
t.Errorf("List cache directory: %v", err)
|
||||
} else if len(des) == 0 {
|
||||
t.Error("Cache is unexpectedly empty")
|
||||
} else {
|
||||
t.Logf("Cache directory has %d entries (OK)", len(des))
|
||||
}
|
||||
|
||||
// Now disable the node attribute again, send another update, and verify
|
||||
// that the cache got cleaned up.
|
||||
testMap.AllCaps = nil
|
||||
|
||||
clb.mu.Lock()
|
||||
clb.setNetMapLocked(testMap)
|
||||
clb.mu.Unlock()
|
||||
|
||||
wantCacheEmpty()
|
||||
}
|
||||
|
||||
func TestConfigureExitNode(t *testing.T) {
|
||||
controlURL := "https://localhost:1/"
|
||||
exitNode1 := makeExitNode(1, withName("node-1"), withDERP(1), withAddresses(netip.MustParsePrefix("100.64.1.1/32")))
|
||||
|
||||
@@ -1177,7 +1177,7 @@ func (t *Wrapper) filterPacketInboundFromWireGuard(p *packet.Parsed, captHook pa
|
||||
t.injectOutboundPong(p, pingReq)
|
||||
return filter.DropSilently, gro
|
||||
} else if discoKeyAdvert, ok := p.AsTSMPDiscoAdvertisement(); ok {
|
||||
if buildfeatures.HasCacheNetMap && envknob.Bool("TS_USE_CACHED_NETMAP") {
|
||||
if buildfeatures.HasCacheNetMap && envknob.BoolDefaultTrue("TS_USE_CACHED_NETMAP") {
|
||||
t.discoKeyAdvertisementPub.Publish(events.DiscoKeyAdvertisement{
|
||||
Src: discoKeyAdvert.Src,
|
||||
Key: discoKeyAdvert.Key,
|
||||
|
||||
@@ -181,7 +181,8 @@
|
||||
// - 132: 2026-02-13: client respects [NodeAttrDisableHostsFileUpdates]
|
||||
// - 133: 2026-02-17: client understands [NodeAttrForceRegisterMagicDNSIPv4Only]; MagicDNS IPv6 registered w/ OS by default
|
||||
// - 134: 2026-03-09: Client understands [NodeAttrDisableAndroidBindToActiveNetwork]
|
||||
const CurrentCapabilityVersion CapabilityVersion = 134
|
||||
// - 135: 2026-03-30: Client understands [NodeAttrCacheNetworkMaps]
|
||||
const CurrentCapabilityVersion CapabilityVersion = 135
|
||||
|
||||
// ID is an integer ID for a user, node, or login allocated by the
|
||||
// control plane.
|
||||
@@ -2770,6 +2771,13 @@ type Oauth2Token struct {
|
||||
// See https://github.com/tailscale/tailscale/issues/15404.
|
||||
// TODO(bradfitz): remove this a few releases after 2026-02-16.
|
||||
NodeAttrForceRegisterMagicDNSIPv4Only NodeCapability = "force-register-magicdns-ipv4-only"
|
||||
|
||||
// NodeAttrCacheNetworkMaps instructs the node to persistently cache network
|
||||
// maps and use them to establish peer connectivity on start, if doing so is
|
||||
// supported by the client and storage is available. When this attribute is
|
||||
// absent (or removed), a node that supports netmap caching will ignore and
|
||||
// discard existing cached maps, and will not store any.
|
||||
NodeAttrCacheNetworkMaps NodeCapability = "cache-network-maps"
|
||||
)
|
||||
|
||||
// SetDNSRequest is a request to add a DNS record.
|
||||
|
||||
@@ -4329,7 +4329,7 @@ type NewDiscoKeyAvailable struct {
|
||||
//
|
||||
// We do not need the Conn to be locked, but the endpoint should be.
|
||||
func (c *Conn) maybeSendTSMPDiscoAdvert(de *endpoint) {
|
||||
if !buildfeatures.HasCacheNetMap || !envknob.Bool("TS_USE_CACHED_NETMAP") {
|
||||
if !buildfeatures.HasCacheNetMap || !envknob.BoolDefaultTrue("TS_USE_CACHED_NETMAP") {
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user