From 4d56dfdba00a457c52fbb6bd234bba39facf1806 Mon Sep 17 00:00:00 2001 From: Simon Law Date: Wed, 1 Apr 2026 11:57:25 -0700 Subject: [PATCH] ipn/routecheck: track reachability changes incrementally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The routecheck client will watch the IPN bus for WireGuard engine updates, ipn.NotifyWatchEngineUpdates, which supplies it with status updates on a regular timer and also on immediate changes. The GUI clients already use this mechanism over the Local Client API, so we aren’t introducing any significant overhead. Routecheck will track established WireGuard tunnels using these engine updates. If a tunnel is established and this node is receiving traffic over that link, then we can conclude that the destination peer is not just reachable, but actively in use. Due to Cryptokey Routing, we should prefer active tunnels to avoid breaking existing connections. Updates #17366 Updates tailscale/corp#33033 Signed-off-by: Simon Law --- ipn/routecheck/incremental.go | 119 ++++++++++++++++++++++++++++++++++ ipn/routecheck/routecheck.go | 33 +++++++++- 2 files changed, 151 insertions(+), 1 deletion(-) create mode 100644 ipn/routecheck/incremental.go diff --git a/ipn/routecheck/incremental.go b/ipn/routecheck/incremental.go new file mode 100644 index 000000000..4387c469c --- /dev/null +++ b/ipn/routecheck/incremental.go @@ -0,0 +1,119 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +package routecheck + +import ( + "context" + "net/netip" + "time" + + "tailscale.com/ipn/ipnstate" + "tailscale.com/tailcfg" + "tailscale.com/types/key" + "tailscale.com/types/netmap" + "tailscale.com/util/set" + "tailscale.com/wgengine" +) + +// Init loads the initial [netmap.NetworkMap] assuming that a peer is reachable +// if it’s connected to the control plane, i.e. [tailcfg.Hostinfo.Online] is set. 
+// That’s not necessarily true, but we must make early routing decisions
+// before active probing is complete.
+func (c *Client) init(nm *netmap.NetworkMap) {
+	var r = &Report{
+		reachable: make(set.Set[tailcfg.NodeID]),
+	}
+
+	nids := make(map[key.NodePublic]tailcfg.NodeID)
+	for _, n := range nm.Peers {
+		if !n.Valid() {
+			continue
+		}
+		if len(routes(n)) == 0 {
+			// Connectors, i.e. exit nodes or subnet routers,
+			// are the only nodes that are chosen by reachability.
+			// Peers with no routes don’t need to be checked.
+			continue
+		}
+		if n.Online().Get() {
+			r.reachable.Add(n.ID())
+			nids[n.Key()] = n.ID()
+		}
+	}
+	r.Now = time.Now()
+
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	c.report = r
+	c.nids = nids
+}
+
+// watch compares the previous set of traffic flows (c.flows) to the current ones (flows).
+// If we are receiving data from a peer, then we know that it is reachable.
+// Otherwise, we will need to actively probe that peer to be sure.
+func (c *Client) watch(flows map[key.NodePublic]ipnstate.PeerStatusLite) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	// TODO: consult the netmap to remove nodes that are gone and add new nodes.
+
+	prev := c.flows
+	for k, s := range flows { // iterate the new sample; prev holds the old one
+		nid, ok := c.nids[k] // skip peers that init did not record, instead of adding NodeID 0
+		if ok && prev[k].RxBytes != s.RxBytes { // wraparound is possible
+			c.report.reachable.Add(nid)
+		}
+	}
+	c.report.Now = time.Now()
+	c.flows = flows
+
+	// TODO: What do I do with good after this? Is this where we set the tripwire?
+}
+
+// Report generates and returns a reachability report by either
+// passively checking for activity in each node’s [ipnstate.PeerStatusLite] or
+// by actively probing.
+func (c *Client) Report(ctx context.Context) (*Report, error) {
+	status := c.b.Status().Peer
+	r := Report{reachable: make(set.Set[tailcfg.NodeID])}
+	for _, peers := range c.RoutersByPrefix() {
+		for _, n := range peers {
+			nid := n.ID()
+			if _, ok := r.reachable[nid]; ok {
+				continue // Already probed
+			}
+
+			if st := status[n.Key()]; st != nil {
+				// TODO: Check if the previous status is any good.
+				// st.RxBytes, st.TxBytes and st.LastHandshake are the
+				// fields to consult; they are not read yet.
+			}
+		}
+	}
+	r.Now = time.Now()
+	return &r, nil
+}
+
+// UpdateReport is not implemented yet: it ignores r and routes and returns a new [Report] with only Now set.
+func (c *Client) UpdateReport(ctx context.Context, r *Report, routes []netip.Prefix) (*Report, error) {
+	return &Report{
+		Now: time.Now(),
+	}, nil
+}
+
+// TODO: The GUIs use something like NotifyWatchEngineUpdates on the ipnbus. We should do something similar, since that will update things every 2 seconds via c.b.pollRequestEngineStatus.
+// We should also check ipn.NotifyInitialNetMap to just set Online for everything.
+// setWgengineStatus is presumably meant to serve as a wgengine StatusCallback; so far it only logs an error or a nil status.
+func (c *Client) setWgengineStatus(s *wgengine.Status, err error) {
+	if err != nil {
+		c.logf("wgengine status error: %v", err)
+		return
+	}
+	if s == nil {
+		c.logf("[unexpected] non-error wgengine update with status=nil: %v", s)
+		return
+	}
+	// TODO: s.Peers is not consumed yet.
+	_ = s.Peers
+}
diff --git a/ipn/routecheck/routecheck.go b/ipn/routecheck/routecheck.go
index 635260ae3..7d003e4c3 100644
--- a/ipn/routecheck/routecheck.go
+++ b/ipn/routecheck/routecheck.go
@@ -20,6 +20,7 @@
 	"tailscale.com/ipn/ipnstate"
 	"tailscale.com/syncs"
 	"tailscale.com/tailcfg"
+	"tailscale.com/types/key"
 	"tailscale.com/types/logger"
 	"tailscale.com/types/netmap"
 	"tailscale.com/util/mak"
@@ -52,13 +53,21 @@ type Client struct {
 	Logf logger.Logf
 
 	// These elements are read-only after initialization.
-	b LocalBackend
+	b      LocalBackend
+	cancel context.CancelFunc
+
+	// The mutex protects the following elements.
+	mu     syncs.Mutex
+	report *Report
+	nids   map[key.NodePublic]tailcfg.NodeID
+	flows  map[key.NodePublic]ipnstate.PeerStatusLite
 }
 
 type LocalBackend interface {
 	NetMap() *netmap.NetworkMap
 	Peers() []tailcfg.NodeView
 	Ping(ctx context.Context, ip netip.Addr, pingType tailcfg.PingType, size int) (*ipnstate.PingResult, error)
+	Status() *ipnstate.Status
 	WatchNotifications(ctx context.Context, mask ipn.NotifyWatchOpt, onWatchAdded func(), fn func(roNotify *ipn.Notify) (keepGoing bool))
 	WhoIs(proto string, ipp netip.AddrPort) (n tailcfg.NodeView, u tailcfg.UserProfile, ok bool)
 }
@@ -282,6 +291,28 @@ func routes(n tailcfg.NodeView) []netip.Prefix {
 	return routes
 }
 
+// Start registers the client with the [ipnlocal.LocalBackend]’s IPN bus
+// to bootstrap with the initial network map and to watch for traffic flows.
+func (c *Client) Start(ctx context.Context) {
+	ctx, c.cancel = context.WithCancel(ctx)
+	opts := ipn.NotifyInitialNetMap | ipn.NotifyWatchEngineUpdates | ipn.NotifyRateLimit
+	c.b.WatchNotifications(ctx, opts, nil, func(n *ipn.Notify) bool {
+		if n.NetMap != nil {
+			c.init(n.NetMap)
+		}
+		if n.Engine != nil {
+			c.watch(n.Engine.LivePeers)
+		}
+		return true
+	})
+}
+
+// Close implements the [io.Closer] interface. It must not be called before Start, which sets c.cancel.
+func (c *Client) Close() error {
+	c.cancel()
+	return nil
+}
+
 // EarlyExit is used to exit early out of a [ipnext.NodeBackend.AppendMatchingPeers] loop.
 // It is a sentinel type used by panic and recover.
 type earlyExit struct{}