ipn/routecheck: track reachability changes incrementally

The routecheck client will watch the IPN bus for WireGuard engine
updates (ipn.NotifyWatchEngineUpdates), which supply it with status
updates on a regular timer and also immediately when something changes.
The GUI clients already use this mechanism over the Local Client API,
so we aren’t introducing any significant overhead.

Routecheck will track established WireGuard tunnels using these engine
updates. If a tunnel is established and this node is receiving traffic
over that link, then we can conclude that the destination peer is not
just reachable, but actively in use. Due to Cryptokey Routing, we
should prefer active tunnels to avoid breaking existing connections.

Updates #17366
Updates tailscale/corp#33033

Signed-off-by: Simon Law <sfllaw@tailscale.com>
This commit is contained in:
Simon Law
2026-04-01 11:57:25 -07:00
parent 06e08d3e01
commit 4d56dfdba0
2 changed files with 151 additions and 1 deletions

View File

@@ -0,0 +1,119 @@
// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
package routecheck
import (
"context"
"net/netip"
"time"
"tailscale.com/ipn/ipnstate"
"tailscale.com/tailcfg"
"tailscale.com/types/key"
"tailscale.com/types/netmap"
"tailscale.com/util/set"
"tailscale.com/wgengine"
)
// init loads the initial [netmap.NetworkMap], assuming that a peer is reachable
// if it's connected to the control plane, i.e. the peer's Online flag is set.
// That's not necessarily true, but we must make early routing decisions
// before active probing is complete.
//
// Every connector found in nm is recorded in c.nids, whether or not it is
// currently online, so that watch can later resolve the node key of any
// connector back to its node ID. Only online connectors are seeded into the
// initial report. A nil nm is ignored and leaves the client state untouched.
func (c *Client) init(nm *netmap.NetworkMap) {
	if nm == nil {
		return
	}

	r := &Report{
		reachable: make(set.Set[tailcfg.NodeID]),
	}
	nids := make(map[key.NodePublic]tailcfg.NodeID)
	for _, n := range nm.Peers {
		if !n.Valid() {
			continue
		}
		if len(routes(n)) == 0 {
			// Connectors, i.e. exit nodes or subnet routers,
			// are the only nodes that are chosen by reachability.
			// Peers with no routes don't need to be checked.
			continue
		}
		// Remember the key for every connector, not just the online ones:
		// a peer that is offline now may start sending us traffic later.
		nids[n.Key()] = n.ID()
		if n.Online().Get() {
			r.reachable.Add(n.ID())
		}
	}
	r.Now = time.Now()

	c.mu.Lock()
	defer c.mu.Unlock()
	c.report = r
	c.nids = nids
}
// watch compares the previous set of traffic flows to the current ones.
// If we are receiving data from a peer, then we know that it is reachable.
// Otherwise, we will need to actively probe that peer to be sure.
//
// flows is the latest per-peer status, keyed by node public key. It is
// retained in c.flows as the baseline for the next call. A peer that has no
// baseline entry is compared against a zero-valued status, so any non-zero
// RxBytes counts as received traffic. Peers whose key is not in c.nids are
// ignored. If no report has been installed yet (init has not run), only the
// baseline is updated.
func (c *Client) watch(flows map[key.NodePublic]ipnstate.PeerStatusLite) {
	c.mu.Lock()
	defer c.mu.Unlock()

	// TODO: consult the netmap to remove nodes that are gone and add new nodes.
	if c.report != nil {
		for k, s := range flows {
			if c.flows[k].RxBytes == s.RxBytes { // wraparound is possible
				continue
			}
			nid, ok := c.nids[k]
			if !ok {
				// Not a connector we track; don't add the zero NodeID.
				continue
			}
			c.report.reachable.Add(nid)
		}
		c.report.Now = time.Now()
	}
	c.flows = flows
	// TODO: What do I do with good after this? Is this where we set the tripwire?
}
// Report generates and returns a reachability report by either
// passively checking for activity in each node's [ipnstate.PeerStatusLite] or
// by actively probing.
//
// The passive check is not implemented yet: the loop visits every router
// returned by c.RoutersByPrefix and looks up its peer status, but nothing is
// marked reachable, so the returned report currently has an empty reachable
// set and only its Now field is meaningful. The returned error is always nil
// and ctx is currently unused.
func (c *Client) Report(ctx context.Context) (*Report, error) {
	status := c.b.Status().Peer
	r := Report{reachable: make(set.Set[tailcfg.NodeID])}
	for _, peers := range c.RoutersByPrefix() {
		for _, n := range peers {
			if _, ok := r.reachable[n.ID()]; ok {
				continue // Already probed
			}
			if st := status[n.Key()]; st != nil {
				// TODO: Check if the previous status is any good,
				// based on st.RxBytes, st.TxBytes and st.LastHandshake.
			}
		}
	}
	r.Now = time.Now()
	return &r, nil
}
// UpdateReport returns a new [Report] stamped with the current time.
//
// This is a placeholder: ctx, r and routes are accepted but not consulted,
// no probing takes place, and the returned error is always nil.
func (c *Client) UpdateReport(ctx context.Context, r *Report, routes []netip.Prefix) (*Report, error) {
	fresh := new(Report)
	fresh.Now = time.Now()
	return fresh, nil
}
// TODO: The GUIs use something like NotifyWatchEngineUpdates on the ipnbus. We should do something similar, since that will update things every 2 seconds via c.b.pollRequestEngineStatus.
// We should also check ipn.NotifyInitialNetMap to just set Online for everything.

// setWgengineStatus has the shape of a WireGuard engine status callback:
// it receives either a status or an error. Errors and unexpected nil
// statuses are logged and dropped. A valid status is not consumed yet.
func (c *Client) setWgengineStatus(s *wgengine.Status, err error) {
	if err != nil {
		c.logf("wgengine status error: %v", err)
		return
	}
	if s == nil {
		c.logf("[unexpected] non-error wgengine update with status=nil: %v", s)
		return
	}
	// TODO: feed s.Peers into the reachability tracking.
}

View File

@@ -20,6 +20,7 @@
"tailscale.com/ipn/ipnstate"
"tailscale.com/syncs"
"tailscale.com/tailcfg"
"tailscale.com/types/key"
"tailscale.com/types/logger"
"tailscale.com/types/netmap"
"tailscale.com/util/mak"
@@ -52,13 +53,21 @@ type Client struct {
Logf logger.Logf
// These elements are read-only after initialization.
b LocalBackend
b LocalBackend
cancel context.CancelFunc
// The mutex protects the following elements.
mu syncs.Mutex
report *Report
nids map[key.NodePublic]tailcfg.NodeID
flows map[key.NodePublic]ipnstate.PeerStatusLite
}
// LocalBackend is the set of backend methods that [Client] reaches through
// its b field.
// NOTE(review): presumably satisfied by ipnlocal.LocalBackend; confirm.
type LocalBackend interface {
NetMap() *netmap.NetworkMap
Peers() []tailcfg.NodeView
Ping(ctx context.Context, ip netip.Addr, pingType tailcfg.PingType, size int) (*ipnstate.PingResult, error)
// Status is read by Client.Report, which uses the Peer field of the result.
Status() *ipnstate.Status
// WatchNotifications is called by Client.Start with a mask requesting the
// initial netmap and engine updates; fn returns whether to keep watching.
WatchNotifications(ctx context.Context, mask ipn.NotifyWatchOpt, onWatchAdded func(), fn func(roNotify *ipn.Notify) (keepGoing bool))
WhoIs(proto string, ipp netip.AddrPort) (n tailcfg.NodeView, u tailcfg.UserProfile, ok bool)
}
@@ -282,6 +291,28 @@ func routes(n tailcfg.NodeView) []netip.Prefix {
return routes
}
// Start subscribes the client to the [ipnlocal.LocalBackend]'s IPN bus
// to bootstrap with the initial network map and to watch for traffic flows.
//
// A notification carrying a netmap re-seeds the client via init, and one
// carrying engine status feeds its live peers to watch. The handler always
// asks to keep watching. The watch runs under a context derived from ctx,
// whose cancel function is stored in c.cancel for Close to invoke.
// NOTE(review): presumably WatchNotifications blocks until that context is
// done, which would make Start block too; confirm against the backend.
func (c *Client) Start(ctx context.Context) {
	watchCtx, cancel := context.WithCancel(ctx)
	c.cancel = cancel

	mask := ipn.NotifyInitialNetMap | ipn.NotifyWatchEngineUpdates | ipn.NotifyRateLimit
	onNotify := func(n *ipn.Notify) bool {
		if nm := n.NetMap; nm != nil {
			c.init(nm)
		}
		if eng := n.Engine; eng != nil {
			c.watch(eng.LivePeers)
		}
		return true
	}
	c.b.WatchNotifications(watchCtx, mask, nil, onNotify)
}
// Close implements the [io.Closer] interface.
//
// It invokes the cancel function stored in c.cancel, if any. Calling Close
// on a client whose cancel function was never set (for example, because
// Start has not run) is a no-op rather than a nil-function panic.
// The returned error is always nil.
func (c *Client) Close() error {
	if c.cancel != nil {
		c.cancel()
	}
	return nil
}
// earlyExit is used to exit early out of a [ipnext.NodeBackend.AppendMatchingPeers] loop.
// It is a sentinel type used by panic and recover.
type earlyExit struct{}