ipn/routecheck: introduce new package for checking peer reachability

The routecheck package parallels the netcheck package, where the
former checks routes and routers while the latter checks networks.
Like netcheck, it compiles reports for other systems to consume.

Historically, the client has never known whether a peer is actually
reachable. Most of the time this doesn’t matter, since the client will
want to establish a WireGuard tunnel to any given destination.
However, if the client needs to choose between two or more nodes,
then it should only choose a node that it can reach.

Suggested exit nodes are one such example, where the client filters
out any nodes that aren’t connected to the control plane. Sometimes an
exit node will get disconnected from the control plane: when the
network between the two is unreliable or when the exit node is too
busy to keep its control connection alive. In these cases, Control
disables the Node.Online flag for the exit node and broadcasts this
across the tailnet. Arguably, the client should never have relied on
this flag, since it only makes sense in the admin console.

This PR implements an initial routecheck client that can probe every
node that your client knows about. You should not ping-scan your
visible tailnet; this method is for debugging only.

Updates #17366
Updates tailscale/corp#33033

Signed-off-by: Simon Law <sfllaw@tailscale.com>
This commit is contained in:
Simon Law
2026-04-01 11:06:18 -07:00
parent 211ef67222
commit 9bfaa54e8c
3 changed files with 279 additions and 0 deletions

View File

@@ -1570,6 +1570,11 @@ func (b *LocalBackend) PeerCaps(src netip.Addr) tailcfg.PeerCapMap {
return b.currentNode().PeerCaps(src)
}
// Peers returns all the current peers in an undefined order.
// It delegates to the node backend that is current at the time of the call.
func (b *LocalBackend) Peers() []tailcfg.NodeView {
	nb := b.currentNode()
	return nb.Peers()
}
func (b *LocalBackend) GetFilterForTest() *filter.Filter {
testenv.AssertInTest()
nb := b.currentNode()

22
ipn/routecheck/log.go Normal file
View File

@@ -0,0 +1,22 @@
// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
package routecheck
import (
"log"
)
// logf writes a formatted message to c.Logf when one is configured,
// and to the standard library's log.Printf otherwise.
func (c *Client) logf(format string, a ...any) {
	if c.Logf == nil {
		log.Printf(format, a...)
		return
	}
	c.Logf(format, a...)
}
// vlogf logs via logf, but only when verbose output is wanted: either
// c.Verbose is set or the debugRoutecheck knob reports true.
// Otherwise it does nothing.
func (c *Client) vlogf(format string, a ...any) {
	if !c.Verbose && !debugRoutecheck() {
		return
	}
	c.logf(format, a...)
}

View File

@@ -0,0 +1,252 @@
// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
// Package routecheck performs status checks for routes from the current host.
package routecheck
import (
"cmp"
"context"
"errors"
"iter"
"math/rand/v2"
"net/netip"
"slices"
"time"
"golang.org/x/sync/errgroup"
"tailscale.com/envknob"
"tailscale.com/ipn"
"tailscale.com/ipn/ipnstate"
"tailscale.com/syncs"
"tailscale.com/tailcfg"
"tailscale.com/types/logger"
"tailscale.com/types/netmap"
"tailscale.com/util/mak"
"tailscale.com/util/set"
)
// Debugging and experimentation tweakables.
var (
// debugRoutecheck is the boolean knob registered under the name
// TS_DEBUG_ROUTECHECK. When it reports true, vlogf logs even if
// Client.Verbose is unset.
debugRoutecheck = envknob.RegisterBool("TS_DEBUG_ROUTECHECK")
)
// Report contains the result of a single routecheck.
type Report struct {
// Now is the time when the report was finished.
// Probe sets it after all of its pings have completed.
Now time.Time
// reachable is the set of nodes that were reachable from the current host
// when this report was compiled. Missing nodes may or may not be reachable.
reachable set.Set[tailcfg.NodeID]
}
// Client generates Reports describing the result of both passive and active
// reachability probing.
//
// Use NewClient to construct one; Verbose and Logf may be set afterwards.
type Client struct {
// Verbose enables verbose logging.
Verbose bool
// Logf optionally specifies where to log to.
// If nil, log.Printf is used.
Logf logger.Logf
// These elements are read-only after initialization.

// b is the backend that supplies the network map and peer list
// and that sends the pings. NewClient rejects a nil value.
b LocalBackend
}
// LocalBackend is the set of backend methods that a Client depends on.
// NewClient's documentation names [ipnlocal.LocalBackend] as the intended
// implementation.
type LocalBackend interface {
// NetMap returns the current network map. Callers in this package
// treat a nil result as "no network map has arrived yet".
NetMap() *netmap.NetworkMap
// Peers returns the peers that Routers and RoutersByPrefix iterate over.
Peers() []tailcfg.NodeView
// Ping pings ip using the given ping type and size. Probe calls it
// with tailcfg.PingTSMP and a size of zero.
Ping(ctx context.Context, ip netip.Addr, pingType tailcfg.PingType, size int) (*ipnstate.PingResult, error)
// WatchNotifications calls fn for notifications selected by mask until
// fn returns false. NOTE(review): presumably it also returns once ctx
// is done — confirm against the implementation.
WatchNotifications(ctx context.Context, mask ipn.NotifyWatchOpt, onWatchAdded func(), fn func(roNotify *ipn.Notify) (keepGoing bool))
}
// NewClient returns a client that probes using this [ipnlocal.LocalBackend].
// It returns an error, and no client, when b is nil.
func NewClient(b LocalBackend) (*Client, error) {
	if b == nil {
		return nil, errors.New("LocalBackend must be set")
	}
	client := &Client{b: b}
	return client, nil
}
// Probe actively probes the sequence of nodes and returns a reachability [Report].
// If limit is positive, it limits the number of concurrent active probes;
// a limit of zero will ping every node at once.
//
// Each node is sent a single TSMP ping, addressed to the first of its
// addresses whose IP version matches one of this host's own addresses.
// Nodes with no such address are not probed. A node is recorded as reachable
// only when its ping completes without an error; failed pings are logged
// verbosely and never abort the remaining probes.
//
// Probe returns an error if the backend has no network map.
func (c *Client) Probe(ctx context.Context, nodes iter.Seq[tailcfg.NodeView], limit int) (*Report, error) {
	nm := c.b.NetMap()
	if nm == nil {
		return nil, errors.New("no network map available")
	}

	// Work out which IP versions this host has an address for.
	var canIPv4, canIPv6 bool
	for _, ip := range nm.SelfNode.Addresses().All() {
		addr := ip.Addr()
		if addr.Is4() {
			canIPv4 = true
		} else if addr.Is6() {
			canIPv6 = true
		}
	}

	g, ctx := errgroup.WithContext(ctx)
	if limit > 0 {
		g.SetLimit(limit)
	}

	var (
		mu syncs.Mutex // guards r.reachable
		r  = &Report{
			reachable: make(set.Set[tailcfg.NodeID]),
		}
	)
	for n := range nodes {
		// Ping one of the tailnet addresses.
		for _, ip := range n.Addresses().All() {
			// Skip this probe if there is an IP version mismatch.
			addr := ip.Addr()
			if addr.Is4() && !canIPv4 {
				continue
			}
			if addr.Is6() && !canIPv6 {
				continue
			}

			g.Go(func() error {
				nid := n.ID()
				pong, err := c.b.Ping(ctx, addr, tailcfg.PingTSMP, 0)
				if err != nil {
					// Returning an error would cancel the errgroup, so
					// log the failure and leave the node unmarked.
					c.vlogf("ping %s (%s): error: %v", addr, nid, err)
					return nil
				}
				c.vlogf("ping %s (%s): result: %f ms (err: %v)", addr, nid, pong.LatencySeconds*1000, pong.Err)
				if pong.Err != "" {
					// The ping finished but reported a failure.
					return nil
				}

				mu.Lock()
				defer mu.Unlock()
				r.reachable.Add(nid)
				return nil
			})
			break // One probe per node is enough.
		}
	}

	// The probe goroutines never return an error, so there is nothing to check.
	g.Wait()
	r.Now = time.Now()
	return r, nil
}
// ProbeAllPeers actively probes all peers in parallel and returns a [Report]
// that identifies which nodes are reachable. If limit is positive, it limits
// the number of concurrent active probes; a limit of zero will ping every
// candidate at once.
//
// It returns an error if no network map could be obtained; when ctx is
// already done, that error is ctx.Err().
func (c *Client) ProbeAllPeers(ctx context.Context, limit int) (*Report, error) {
	nm := c.waitForInitialNetMap(ctx)
	if nm == nil {
		// The wait ended before any network map arrived.
		if err := ctx.Err(); err != nil {
			return nil, err
		}
		return nil, errors.New("no network map available")
	}
	return c.Probe(ctx, slices.Values(nm.Peers), limit)
}
// ProbeAllHARouters actively probes all High Availability routers in parallel
// and returns a [Report] that identifies which of these routers are reachable.
// If limit is positive, it limits the number of concurrent active probes;
// a limit of zero will ping every candidate at once.
//
// It returns an error if no network map could be obtained; when ctx is
// already done, that error is ctx.Err().
func (c *Client) ProbeAllHARouters(ctx context.Context, limit int) (*Report, error) {
	nm := c.waitForInitialNetMap(ctx)
	if nm == nil {
		// The wait ended before any network map arrived.
		if err := ctx.Err(); err != nil {
			return nil, err
		}
		return nil, errors.New("no network map available")
	}

	// When a prefix is routed by multiple nodes, we probe those nodes.
	// There is no point to probing a router when it is the only choice.
	// These nodes are referred to as High Availability (HA) routers.
	var nodes []tailcfg.NodeView
	for _, rs := range c.RoutersByPrefix() {
		if len(rs) <= 1 {
			continue
		}
		nodes = append(nodes, rs...) // Note: this introduces duplicates.
	}

	// Sort by Node.ID and deduplicate to avoid double-probing.
	// CompactFunc returns the shortened slice; it must be assigned back,
	// otherwise nodes keeps its old length and stale tail elements.
	slices.SortFunc(nodes, func(a, b tailcfg.NodeView) int {
		return cmp.Compare(a.ID(), b.ID())
	})
	nodes = slices.CompactFunc(nodes, func(a, b tailcfg.NodeView) bool {
		return a.ID() == b.ID()
	})

	// To prevent swarming, each node should probe in a different order.
	// Seeding from our own node ID keeps the order stable for this host.
	seed := uint64(nm.SelfNode.ID())
	rnd := rand.New(rand.NewPCG(seed, seed))
	rnd.Shuffle(len(nodes), func(i, j int) {
		nodes[i], nodes[j] = nodes[j], nodes[i]
	})

	return c.Probe(ctx, slices.Values(nodes), limit)
}
// waitForInitialNetMap returns the current [netmap.NetworkMap], if present.
// If the backend has none yet, it watches for initial-netmap notifications
// and returns the first network map that one of them carries.
// The result is nil if the watch ends before any network map was delivered.
func (c *Client) waitForInitialNetMap(ctx context.Context) *netmap.NetworkMap {
	if current := c.b.NetMap(); current != nil {
		return current
	}

	// Wait for the initial NetworkMap to arrive.
	var received *netmap.NetworkMap
	c.b.WatchNotifications(ctx, ipn.NotifyInitialNetMap, nil, func(n *ipn.Notify) bool {
		received = n.NetMap
		// Keep watching until a notification carries a network map.
		return received == nil
	})
	return received
}
// Routers returns a sequence of nodes that are routers, which will advertise
// more [tailcfg.Node.AllowedIPs] than the node's own [tailcfg.Node.Addresses].
// Each router is yielded exactly once, no matter how many prefixes it routes.
func (c *Client) Routers() iter.Seq[tailcfg.NodeView] {
	return func(yield func(tailcfg.NodeView) bool) {
		for _, n := range c.b.Peers() {
			// A node is a router when it has at least one allowed prefix
			// that is not one of its own addresses.
			if len(routes(n)) == 0 {
				continue
			}
			if !yield(n) {
				return
			}
		}
	}
}
// RoutersByPrefix returns a map from prefix to the peers that list that
// prefix in their AllowedIPs.
// Nodes that route for /0 prefixes are exit nodes, their subnet is the Internet.
//
// Every AllowedIPs prefix becomes a key, including prefixes that equal one
// of the node's own addresses. The result is nil when there are no peers.
// NOTE(review): the routes helper filters out a node's own addresses;
// confirm whether this map is meant to do the same.
func (c *Client) RoutersByPrefix() map[netip.Prefix][]tailcfg.NodeView {
	var byPrefix map[netip.Prefix][]tailcfg.NodeView
	for _, peer := range c.b.Peers() {
		for _, prefix := range peer.AllowedIPs().All() {
			nodes := append(byPrefix[prefix], peer)
			mak.Set(&byPrefix, prefix, nodes)
		}
	}
	return byPrefix
}
// routes returns a slice of subnets that the given node will route.
// If the node is an exit node, the result will contain at least one /0 prefix.
// If the node is a subnet router, the result will contain a smaller prefix.
// The result omits any prefix that is one of the node's local addresses,
// and is nil when nothing remains.
func routes(n tailcfg.NodeView) []netip.Prefix {
	var out []netip.Prefix
	for _, pfx := range n.AllowedIPs().All() {
		// Routers never forward their own local addresses.
		isOwn := false
		for _, addr := range n.Addresses().All() {
			if addr == pfx {
				isOwn = true
				break
			}
		}
		if !isOwn {
			out = append(out, pfx)
		}
	}
	return out
}
// earlyExit is used to exit early out of a [ipnext.NodeBackend.AppendMatchingPeers] loop.
// It is a sentinel type used by panic and recover.
type earlyExit struct{}