net/routecheck: introduce new package for checking peer reachability

The routecheck package parallels the netcheck package, where the
former checks routes and routers while the latter checks networks.
Like netcheck, it compiles reports for other systems to consume.

Historically, the client has never known whether a peer is actually
reachable. Most of the time this doesn’t matter, since the client will
want to establish a WireGuard tunnel to any given destination.
However, if the client needs to choose between two or more nodes,
then it should try to choose a node that it can reach.

Suggested exit nodes are one such example, where the client filters
out any nodes that aren’t connected to the control plane. Sometimes an
exit node will get disconnected from the control plane: when the
network between the two is unreliable or when the exit node is too
busy to keep its control connection alive. In these cases, Control
disables the Node.Online flag for the exit node and broadcasts this
across the tailnet. Arguably, the client should never have relied on
this flag, since it only makes sense in the admin console.

This patch implements an initial routecheck client that can probe
every node that your client knows about. You should not ping-scan your
visible tailnet; this method is for debugging only.

This patch also introduces a new OnNetMapToggle hook, which fires when
the netmap transitions from nil to non-nil, or vice versa. This
happens either when the client receives its first MapResponse after
connecting to the control plane, or when it clears the netmap while it
is disconnecting. Routecheck uses this to wait for a valid netmap
so it knows which peers to probe.

Updates #17366
Updates tailscale/corp#33033

Signed-off-by: Simon Law <sfllaw@tailscale.com>
This commit is contained in:
Simon Law
2026-04-01 11:06:18 -07:00
parent 988615dbad
commit 881ea46bf4
12 changed files with 1157 additions and 3 deletions

View File

@@ -329,6 +329,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de
tailscale.com/ipn/ipnstate from tailscale.com/client/local+
tailscale.com/ipn/localapi from tailscale.com/ipn/ipnserver+
tailscale.com/ipn/policy from tailscale.com/feature/portlist
tailscale.com/ipn/routecheck from tailscale.com/feature/routecheck
tailscale.com/ipn/store from tailscale.com/cmd/tailscaled+
L tailscale.com/ipn/store/awsstore from tailscale.com/feature/condregister
L tailscale.com/ipn/store/kubestore from tailscale.com/feature/condregister
@@ -378,7 +379,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de
tailscale.com/net/stun from tailscale.com/ipn/localapi+
tailscale.com/net/tlsdial from tailscale.com/control/controlclient+
tailscale.com/net/tlsdial/blockblame from tailscale.com/net/tlsdial
tailscale.com/net/traffic from tailscale.com/ipn/ipnlocal
tailscale.com/net/traffic from tailscale.com/ipn/ipnlocal+
tailscale.com/net/tsaddr from tailscale.com/client/web+
tailscale.com/net/tsdial from tailscale.com/cmd/tailscaled+
💣 tailscale.com/net/tshttpproxy from tailscale.com/feature/useproxy

18
feature/routecheck/ipn.go Normal file
View File

@@ -0,0 +1,18 @@
// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
package routecheck
import (
"tailscale.com/ipn/ipnext"
"tailscale.com/ipn/routecheck"
)
// nodeBackender is a shim that lets an [ipnext.Host]
// be used where a [routecheck.NodeBackender] is required.
type nodeBackender struct {
	ipnext.Host
}

// Compile-time check that the shim satisfies the routecheck interface.
var _ routecheck.NodeBackender = nodeBackender{}

// NodeBackend returns the embedded host's current node backend,
// exposed through the [routecheck.NodeBackend] interface.
func (nb nodeBackender) NodeBackend() routecheck.NodeBackend {
	var cur routecheck.NodeBackend = nb.Host.NodeBackend()
	return cur
}

View File

@@ -12,6 +12,75 @@
// establish a WireGuard session.
package routecheck
import (
"fmt"
"tailscale.com/ipn/ipnext"
"tailscale.com/ipn/routecheck"
"tailscale.com/types/logger"
"tailscale.com/types/netmap"
)
// featureName is the name of the feature implemented by this package.
// It is also the [extension] name and the log prefix.
const featureName = "routecheck"
func init() {
// TODO(sfllaw): Initialize the new routecheck package.
ipnext.RegisterExtension(featureName, func(logf logger.Logf, b ipnext.SafeBackend) (ipnext.Extension, error) {
return &Extension{
logf: logger.WithPrefix(logf, featureName+": "),
backend: b,
}, nil
})
}
// Extension implements the [ipnext.Extension] interface.
type Extension struct {
	// Client is the routecheck client. It is created by Init.
	Client *routecheck.Client

	// logf is the logger handed to the extension constructor,
	// prefixed with the feature name.
	logf logger.Logf
	// backend is the backend handed to the extension constructor.
	backend ipnext.SafeBackend

	// nb wraps the host passed to Init.
	nb nodeBackender
	// nm is backend, asserted to [routecheck.NetMapper] in Init.
	nm routecheck.NetMapper
}

// Compile-time check that *Extension satisfies [ipnext.Extension].
var _ ipnext.Extension = new(Extension)
// Name implements the [ipnext.Extension.Name] interface method.
// It returns featureName.
func (e *Extension) Name() string {
	return featureName
}
// Init implements the [ipnext.Extension.Init] interface method.
//
// It wraps the host in a nodeBackender, requires the backend to implement
// [routecheck.NetMapper], builds the [routecheck.Client] using the engine
// from the backend's system as the pinger, and subscribes to the
// OnNetMapToggle hook so the client learns when a netmap becomes available.
//
// It returns an error if the backend does not implement
// [routecheck.NetMapper] or if the client cannot be constructed.
func (e *Extension) Init(h ipnext.Host) error {
	e.nb = nodeBackender{h}

	nm, ok := e.backend.(routecheck.NetMapper)
	if !ok {
		// Name the interface that is actually asserted above.
		return fmt.Errorf("backend %T does not implement routecheck.NetMapper", e.backend)
	}
	e.nm = nm

	pinger := e.backend.Sys().Engine.Get()
	c, err := routecheck.NewClient(e.logf, e.nb, e.nm, pinger)
	if err != nil {
		return err
	}
	e.Client = c

	h.Hooks().OnNetMapToggle.Add(e.onNetMapToggle)
	return nil
}
// Shutdown implements the [ipnext.Extension.Shutdown] interface method.
// It currently does nothing and always returns nil.
func (e *Extension) Shutdown() error {
	return nil
}
// onNetMapToggle is the OnNetMapToggle hook callback.
// The nm argument is not consulted; instead the current netmap is fetched
// again from the backend, and the client is notified only if it is non-nil.
func (e *Extension) onNetMapToggle(nm *netmap.NetworkMap) {
	current := e.nm.NetMapNoPeers()
	if current == nil {
		return
	}
	e.Client.NetMapAvailable(current)
}

View File

@@ -22,6 +22,7 @@
"tailscale.com/types/key"
"tailscale.com/types/logger"
"tailscale.com/types/mapx"
"tailscale.com/types/netmap"
"tailscale.com/types/views"
"tailscale.com/wgengine/filter"
)
@@ -375,6 +376,12 @@ type Hooks struct {
// is created. It is called with the LocalBackend locked.
NewControlClient feature.Hooks[NewControlClientCallback]
// OnNetMapToggle is called (with LocalBackend.mu held) when the network map
// is toggled from nil to non-nil, or non-nil to nil. This usually happens
// when the client connects to the control plane and receives the initial MapResponse,
// or when the client disconnects and the network map is cleared.
OnNetMapToggle feature.Hooks[func(*netmap.NetworkMap)]
// OnSelfChange is called (with LocalBackend.mu held) when the self node
// changes, including changing to nothing (an invalid view).
OnSelfChange feature.Hooks[func(tailcfg.NodeView)]
@@ -465,10 +472,16 @@ type FilterHooks struct {
//
// It is not a snapshot in time but is locked to a particular node.
type NodeBackend interface {
// Self returns the current node.
Self() tailcfg.NodeView
// AppendMatchingPeers appends all peers that match the predicate
// to the base slice and returns it.
AppendMatchingPeers(base []tailcfg.NodeView, pred func(tailcfg.NodeView) bool) []tailcfg.NodeView
// Peers returns all the current peers.
Peers() []tailcfg.NodeView
// PeerCaps returns the capabilities that src has to this node.
PeerCaps(src netip.Addr) tailcfg.PeerCapMap

View File

@@ -6839,7 +6839,8 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) {
}()
}
oldSelf := b.currentNode().NetMap().SelfNodeOrZero()
oldNetMap := b.currentNode().NetMap()
oldSelf := oldNetMap.SelfNodeOrZero()
b.dialer.SetNetMap(nm)
if ns, ok := b.sys.Netstack.GetOK(); ok {
@@ -6918,6 +6919,12 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) {
}
if oldNetMap != nm && (oldNetMap == nil || nm == nil) {
for _, f := range b.extHost.Hooks().OnNetMapToggle {
f(nm)
}
}
if !oldSelf.Equal(nm.SelfNodeOrZero()) {
for _, f := range b.extHost.Hooks().OnSelfChange {
f(nm.SelfNode)

View File

@@ -161,6 +161,7 @@ func (nb *nodeBackend) Context() context.Context {
return nb.ctx
}
// Self returns the current node.
func (nb *nodeBackend) Self() tailcfg.NodeView {
nb.mu.Lock()
defer nb.mu.Unlock()

32
ipn/routecheck/log.go Normal file
View File

@@ -0,0 +1,32 @@
// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
package routecheck
import (
"log"
"tailscale.com/envknob"
)
// debugRoutecheck is a debugging tweakable registered for the
// TS_DEBUG_ROUTECHECK environment variable; vlogf consults it
// to decide whether to emit verbose logs.
var debugRoutecheck = envknob.RegisterBool("TS_DEBUG_ROUTECHECK")
// logf prints to [Client.Logf], falling back to [log.Printf]
// when no logger has been configured.
// Arguments are handled in the manner of fmt.Printf.
func (c *Client) logf(format string, a ...any) {
	if c.Logf == nil {
		log.Printf(format, a...)
		return
	}
	c.Logf(format, a...)
}
// vlogf logs through logf, but only in debug mode: either [Client.Verbose]
// is set or the TS_DEBUG_ROUTECHECK environment variable is enabled.
// Arguments are handled in the manner of fmt.Printf.
func (c *Client) vlogf(format string, a ...any) {
	if !c.Verbose && !debugRoutecheck() {
		return
	}
	c.logf(format, a...)
}

275
ipn/routecheck/probe.go Normal file
View File

@@ -0,0 +1,275 @@
// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
package routecheck
import (
"cmp"
"context"
"iter"
"net/netip"
"slices"
"time"
"golang.org/x/sync/errgroup"
"tailscale.com/ipn/ipnstate"
"tailscale.com/net/traffic"
"tailscale.com/syncs"
"tailscale.com/tailcfg"
"tailscale.com/util/clientmetric"
"tailscale.com/util/mak"
)
// Client metrics recorded while probing.
var (
	// metricPing counts ping attempts started by probe.
	metricPing = clientmetric.NewCounter("routecheck_ping")
	// metricPingError counts pings that returned an error or no response.
	metricPingError = clientmetric.NewCounter("routecheck_ping_error")
	// metricPingReachable counts pings that received a response.
	metricPingReachable = clientmetric.NewCounter("routecheck_ping_reachable")
	// metricPingTimeout counts pings that were treated as timed out.
	metricPingTimeout = clientmetric.NewCounter("routecheck_ping_timeout")
	// metricProbe counts calls to probe.
	metricProbe = clientmetric.NewCounter("routecheck_probe")
)
// DefaultTimeout is the default time allowed for a response
// before a peer is considered unreachable.
const DefaultTimeout = 4 * time.Second
// probed describes one node that is about to be pinged by probe.
type probed struct {
	id     tailcfg.NodeID // node ID, used as the key in Report.Reachable
	name   string         // node name, copied into the report
	addr   netip.Addr     // address that is pinged
	routes []netip.Prefix // routes recorded for the node in the report
}
// probe pings every node in nodes and returns a [Report] listing the ones
// that answered. If limit is positive, at most limit pings run concurrently.
// Each ping is given up to timeout to respond.
//
// A failed ping never aborts the probe: it is logged (in verbose mode),
// counted in exactly one of the timeout or error metrics, and the node is
// simply left out of the report. The returned error is currently always nil.
func (c *Client) probe(ctx context.Context, nodes iter.Seq[probed], limit int, timeout time.Duration) (*Report, error) {
	metricProbe.Add(1)

	g, ctx := errgroup.WithContext(ctx)
	if limit > 0 {
		g.SetLimit(limit)
	}

	var mu syncs.Mutex // guards r.Reachable
	r := &Report{}

	// TODO(sfllaw): Since the nodes are sorted by priority,
	// where earlier nodes have high traffic-steering scores,
	// it should be possible to deprioritize or skip probes
	// if there are already enough responses for a particular resource.
	// This optimization has not been implemented yet, so all nodes are probed.
	for n := range nodes {
		g.Go(func() error {
			metricPing.Add(1)
			// TODO(sfllaw): Why did we choose Disco ping instead of TSMP ping?
			// After all, a TSMP ping proves that the peer Tailscale node is there
			// and that both nodes know each other's WireGuard keys,
			// while a Disco ping only proves that the peer can be found using DERP.
			// However, TSMP is wrapped in a long-lived WireGuard connection,
			// which is too expensive when generating a reachability report.
			//
			// Since WireGuard connections are established using a single round-trip,
			// there is no existing way to confirm that a WireGuard connection
			// can be established without burdening the peer with lingering state.
			// WireGuard could be extended with a special `handshake_initiation`
			// that only verifies that a connection could be established,
			// requesting this with a sentinel in `handshake_initiation.mac2`.
			// The peer would send a valid but stateless `handshake_response`,
			// using a random ephemeral_private key and not record any state.
			// See https://www.wireguard.com/protocol/ and tailscale/tailscale#19670.
			pong, err := c.ping(ctx, n.addr, tailcfg.PingDisco, timeout)

			// Every failure path returns nil:
			// returning an error would cancel the errgroup.
			switch {
			case err == context.DeadlineExceeded:
				// Ping timed out, so assume that the node is unreachable.
				c.vlogf("ping %s (%s): timed out", n.addr, n.id)
				metricPingTimeout.Add(1)
				return nil
			case err != nil:
				// Any other error is not a timeout and must not be counted as one.
				c.vlogf("ping %s (%s): error: %v", n.addr, n.id, err)
				metricPingError.Add(1)
				return nil
			case pong == nil:
				c.vlogf("ping %s (%s): error: no response", n.addr, n.id)
				metricPingError.Add(1)
				return nil
			}

			// NOTE(review): a response whose Err field is set is still
			// treated as reachable here — confirm that this is intended.
			c.vlogf("ping %s (%s): result: %f ms (err: %v)", n.addr, n.id, pong.LatencySeconds*1000, pong.Err)
			metricPingReachable.Add(1)

			mu.Lock()
			defer mu.Unlock()
			if _, ok := r.Reachable[n.id]; !ok {
				mak.Set(&r.Reachable, n.id, Node{
					ID:     n.id,
					Name:   n.name,
					Addr:   n.addr,
					Routes: n.routes,
				})
			}
			return nil
		})
	}
	// The goroutines above always return nil, so Wait has no error to report.
	g.Wait()
	r.Done = time.Now()
	return r, nil
}
// Probe actively probes the sequence of nodes and returns a reachability [Report].
// If limit is positive, it limits the number of concurrent active probes;
// a limit of zero will ping every node at once.
// A peer is considered unreachable if it doesn't respond within the timeout.
//
// This function will probe nodes in order, so better candidates should be
// sorted earlier in the sequence. This function may use ordering to skip some probes
// if it has discovered enough reachable peers.
//
// For each node, a single address is probed: the first of the node's
// addresses whose IP version the current node also has an address for.
// WireGuard-only nodes are not probed; they are added to the report as
// assumed reachable.
//
// It returns a nil report and a nil error when the current node is invalid
// or has neither an IPv4 nor an IPv6 address.
func (c *Client) Probe(ctx context.Context, nodes iter.Seq[tailcfg.NodeView], limit int, timeout time.Duration) (*Report, error) {
	is4, is6 := supportsIPVersions(c.nb.NodeBackend().Self())
	if is4 == nil && is6 == nil {
		return nil, nil
	}
	addrFor := addrPicker(is4, is6)

	// Assumed nodes are ones that we assume are reachable,
	// because we can't probe nodes that don't understand Disco pings.
	// The slice is filled in as a side effect of iterating dsts below.
	var assumed []tailcfg.NodeView

	var dsts iter.Seq[probed] = func(yield func(probed) bool) {
		for n := range nodes {
			if n.IsWireGuardOnly() {
				assumed = append(assumed, n)
				continue // Probably can't speak Disco or DERP.
			}

			// Probe one of the tailnet addresses.
			addr := addrFor(n)
			if !addr.IsValid() {
				continue // No valid addresses.
			}
			if !yield(probed{
				id:     n.ID(),
				name:   n.Name(),
				addr:   addr,
				routes: routes(n),
			}) {
				return
			}
		}
	}

	// probe consumes dsts, which is what populates assumed.
	r, err := c.probe(ctx, dsts, limit, timeout)
	if err != nil {
		return nil, err
	}

	// Mix in the assumed nodes.
	for _, n := range assumed {
		addr := addrFor(n)
		if !addr.IsValid() {
			continue // No valid addresses.
		}
		id := n.ID()
		if _, ok := r.Reachable[id]; !ok {
			mak.Set(&r.Reachable, id, Node{
				ID:     id,
				Name:   n.Name(),
				Addr:   addr,
				Routes: routes(n),
			})
		}
	}
	return r, nil
}
// ProbeAllHARouters actively probes all High Availability routers in parallel
// and returns a [Report] that identifies which of these routers are reachable.
// If limit is positive, it limits the number of concurrent active probes;
// a limit of zero will ping every candidate at once.
// A peer is considered unreachable if it doesn't respond within the timeout.
//
// It first waits for a network map, and returns the context's error
// if ctx is done before one is available.
func (c *Client) ProbeAllHARouters(ctx context.Context, limit int, timeout time.Duration) (*Report, error) {
	nm, err := c.waitForNetMap(ctx)
	if err != nil {
		return nil, err
	}

	// Only prefixes that are served by more than one node are interesting:
	// there is no point in probing a router when it is the only choice.
	// Nodes that share a prefix are referred to as High Availability (HA) routers.
	var candidates []tailcfg.NodeView
	for _, routers := range c.RoutersByPrefix() {
		if len(routers) > 1 {
			// A node serving several shared prefixes is appended more than once.
			candidates = append(candidates, routers...)
		}
	}

	// Order by Node.ID so that duplicates become adjacent, then drop them
	// to avoid double-probing.
	byID := func(a, b tailcfg.NodeView) int {
		return cmp.Compare(a.ID(), b.ID())
	}
	sameID := func(a, b tailcfg.NodeView) bool {
		return a.ID() == b.ID()
	}
	slices.SortFunc(candidates, byID)
	candidates = slices.CompactFunc(candidates, sameID)

	// Each node should probe starting with the highest scoring node.
	// We use rendezvous hashing to break ties in a consistent manner
	// while still preventing swarming.
	scores := traffic.ScoresFor(nm.SelfNode.ID(), candidates)
	scores.SortNodes(candidates)

	return c.Probe(ctx, slices.Values(candidates), limit, timeout)
}
// ping returns the result of a ping to the peer handling the given IP.
// It returns the context's error, which is [context.DeadlineExceeded]
// when the peer doesn't respond within the timeout.
func (c *Client) ping(ctx context.Context, ip netip.Addr, pingType tailcfg.PingType, timeout time.Duration) (*ipnstate.PingResult, error) {
	ctx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	// Buffered so the callback never blocks, even if it fires
	// after this function has already given up waiting.
	results := make(chan *ipnstate.PingResult, 1)
	deliver := func(res *ipnstate.PingResult) {
		select {
		case results <- res:
		default:
		}
	}
	c.pinger.Ping(ip, pingType, 0, deliver)

	select {
	case <-ctx.Done():
		return nil, ctx.Err()
	case res := <-results:
		return res, nil
	}
}
// supportsIPVersions reports which IP versions the node n has addresses for.
// is4 is non-nil if n has an IPv4 address and is6 is non-nil if it has an
// IPv6 address; each returned predicate reports whether a given address
// belongs to that version. Both are nil when n is not valid.
func supportsIPVersions(n tailcfg.NodeView) (is4, is6 func(netip.Addr) bool) {
	if !n.Valid() {
		return nil, nil
	}
	for _, pfx := range n.Addresses().All() {
		switch a := pfx.Addr(); {
		case a.Is4():
			is4 = netip.Addr.Is4
		case a.Is6():
			is6 = netip.Addr.Is6
		}
		// Stop scanning once both versions have been seen.
		if is4 != nil && is6 != nil {
			break
		}
	}
	return is4, is6
}
// addrPicker returns a function that picks, for a node, its first address
// accepted by either of the given predicates. A nil predicate accepts nothing.
// The returned function yields the zero [netip.Addr] when no address matches.
func addrPicker(is4, is6 func(netip.Addr) bool) func(n tailcfg.NodeView) netip.Addr {
	return func(n tailcfg.NodeView) netip.Addr {
		for _, pfx := range n.Addresses().All() {
			// Find a compatible IP address.
			a := pfx.Addr()
			if (is4 != nil && is4(a)) || (is6 != nil && is6(a)) {
				return a
			}
		}
		return netip.Addr{}
	}
}

61
ipn/routecheck/report.go Normal file
View File

@@ -0,0 +1,61 @@
// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
package routecheck
import (
"context"
"net/netip"
"time"
"tailscale.com/tailcfg"
"tailscale.com/util/clientmetric"
)
var (
	// metricReport counts calls to [Client.Report].
	metricReport = clientmetric.NewCounter("routecheck_report")
)
// Report returns the latest reachability report.
// Returns nil if a report isn't available, which happens during initialization.
//
// At present every call actively probes the High Availability routers;
// if that probe fails, the error is logged and the probe's result is
// returned as is.
func (c *Client) Report() *Report {
	metricReport.Add(1)

	if c.nm.NetMapNoPeers() == nil {
		return nil // The report wasn't available.
	}

	// TODO(sfllaw): Return the latest snapshot produced by background probing.
	const concurrency = 5
	report, err := c.ProbeAllHARouters(context.TODO(), concurrency, DefaultTimeout)
	if err != nil {
		c.logf("reachability report error: %v", err)
	}
	return report
}
// Report contains the result of a single routecheck.
type Report struct {
	// Done is the time when the report was finished.
	Done time.Time

	// Reachable is the set of nodes that were reachable from the current host
	// when this report was compiled. Missing nodes may or may not be reachable.
	// It is keyed by node ID.
	Reachable map[tailcfg.NodeID]Node
}
// Node represents a node in the reachability report.
type Node struct {
	// ID is the node's ID.
	ID tailcfg.NodeID

	// Name is the FQDN of the node.
	// It is also the MagicDNS name for the node.
	// It has a trailing dot.
	// e.g. "host.tail-scale.ts.net."
	Name string

	// Addr is the IP address that was probed.
	Addr netip.Addr

	// Routes are the subnets that the node will route.
	Routes []netip.Prefix
}

View File

@@ -0,0 +1,139 @@
// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
// Package routecheck performs status checks for routes from the current host.
package routecheck
import (
"context"
"errors"
"net/netip"
"sync"
"tailscale.com/ipn/ipnstate"
"tailscale.com/tailcfg"
"tailscale.com/types/logger"
"tailscale.com/types/netmap"
)
// Client generates Reports describing the result of both passive and active
// reachability probing.
type Client struct {
	// Verbose enables verbose logging.
	Verbose bool

	// Logf optionally specifies where to log to.
	// If nil, log.Printf is used.
	Logf logger.Logf

	// These elements are read-only after initialization.
	nb     NodeBackender
	nm     NetMapper
	pinger Pinger

	// netMapAvailable is broadcast when [Client.NetMapAvailable] is called
	// with a non-nil network map, which is meant to happen when the first
	// network map is received after connecting to the control plane.
	// Its Locker is set by NewClient.
	netMapAvailable sync.Cond
}
// NetMapper is the interface that returns the current [netmap.NetworkMap].
//
// The method comments below refer to LocalBackend.
// NOTE(review): presumably they describe the implementation that
// the client is given in production — confirm.
type NetMapper interface {
	// NetMapNoPeers returns the latest cached network map received from
	// controlclient WITHOUT a freshly-built Peers slice.
	//
	// On a tailnet with frequent peer churn the cached netmap's Peers slice
	// can be stale relative to the live per-node-backend peers map; non-Peers
	// fields (SelfNode, DNS, PacketFilter, capabilities, ...) are always
	// current. Use this for any caller that does not need to iterate Peers,
	// since it's O(1) regardless of tailnet size.
	//
	// Returns nil if no network map has been received yet.
	NetMapNoPeers() *netmap.NetworkMap

	// NetMapWithPeers returns the latest network map with the Peers slice
	// populated.
	//
	// Currently this is the same as [LocalBackend.NetMapNoPeers]: the cached
	// netmap's Peers slice may be stale relative to the live per-node-backend
	// peers map. A follow-up change will switch this method to return a
	// freshly-built netmap with up-to-date Peers, at O(N) cost per call.
	// Callers that genuinely need the up-to-date peer set should use this
	// method (and document why) so the upcoming change reaches them.
	//
	// Returns nil if no network map has been received yet.
	NetMapWithPeers() *netmap.NetworkMap
}
// NodeBackender is the interface that returns the current [NodeBackend].
type NodeBackender interface {
	// NodeBackend returns the current node backend.
	NodeBackend() NodeBackend
}
// NodeBackend is an interface to query the current node and its peers.
//
// It is not a snapshot in time but is locked to a particular node.
type NodeBackend interface {
	// Self returns the current node.
	Self() tailcfg.NodeView

	// Peers returns all the current peers.
	Peers() []tailcfg.NodeView
}
// Pinger is the interface that wraps a callback-based Ping method.
//
// Ping is expected to ping the peer handling ip, using the given ping type
// and payload size, and to deliver the outcome by calling cb.
//
// NOTE(review): this comment used to reference
// [tailscale.com/ipn/ipnlocal.LocalBackend.Ping], but the routecheck feature
// passes the engine from the backend's system as the Pinger — confirm which
// implementation this interface is meant to mirror.
type Pinger interface {
	Ping(ip netip.Addr, pingType tailcfg.PingType, size int, cb func(*ipnstate.PingResult))
}
// NewClient returns a client that logs to logf, discovers nodes through nb,
// reads the network map through nm, and probes its peers using pinger.
// It returns an error if nb, nm, or pinger is nil.
func NewClient(logf logger.Logf, nb NodeBackender, nm NetMapper, pinger Pinger) (*Client, error) {
	switch {
	case nb == nil:
		return nil, errors.New("NodeBackender must be set")
	case nm == nil:
		return nil, errors.New("NetMapper must be set")
	case pinger == nil:
		return nil, errors.New("Pinger must be set")
	}

	c := &Client{Logf: logf, nb: nb, nm: nm, pinger: pinger}
	// The condition variable needs a Locker before anyone can wait on it.
	c.netMapAvailable.L = new(sync.Mutex)
	return c, nil
}
// NetMapAvailable tells the client that a network map has become available,
// waking every goroutine blocked in waitForNetMap.
// It does nothing when nm is nil, which means the client disconnected.
//
// The broadcast is issued while holding the condition variable's lock.
// Waiters check for a network map while holding that same lock, so the
// broadcast cannot slip in between a waiter's check and its call to Wait,
// which would otherwise leave the waiter asleep until its context ends.
//
// NOTE(review): this acquires the client's internal lock from within the
// caller's context; confirm that the NetMapper implementation never needs a
// lock that the caller of NetMapAvailable is holding.
func (c *Client) NetMapAvailable(nm *netmap.NetworkMap) {
	if nm == nil {
		return // client disconnected
	}

	cond := &c.netMapAvailable
	if cond.L == nil {
		// Client was not built by NewClient, so nobody can be waiting
		// under a lock; fall back to a plain broadcast.
		cond.Broadcast()
		return
	}
	cond.L.Lock()
	defer cond.L.Unlock()
	cond.Broadcast()
}
// waitForNetMap blocks until the NetMapper returns a non-nil network map,
// and then returns it. If ctx is done before that happens,
// it returns the context's error instead.
func (c *Client) waitForNetMap(ctx context.Context) (*netmap.NetworkMap, error) {
	cond := &c.netMapAvailable

	// Acquire the lock BEFORE registering the cancellation callback.
	// The callback also takes the lock, so it can only broadcast once this
	// goroutine has released it inside Wait. If the callback were registered
	// first, an already-cancelled context could broadcast before Wait is
	// reached, and the wake-up would be lost.
	cond.L.Lock()
	defer cond.L.Unlock()

	stopf := context.AfterFunc(ctx, func() {
		// Lock cond to ensure that Broadcast is called after the Wait below.
		cond.L.Lock()
		defer cond.L.Unlock()
		cond.Broadcast()
	})
	defer stopf()

	for {
		nm := c.nm.NetMapNoPeers()
		if nm != nil {
			return nm, nil
		}
		cond.Wait()
		// Woken either by NetMapAvailable or by the cancellation callback.
		if err := ctx.Err(); err != nil {
			return nil, err
		}
	}
}

View File

@@ -0,0 +1,487 @@
// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
package routecheck_test
import (
"fmt"
"maps"
"net/netip"
"slices"
"testing"
"testing/synctest"
"time"
gcmp "github.com/google/go-cmp/cmp"
gcmpopts "github.com/google/go-cmp/cmp/cmpopts"
"tailscale.com/ipn/ipnstate"
"tailscale.com/ipn/routecheck"
"tailscale.com/net/tsaddr"
"tailscale.com/tailcfg"
"tailscale.com/types/netmap"
"tailscale.com/util/mak"
"tailscale.com/util/set"
)
func TestReport(t *testing.T) {
for _, tc := range []struct {
name string
init bool // true before the netmap has been loaded
peers []tailcfg.NodeView
gone []tailcfg.NodeID // cannot ping these nodes
want []tailcfg.NodeID // Report.Reachable nodes
}{
{
name: "before-netmap",
init: true,
want: nil,
},
{
name: "no-peers",
peers: []tailcfg.NodeView{},
want: []tailcfg.NodeID{},
},
{
name: "no-routers",
peers: []tailcfg.NodeView{
makeNode(1, withName("peer1")),
},
want: []tailcfg.NodeID{},
},
{
name: "no-choice",
peers: []tailcfg.NodeView{
makeNode(11, withName("exit11"), withExitRoutes()),
makeNode(21, withName("subnet21"),
withRoutes(netip.MustParsePrefix("192.168.1.0/24")),
withRoutes(netip.MustParsePrefix("2002:c000:0100::/48"))),
},
want: []tailcfg.NodeID{},
},
{
name: "all-good",
peers: []tailcfg.NodeView{
makeNode(11, withName("exit11"), withExitRoutes()),
makeNode(12, withName("exit12"), withExitRoutes()),
makeNode(21, withName("subnet21"),
withRoutes(netip.MustParsePrefix("192.168.1.0/24")),
withRoutes(netip.MustParsePrefix("2002:c000:0100::/48"))),
makeNode(22, withName("subnet22"),
withRoutes(netip.MustParsePrefix("192.168.1.0/24")),
withRoutes(netip.MustParsePrefix("2002:c000:0100::/48"))),
},
want: []tailcfg.NodeID{11, 12, 21, 22},
},
{
name: "none-good",
peers: []tailcfg.NodeView{
makeNode(11, withName("exit11"), withExitRoutes()),
makeNode(12, withName("exit12"), withExitRoutes()),
makeNode(21, withName("subnet21"),
withRoutes(netip.MustParsePrefix("192.168.1.0/24")),
withRoutes(netip.MustParsePrefix("2002:c000:0100::/48"))),
makeNode(22, withName("subnet22"),
withRoutes(netip.MustParsePrefix("192.168.1.0/24")),
withRoutes(netip.MustParsePrefix("2002:c000:0100::/48"))),
},
gone: []tailcfg.NodeID{11, 12, 21, 22},
want: []tailcfg.NodeID{},
},
{
name: "some-good",
peers: []tailcfg.NodeView{
makeNode(11, withName("exit11"), withExitRoutes()),
makeNode(12, withName("exit12"), withExitRoutes()),
makeNode(21, withName("subnet21"),
withRoutes(netip.MustParsePrefix("192.168.1.0/24")),
withRoutes(netip.MustParsePrefix("2002:c000:0100::/48"))),
makeNode(22, withName("subnet22"),
withRoutes(netip.MustParsePrefix("192.168.1.0/24")),
withRoutes(netip.MustParsePrefix("2002:c000:0100::/48"))),
},
gone: []tailcfg.NodeID{11, 22},
want: []tailcfg.NodeID{12, 21},
},
} {
makeDB := func(nodes []tailcfg.NodeView) map[tailcfg.NodeID]routecheck.Node {
if len(nodes) == 0 {
return nil
}
db := make(map[tailcfg.NodeID]routecheck.Node)
for _, n := range tc.peers {
db[n.ID()] = routecheck.Node{
ID: n.ID(),
Name: n.Name(),
Addr: n.Addresses().At(0).Addr(),
Routes: n.AllowedIPs().AsSlice()[2:],
}
}
return db
}
cmpDiff := func(want, got any) string {
return gcmp.Diff(want, got,
gcmpopts.EquateComparable(netip.Addr{}, netip.Prefix{}))
}
t.Run(tc.name, func(t *testing.T) {
synctest.Test(t, func(t *testing.T) {
// The backend is initialized without a NetMap.
b := newStubBackend(tailcfg.NodeView{}, nil, withGone(tc.gone...))
if !tc.init {
self := makeNode(99, withName("self"))
b = newStubBackend(self, tc.peers, withGone(tc.gone...))
}
c, err := routecheck.NewClient(t.Logf, b, b, b)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
got := c.Report()
now := time.Now() // synctest will freeze time.
var want *routecheck.Report
peers := makeDB(tc.peers)
if !tc.init {
want = &routecheck.Report{
Done: now,
}
for _, nid := range tc.want {
mak.Set(&want.Reachable, nid, peers[nid])
}
}
if diff := cmpDiff(want, got); diff != "" {
t.Errorf("-want +got:\n%s", diff)
}
})
})
}
}
// TestRoutersByPrefix checks that Client.RoutersByPrefix groups the peers
// by the prefixes they route, for exit nodes, subnet routers, and nodes
// that are both.
func TestRoutersByPrefix(t *testing.T) {
	// routersByPrefix is a comparable form of routecheck.RoutersByPrefix
	// that records only node IDs.
	type routersByPrefix map[netip.Prefix][]tailcfg.NodeID

	// simplify reduces each router to its node ID and sorts the IDs,
	// so that results can be compared regardless of router order.
	simplify := func(rs routecheck.RoutersByPrefix) routersByPrefix {
		out := make(routersByPrefix, len(rs))
		for p, ns := range rs {
			for _, n := range ns {
				out[p] = append(out[p], n.ID())
			}
			slices.Sort(out[p])
		}
		return out
	}

	for _, tc := range []struct {
		name  string
		peers []tailcfg.NodeView
		want  routersByPrefix
	}{
		{
			name:  "no-peers",
			peers: []tailcfg.NodeView{},
			want:  routersByPrefix{},
		},
		{
			name: "no-routers",
			peers: []tailcfg.NodeView{
				makeNode(1, withName("peer1")),
			},
			want: routersByPrefix{},
		},
		{
			name: "one-exit-node",
			peers: []tailcfg.NodeView{
				makeNode(1, withName("peer1")),
				makeNode(11, withName("exit11"), withExitRoutes()),
			},
			want: routersByPrefix{
				netip.MustParsePrefix("0.0.0.0/0"): {11},
				netip.MustParsePrefix("::/0"):      {11},
			},
		},
		{
			name: "overlapping-exit-nodes",
			peers: []tailcfg.NodeView{
				makeNode(1, withName("peer1")),
				makeNode(11, withName("exit11"), withExitRoutes()),
				makeNode(12, withName("exit12"), withExitRoutes()),
			},
			want: routersByPrefix{
				netip.MustParsePrefix("0.0.0.0/0"): {11, 12},
				netip.MustParsePrefix("::/0"):      {11, 12},
			},
		},
		{
			name: "one-subnet-router",
			peers: []tailcfg.NodeView{
				makeNode(1, withName("peer1")),
				makeNode(21, withName("subnet21"),
					withRoutes(netip.MustParsePrefix("192.168.1.0/24")),
					withRoutes(netip.MustParsePrefix("2002:c000:0100::/48"))),
			},
			want: routersByPrefix{
				netip.MustParsePrefix("192.168.1.0/24"):      {21},
				netip.MustParsePrefix("2002:c000:0100::/48"): {21},
			},
		},
		{
			name: "overlapping-subnet-routers",
			peers: []tailcfg.NodeView{
				makeNode(1, withName("peer1")),
				makeNode(21, withName("subnet21"),
					withRoutes(netip.MustParsePrefix("192.168.1.0/24")),
					withRoutes(netip.MustParsePrefix("2002:c000:0100::/48"))),
				makeNode(22, withName("subnet22"),
					withRoutes(netip.MustParsePrefix("192.168.1.0/24")),
					withRoutes(netip.MustParsePrefix("2002:c000:0100::/48"))),
			},
			want: routersByPrefix{
				netip.MustParsePrefix("192.168.1.0/24"):      {21, 22},
				netip.MustParsePrefix("2002:c000:0100::/48"): {21, 22},
			},
		},
		{
			name: "disjoint-subnet-routers",
			peers: []tailcfg.NodeView{
				makeNode(1, withName("peer1")),
				makeNode(21, withName("subnet21"),
					withRoutes(netip.MustParsePrefix("192.168.1.0/24")),
					withRoutes(netip.MustParsePrefix("2002:c000:0100::/48"))),
				makeNode(22, withName("subnet22"),
					withRoutes(netip.MustParsePrefix("192.168.2.0/24")),
					withRoutes(netip.MustParsePrefix("2002:c000:0200::/48"))),
			},
			want: routersByPrefix{
				netip.MustParsePrefix("192.168.1.0/24"):      {21},
				netip.MustParsePrefix("2002:c000:0100::/48"): {21},
				netip.MustParsePrefix("192.168.2.0/24"):      {22},
				netip.MustParsePrefix("2002:c000:0200::/48"): {22},
			},
		},
		{
			name: "multiple-routes",
			peers: []tailcfg.NodeView{
				makeNode(1, withName("peer1")),
				makeNode(21, withName("subnet21"),
					withRoutes(netip.MustParsePrefix("192.168.1.0/24")),
					withRoutes(netip.MustParsePrefix("2002:c000:0100::/48")),
					withRoutes(netip.MustParsePrefix("192.168.2.0/24")),
					withRoutes(netip.MustParsePrefix("2002:c000:0200::/48"))),
				makeNode(22, withName("subnet22"),
					withRoutes(netip.MustParsePrefix("192.168.2.0/24")),
					withRoutes(netip.MustParsePrefix("2002:c000:0200::/48")),
					withRoutes(netip.MustParsePrefix("192.168.3.0/24")),
					withRoutes(netip.MustParsePrefix("2002:c000:0300::/48"))),
				makeNode(23, withName("subnet23"),
					withRoutes(netip.MustParsePrefix("192.168.3.0/24")),
					withRoutes(netip.MustParsePrefix("2002:c000:0300::/48")),
					withRoutes(netip.MustParsePrefix("192.168.4.0/24")),
					withRoutes(netip.MustParsePrefix("2002:c000:0400::/48"))),
			},
			want: routersByPrefix{
				netip.MustParsePrefix("192.168.1.0/24"):      {21},
				netip.MustParsePrefix("2002:c000:0100::/48"): {21},
				netip.MustParsePrefix("192.168.2.0/24"):      {21, 22},
				netip.MustParsePrefix("2002:c000:0200::/48"): {21, 22},
				netip.MustParsePrefix("192.168.3.0/24"):      {22, 23},
				netip.MustParsePrefix("2002:c000:0300::/48"): {22, 23},
				netip.MustParsePrefix("192.168.4.0/24"):      {23},
				netip.MustParsePrefix("2002:c000:0400::/48"): {23},
			},
		},
		{
			name: "both-exit-nodes-and-routers",
			peers: []tailcfg.NodeView{
				makeNode(1, withName("peer1")),
				makeNode(11, withName("exit11"), withExitRoutes()),
				makeNode(12, withName("exit12"), withExitRoutes()),
				makeNode(21, withName("subnet21"),
					withRoutes(netip.MustParsePrefix("192.168.1.0/24")),
					withRoutes(netip.MustParsePrefix("2002:c000:0100::/48")),
					withRoutes(netip.MustParsePrefix("192.168.2.0/24")),
					withRoutes(netip.MustParsePrefix("2002:c000:0200::/48"))),
				makeNode(22, withName("subnet22"),
					withRoutes(netip.MustParsePrefix("192.168.2.0/24")),
					withRoutes(netip.MustParsePrefix("2002:c000:0200::/48")),
					withRoutes(netip.MustParsePrefix("192.168.3.0/24")),
					withRoutes(netip.MustParsePrefix("2002:c000:0300::/48"))),
			},
			want: routersByPrefix{
				netip.MustParsePrefix("0.0.0.0/0"):           {11, 12},
				netip.MustParsePrefix("::/0"):                {11, 12},
				netip.MustParsePrefix("192.168.1.0/24"):      {21},
				netip.MustParsePrefix("2002:c000:0100::/48"): {21},
				netip.MustParsePrefix("192.168.2.0/24"):      {21, 22},
				netip.MustParsePrefix("2002:c000:0200::/48"): {21, 22},
				netip.MustParsePrefix("192.168.3.0/24"):      {22},
				netip.MustParsePrefix("2002:c000:0300::/48"): {22},
			},
		},
		{
			name: "mixed-nodes",
			peers: []tailcfg.NodeView{
				makeNode(1, withName("peer1")),
				makeNode(31, withName("router31"),
					withExitRoutes(),
					withRoutes(netip.MustParsePrefix("192.168.1.0/24")),
					withRoutes(netip.MustParsePrefix("2002:c000:0100::/48")),
					withRoutes(netip.MustParsePrefix("192.168.2.0/24")),
					withRoutes(netip.MustParsePrefix("2002:c000:0200::/48"))),
				makeNode(32, withName("router32"),
					withExitRoutes(),
					withRoutes(netip.MustParsePrefix("192.168.2.0/24")),
					withRoutes(netip.MustParsePrefix("2002:c000:0200::/48")),
					withRoutes(netip.MustParsePrefix("192.168.3.0/24")),
					withRoutes(netip.MustParsePrefix("2002:c000:0300::/48"))),
			},
			want: routersByPrefix{
				netip.MustParsePrefix("0.0.0.0/0"):           {31, 32},
				netip.MustParsePrefix("::/0"):                {31, 32},
				netip.MustParsePrefix("192.168.1.0/24"):      {31},
				netip.MustParsePrefix("2002:c000:0100::/48"): {31},
				netip.MustParsePrefix("192.168.2.0/24"):      {31, 32},
				netip.MustParsePrefix("2002:c000:0200::/48"): {31, 32},
				netip.MustParsePrefix("192.168.3.0/24"):      {32},
				netip.MustParsePrefix("2002:c000:0300::/48"): {32},
			},
		},
	} {
		t.Run(tc.name, func(t *testing.T) {
			self := makeNode(99, withName("self"))
			b := newStubBackend(self, tc.peers)
			c, err := routecheck.NewClient(t.Logf, b, b, b)
			if err != nil {
				t.Fatalf("unexpected error: %v", err)
			}

			got := simplify(c.RoutersByPrefix())
			if !maps.EqualFunc(got, tc.want, slices.Equal) {
				t.Errorf("got %+v, want %+v", got, tc.want)
			}
		})
	}
}
// nodeOptFunc mutates a tailcfg.Node while makeNode is building it.
type nodeOptFunc func(*tailcfg.Node)
// makeNode returns a view of a synthetic node for use in tests.
//
// The node's StableID, Name, HomeDERP, and its two addresses (one IPv4 /32
// and one IPv6 /128) are all derived from id. The node starts out online
// and machine-authorized, with AllowedIPs equal to its own addresses.
// Each opt is applied, in order, before the view is taken.
func makeNode(id tailcfg.NodeID, opts ...nodeOptFunc) tailcfg.NodeView {
	v4 := netip.MustParsePrefix(fmt.Sprintf("192.168.0.%d/32", id))
	v6 := netip.MustParsePrefix(fmt.Sprintf("fd7a:115c:a1e0::%d/128", id))
	local := []netip.Prefix{v4, v6}

	n := &tailcfg.Node{
		ID:                id,
		StableID:          tailcfg.StableNodeID(fmt.Sprintf("stable%d", id)),
		Name:              fmt.Sprintf("node%d", id),
		Online:            new(true),
		MachineAuthorized: true,
		HomeDERP:          int(id),
		Addresses:         local,
		AllowedIPs:        local,
	}
	for i := range opts {
		opts[i](n)
	}
	return n.View()
}
// withExitRoutes returns an option that appends every prefix from
// tsaddr.ExitRoutes to the node's AllowedIPs.
func withExitRoutes() nodeOptFunc {
	exit := tsaddr.ExitRoutes()
	return withRoutes(exit...)
}
// withName returns an option that overwrites the node's Name with name.
func withName(name string) nodeOptFunc {
	setName := func(node *tailcfg.Node) { node.Name = name }
	return setName
}
// withRoutes returns an option that appends routes to the node's AllowedIPs,
// after whatever prefixes are already there.
func withRoutes(routes ...netip.Prefix) nodeOptFunc {
	return func(node *tailcfg.Node) {
		allowed := node.AllowedIPs
		allowed = append(allowed, routes...)
		node.AllowedIPs = allowed
	}
}
// Compile-time checks that *stubBackend implements each of the routecheck
// interfaces listed below.
var (
	_ routecheck.NodeBackender = (*stubBackend)(nil)
	_ routecheck.NodeBackend   = (*stubBackend)(nil)
	_ routecheck.NetMapper     = (*stubBackend)(nil)
	_ routecheck.Pinger        = (*stubBackend)(nil)
)
// stubBackend is an in-memory stand-in for the backend that the routecheck
// client talks to. It serves a fixed self node and peer list, and answers
// pings for any peer that has not been marked as gone.
type stubBackend struct {
	// self is returned by Self and used as SelfNode in generated netmaps.
	self tailcfg.NodeView
	// peers is returned by Peers, attached by NetMapWithPeers, and searched by Ping.
	peers []tailcfg.NodeView
	// gone holds the IDs of peers for which Ping never invokes its callback.
	gone set.Set[tailcfg.NodeID]
}
// backendOptFunc mutates a stubBackend while newStubBackend is building it.
type backendOptFunc func(*stubBackend)
// newStubBackend returns a stubBackend serving the given self node and a
// shallow copy of peers, so later changes to the caller's slice do not
// affect the stub. Each opt is applied, in order, before the stub is returned.
func newStubBackend(self tailcfg.NodeView, peers []tailcfg.NodeView, opts ...backendOptFunc) *stubBackend {
	stub := new(stubBackend)
	stub.self = self
	stub.peers = slices.Clone(peers)
	for i := range opts {
		opts[i](stub)
	}
	return stub
}
// NetMapNoPeers returns a fresh netmap containing only the self node.
// It returns nil when the stub's self node is not valid.
func (b *stubBackend) NetMapNoPeers() *netmap.NetworkMap {
	if !b.self.Valid() {
		return nil
	}
	nm := new(netmap.NetworkMap)
	nm.SelfNode = b.self // Peers is deliberately left nil.
	return nm
}
// NetMapWithPeers returns the netmap from NetMapNoPeers with the stub's
// peers attached. It returns nil when NetMapNoPeers does.
func (b *stubBackend) NetMapWithPeers() *netmap.NetworkMap {
	nm := b.NetMapNoPeers()
	if nm == nil {
		return nil
	}
	nm.Peers = b.peers
	return nm
}
// NodeBackend returns the stub itself, which also implements
// routecheck.NodeBackend.
func (b *stubBackend) NodeBackend() routecheck.NodeBackend {
	return b
}
// Self returns the self node the stub was constructed with.
func (b *stubBackend) Self() tailcfg.NodeView {
	return b.self
}
// Peers returns the stub's peer list. The slice is the stub's own, not a copy.
func (b *stubBackend) Peers() []tailcfg.NodeView {
	return b.peers
}
// Ping simulates pinging ip. For every peer address equal to ip whose peer
// has not been marked gone, cb is invoked on a new goroutine with a result
// naming that peer and a fixed latency of 0.01 seconds.
//
// If ip belongs to no peer, or only to gone peers, cb is never called.
// The pingType and size arguments are ignored.
func (b *stubBackend) Ping(ip netip.Addr, pingType tailcfg.PingType, size int, cb func(*ipnstate.PingResult)) {
	for _, peer := range b.peers {
		for _, pfx := range peer.Addresses().All() {
			// Only answer for one of the peer's own addresses, and only
			// while that peer is still around.
			answers := pfx.Addr() == ip && !b.gone.Contains(peer.ID())
			if !answers {
				continue
			}
			res := &ipnstate.PingResult{
				IP:             ip.String(),
				NodeIP:         ip.String(),
				NodeName:       peer.Name(),
				LatencySeconds: 0.01,
			}
			go cb(res)
		}
	}
}
// withGone returns an option that replaces the stub's set of gone peers
// with the given node IDs. Ping never calls back for a gone peer.
func withGone(gone ...tailcfg.NodeID) backendOptFunc {
	return func(stub *stubBackend) {
		ids := set.SetOf(gone)
		stub.gone = ids
	}
}

51
ipn/routecheck/routes.go Normal file
View File

@@ -0,0 +1,51 @@
// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
package routecheck
import (
"net/netip"
"tailscale.com/tailcfg"
"tailscale.com/util/mak"
)
// RoutersByPrefix maps each routed prefix to the nodes that route it.
// A node that routes several prefixes appears under each of them.
type RoutersByPrefix map[netip.Prefix][]tailcfg.NodeView
// RoutersByPrefix returns the client's current peers grouped by the prefixes
// that they route. Nodes that route a /0 prefix are exit nodes; their subnet
// is the Internet. A prefix that is one of a node's own local addresses is
// never listed for that node.
//
// The result is nil when no peer routes anything.
//
// Note: Fallback routes are not supported by design. If a subnet prefix
// contained within another, more general prefix has no reachable routers,
// traffic is still sent to one of those unreachable routers.
// Routers for the general prefix aren't candidates. See tailscale/tailscale#18550.
func (c *Client) RoutersByPrefix() RoutersByPrefix {
	var byPrefix RoutersByPrefix
	peers := c.nb.NodeBackend().Peers()
	for _, peer := range peers {
		for _, pfx := range routes(peer) {
			group := append(byPrefix[pfx], peer)
			mak.Set(&byPrefix, pfx, group)
		}
	}
	return byPrefix
}
// routes returns the subnets that the given node will route: every prefix
// in the node's AllowedIPs that is not also one of its own Addresses.
// If the node is an exit node, the result will contain at least one /0 prefix.
// If the node is a subnet router, the result will contain a smaller prefix.
// The result is nil when nothing is left after removing local addresses.
func routes(n tailcfg.NodeView) []netip.Prefix {
	var out []netip.Prefix
	for _, allowed := range n.AllowedIPs().All() {
		// Routers never forward their own local addresses.
		isLocal := false
		for _, local := range n.Addresses().All() {
			if allowed == local {
				isLocal = true
				break
			}
		}
		if !isLocal {
			out = append(out, allowed)
		}
	}
	return out
}