From 881ea46bf4a7cead6dc0b48a6c89544da949aca2 Mon Sep 17 00:00:00 2001 From: Simon Law Date: Wed, 1 Apr 2026 11:06:18 -0700 Subject: [PATCH] ipn/routecheck: introduce new package for checking peer reachability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The routecheck package parallels the netcheck package, where the former checks routes and routers while the latter checks networks. Like netcheck, it compiles reports for other systems to consume. Historically, the client has never known whether a peer is actually reachable. Most of the time this doesn’t matter, since the client will want to establish a WireGuard tunnel to any given destination. However, if the client needs to choose between two or more nodes, then it should try to choose a node that it can reach. Suggested exit nodes are one such example, where the client filters out any nodes that aren’t connected to the control plane. Sometimes an exit node will get disconnected from the control plane: when the network between the two is unreliable or when the exit node is too busy to keep its control connection alive. In these cases, Control disables the Node.Online flag for the exit node and broadcasts this across the tailnet. Arguably, the client should never have relied on this flag, since it only makes sense in the admin console. This patch implements an initial routecheck client that can probe every node that your client knows about. You should not ping scan your visible tailnet; this method is for debugging only. This patch also introduces a new OnNetMapToggle hook, which fires when the netmap transitions from nil to non-nil, or vice versa. This happens either when the client receives its first MapResponse after connecting to the control plane, or when it clears the netmap while it is disconnecting. Routecheck uses this to wait for a valid netmap so it knows which peers to probe.
Updates #17366 Updates tailscale/corp#33033 Signed-off-by: Simon Law --- cmd/tailscaled/depaware.txt | 3 +- feature/routecheck/ipn.go | 18 ++ feature/routecheck/routecheck.go | 71 ++++- ipn/ipnext/ipnext.go | 13 + ipn/ipnlocal/local.go | 9 +- ipn/ipnlocal/node_backend.go | 1 + ipn/routecheck/log.go | 32 ++ ipn/routecheck/probe.go | 275 +++++++++++++++++ ipn/routecheck/report.go | 61 ++++ ipn/routecheck/routecheck.go | 139 +++++++++ ipn/routecheck/routecheck_test.go | 487 ++++++++++++++++++++++++++++++ ipn/routecheck/routes.go | 51 ++++ 12 files changed, 1157 insertions(+), 3 deletions(-) create mode 100644 feature/routecheck/ipn.go create mode 100644 ipn/routecheck/log.go create mode 100644 ipn/routecheck/probe.go create mode 100644 ipn/routecheck/report.go create mode 100644 ipn/routecheck/routecheck.go create mode 100644 ipn/routecheck/routecheck_test.go create mode 100644 ipn/routecheck/routes.go diff --git a/cmd/tailscaled/depaware.txt b/cmd/tailscaled/depaware.txt index 88c433757..3068c85b1 100644 --- a/cmd/tailscaled/depaware.txt +++ b/cmd/tailscaled/depaware.txt @@ -329,6 +329,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de tailscale.com/ipn/ipnstate from tailscale.com/client/local+ tailscale.com/ipn/localapi from tailscale.com/ipn/ipnserver+ tailscale.com/ipn/policy from tailscale.com/feature/portlist + tailscale.com/ipn/routecheck from tailscale.com/feature/routecheck tailscale.com/ipn/store from tailscale.com/cmd/tailscaled+ L tailscale.com/ipn/store/awsstore from tailscale.com/feature/condregister L tailscale.com/ipn/store/kubestore from tailscale.com/feature/condregister @@ -378,7 +379,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de tailscale.com/net/stun from tailscale.com/ipn/localapi+ tailscale.com/net/tlsdial from tailscale.com/control/controlclient+ tailscale.com/net/tlsdial/blockblame from tailscale.com/net/tlsdial - tailscale.com/net/traffic from 
tailscale.com/ipn/ipnlocal + tailscale.com/net/traffic from tailscale.com/ipn/ipnlocal+ tailscale.com/net/tsaddr from tailscale.com/client/web+ tailscale.com/net/tsdial from tailscale.com/cmd/tailscaled+ 💣 tailscale.com/net/tshttpproxy from tailscale.com/feature/useproxy diff --git a/feature/routecheck/ipn.go b/feature/routecheck/ipn.go new file mode 100644 index 000000000..4cc62e9a2 --- /dev/null +++ b/feature/routecheck/ipn.go @@ -0,0 +1,18 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +package routecheck + +import ( + "tailscale.com/ipn/ipnext" + "tailscale.com/ipn/routecheck" +) + +// nodeBackender is a shim between [ipnext.Host] and [routecheck.NodeBackender]. +type nodeBackender struct{ ipnext.Host } + +var _ routecheck.NodeBackender = nodeBackender{} + +func (nb nodeBackender) NodeBackend() routecheck.NodeBackend { + return nb.Host.NodeBackend() +} diff --git a/feature/routecheck/routecheck.go b/feature/routecheck/routecheck.go index 055ceb379..fc9bf9bac 100644 --- a/feature/routecheck/routecheck.go +++ b/feature/routecheck/routecheck.go @@ -12,6 +12,75 @@ // establish a WireGuard session. package routecheck +import ( + "fmt" + + "tailscale.com/ipn/ipnext" + "tailscale.com/ipn/routecheck" + "tailscale.com/types/logger" + "tailscale.com/types/netmap" +) + +// featureName is the name of the feature implemented by this package. +// It is also the [extension] name and the log prefix. +const featureName = "routecheck" + func init() { - // TODO(sfllaw): Initialize the new routecheck package. + ipnext.RegisterExtension(featureName, func(logf logger.Logf, b ipnext.SafeBackend) (ipnext.Extension, error) { + return &Extension{ + logf: logger.WithPrefix(logf, featureName+": "), + backend: b, + }, nil + }) +} + +// Extension implements the [ipnext.Extension] interface.
+type Extension struct { + Client *routecheck.Client + + logf logger.Logf + backend ipnext.SafeBackend + nb nodeBackender + nm routecheck.NetMapper +} + +var _ ipnext.Extension = new(Extension) + +// Name implements the [ipnext.Extension.Name] interface method. +func (e *Extension) Name() string { + return featureName +} + +// Init implements the [ipnext.Extension.Init] interface method. +func (e *Extension) Init(h ipnext.Host) error { + e.nb = nodeBackender{h} + + nm, ok := e.backend.(routecheck.NetMapper) + if !ok { + return fmt.Errorf("backend %T does not implement routecheck.NetMapper", e.backend) + } + e.nm = nm + + pinger := e.backend.Sys().Engine.Get() + + c, err := routecheck.NewClient(e.logf, e.nb, e.nm, pinger) + if err != nil { + return err + } + e.Client = c + + h.Hooks().OnNetMapToggle.Add(e.onNetMapToggle) + + return nil +} + +// Shutdown implements the [ipnext.Extension.Shutdown] interface method. +func (e *Extension) Shutdown() error { + return nil +} + +func (e *Extension) onNetMapToggle(_ *netmap.NetworkMap) { + if nm := e.nm.NetMapNoPeers(); nm != nil { + e.Client.NetMapAvailable(nm) + } } diff --git a/ipn/ipnext/ipnext.go b/ipn/ipnext/ipnext.go index 5ca50498a..1af259aac 100644 --- a/ipn/ipnext/ipnext.go +++ b/ipn/ipnext/ipnext.go @@ -22,6 +22,7 @@ "tailscale.com/types/key" "tailscale.com/types/logger" "tailscale.com/types/mapx" + "tailscale.com/types/netmap" "tailscale.com/types/views" "tailscale.com/wgengine/filter" ) @@ -375,6 +376,12 @@ type Hooks struct { // is created. It is called with the LocalBackend locked. NewControlClient feature.Hooks[NewControlClientCallback] + // OnNetMapToggle is called (with LocalBackend.mu held) when the network map + // is toggled from nil to non-nil, or non-nil to nil. This usually happens + // when the client connects to the control plane and receives the initial MapResponse, + // or when the client disconnects and the network map is cleared.
+ OnNetMapToggle feature.Hooks[func(*netmap.NetworkMap)] + // OnSelfChange is called (with LocalBackend.mu held) when the self node // changes, including changing to nothing (an invalid view). OnSelfChange feature.Hooks[func(tailcfg.NodeView)] @@ -465,10 +472,16 @@ type FilterHooks struct { // // It is not a snapshot in time but is locked to a particular node. type NodeBackend interface { + // Self returns the current node. + Self() tailcfg.NodeView + // AppendMatchingPeers appends all peers that match the predicate // to the base slice and returns it. AppendMatchingPeers(base []tailcfg.NodeView, pred func(tailcfg.NodeView) bool) []tailcfg.NodeView + // Peers returns all the current peers. + Peers() []tailcfg.NodeView + // PeerCaps returns the capabilities that src has to this node. PeerCaps(src netip.Addr) tailcfg.PeerCapMap diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index 9b9ea58fb..d87e696f2 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -6839,7 +6839,8 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) { }() } - oldSelf := b.currentNode().NetMap().SelfNodeOrZero() + oldNetMap := b.currentNode().NetMap() + oldSelf := oldNetMap.SelfNodeOrZero() b.dialer.SetNetMap(nm) if ns, ok := b.sys.Netstack.GetOK(); ok { @@ -6918,6 +6919,12 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) { } + if oldNetMap != nm && (oldNetMap == nil || nm == nil) { + for _, f := range b.extHost.Hooks().OnNetMapToggle { + f(nm) + } + } + if !oldSelf.Equal(nm.SelfNodeOrZero()) { for _, f := range b.extHost.Hooks().OnSelfChange { f(nm.SelfNode) diff --git a/ipn/ipnlocal/node_backend.go b/ipn/ipnlocal/node_backend.go index 3c21ff2a8..087d9df3f 100644 --- a/ipn/ipnlocal/node_backend.go +++ b/ipn/ipnlocal/node_backend.go @@ -161,6 +161,7 @@ func (nb *nodeBackend) Context() context.Context { return nb.ctx } +// Self returns the current node. 
func (nb *nodeBackend) Self() tailcfg.NodeView { nb.mu.Lock() defer nb.mu.Unlock() diff --git a/ipn/routecheck/log.go b/ipn/routecheck/log.go new file mode 100644 index 000000000..0a92cefe9 --- /dev/null +++ b/ipn/routecheck/log.go @@ -0,0 +1,32 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +package routecheck + +import ( + "log" + + "tailscale.com/envknob" +) + +// Debugging tweakable. +var debugRoutecheck = envknob.RegisterBool("TS_DEBUG_ROUTECHECK") + +// logf calls [Client.Logf] to print to a logger, or log.Printf if Logf is nil. +// Arguments are handled in the manner of fmt.Printf. +func (c *Client) logf(format string, a ...any) { + if c.Logf != nil { + c.Logf(format, a...) + } else { + log.Printf(format, a...) + } +} + +// vlogf calls logf only when verbose logging is enabled, which is when +// [Client.Verbose] is true or the TS_DEBUG_ROUTECHECK environment variable is set. +// Arguments are handled in the manner of fmt.Printf. +func (c *Client) vlogf(format string, a ...any) { + if c.Verbose || debugRoutecheck() { + c.logf(format, a...)
+ } +} diff --git a/ipn/routecheck/probe.go b/ipn/routecheck/probe.go new file mode 100644 index 000000000..1fb577928 --- /dev/null +++ b/ipn/routecheck/probe.go @@ -0,0 +1,284 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +package routecheck + +import ( + "cmp" + "context" + "iter" + "net/netip" + "slices" + "time" + + "golang.org/x/sync/errgroup" + "tailscale.com/ipn/ipnstate" + "tailscale.com/net/traffic" + "tailscale.com/syncs" + "tailscale.com/tailcfg" + "tailscale.com/util/clientmetric" + "tailscale.com/util/mak" +) + +var ( + metricPing = clientmetric.NewCounter("routecheck_ping") + metricPingError = clientmetric.NewCounter("routecheck_ping_error") + metricPingReachable = clientmetric.NewCounter("routecheck_ping_reachable") + metricPingTimeout = clientmetric.NewCounter("routecheck_ping_timeout") + metricProbe = clientmetric.NewCounter("routecheck_probe") +) + +// DefaultTimeout is the default time allowed for a response before a peer is considered unreachable. +const DefaultTimeout = 4 * time.Second + +type probed struct { + id tailcfg.NodeID + name string + addr netip.Addr + routes []netip.Prefix +} + +// probe pings every node in the sequence using a Disco ping and returns a [Report] +// listing the nodes that responded. If limit is positive, at most limit pings run concurrently. +// Failed or timed-out pings are logged and counted, never returned as errors. +func (c *Client) probe(ctx context.Context, nodes iter.Seq[probed], limit int, timeout time.Duration) (*Report, error) { + metricProbe.Add(1) + + g, ctx := errgroup.WithContext(ctx) + if limit > 0 { + g.SetLimit(limit) + } + + var mu syncs.Mutex + r := &Report{} + + // TODO(sfllaw): Since the nodes are sorted by priority, + // where earlier nodes have high traffic-steering scores, + // it should be possible to deprioritize or skip probes + // if there are already enough responses for a particular resource. + // This optimization has not been implemented yet, so all nodes are probed. + for n := range nodes { + g.Go(func() error { + metricPing.Add(1) + // TODO(sfllaw): Why did we choose Disco ping instead of TSMP ping?
+ // After all, a TSMP ping proves that the peer Tailscale node is there + // and that both nodes know each other’s WireGuard keys, + // while a Disco ping only proves that the peer can be found using DERP. + // However, TSMP is wrapped in a long-lived WireGuard connection, + // which is too expensive when generating a reachability report. + // + // Since WireGuard connections are established using a single round-trip, + // there is no existing way to confirm that a WireGuard connection + // can be established without burdening the peer with lingering state. + // WireGuard could be extended with a special `handshake_initiation` + // that only verifies that a connection could be established, + // requesting this with a sentinel in `handshake_initiation.mac2`. + // The peer would send a valid but stateless `handshake_response`, + // using a random ephemeral_private key and not record any state. + // See https://www.wireguard.com/protocol/ and tailscale/tailscale#19670. + pong, err := c.ping(ctx, n.addr, tailcfg.PingDisco, timeout) + if err != nil { + // Returning an error would cancel the errgroup. + if err != context.DeadlineExceeded { + c.vlogf("ping %s (%s): error: %v", n.addr, n.id, err) + metricPingError.Add(1) + return nil + } + // Ping timed out, so assume that the node is unreachable.
+ c.vlogf("ping %s (%s): timed out", n.addr, n.id) + metricPingTimeout.Add(1) + return nil + } else if pong == nil { + c.vlogf("ping %s (%s): error: no response", n.addr, n.id) + metricPingError.Add(1) + return nil + } else if pong.Err != "" { + // The pinger reported a failure, so the node was not reached. + c.vlogf("ping %s (%s): error: %s", n.addr, n.id, pong.Err) + metricPingError.Add(1) + return nil + } else { + c.vlogf("ping %s (%s): result: %f ms (err: %v)", n.addr, n.id, pong.LatencySeconds*1000, pong.Err) + metricPingReachable.Add(1) + } + + mu.Lock() + defer mu.Unlock() + if _, ok := r.Reachable[n.id]; !ok { + mak.Set(&r.Reachable, n.id, Node{ + ID: n.id, + Name: n.name, + Addr: n.addr, + Routes: n.routes, + }) + } + return nil + }) + } + g.Wait() + r.Done = time.Now() + return r, nil +} + +// Probe actively probes the sequence of nodes and returns a reachability [Report]. +// If limit is positive, it limits the number of concurrent active probes; +// a limit of zero will ping every node at once. +// A peer is considered unreachable if it doesn’t respond within the timeout. +// +// This function will probe nodes in order, so better candidates should be +// sorted earlier in the sequence. This function may use ordering to skip some probes +// if it has discovered enough reachable peers. +// +// This function probes one tailnet address per node, picked from the IP versions that the current node has. +func (c *Client) Probe(ctx context.Context, nodes iter.Seq[tailcfg.NodeView], limit int, timeout time.Duration) (*Report, error) { + is4, is6 := supportsIPVersions(c.nb.NodeBackend().Self()) + if is4 == nil && is6 == nil { + return nil, nil + } + addrFor := addrPicker(is4, is6) + + // Assumed nodes are ones that we assume are reachable, + // because we can’t probe nodes that don’t understand Disco pings. + var assumed []tailcfg.NodeView + + var dsts iter.Seq[probed] = func(yield func(probed) bool) { + for n := range nodes { + if n.IsWireGuardOnly() { + assumed = append(assumed, n) + continue // Probably can’t speak Disco or DERP. + } + + // Probe one of the tailnet addresses. + addr := addrFor(n) + if !addr.IsValid() { + continue // No valid addresses.
+ } + if !yield(probed{ + id: n.ID(), + name: n.Name(), + addr: addr, + routes: routes(n), + }) { + return + } + } + } + + r, err := c.probe(ctx, dsts, limit, timeout) + if err != nil { + return nil, err + } + + // Mix in the assumed nodes. + for _, n := range assumed { + addr := addrFor(n) + if !addr.IsValid() { + continue // No valid addresses. + } + id := n.ID() + if _, ok := r.Reachable[id]; !ok { + mak.Set(&r.Reachable, id, Node{ + ID: id, + Name: n.Name(), + Addr: addr, + Routes: routes(n), + }) + } + } + return r, nil +} + +// ProbeAllHARouters actively probes all High Availability routers in parallel +// and returns a [Report] that identifies which of these routers are reachable. +// If limit is positive, it limits the number of concurrent active probes; +// a limit of zero will ping every candidate at once. +// A peer is considered unreachable if it doesn’t respond within the timeout. +func (c *Client) ProbeAllHARouters(ctx context.Context, limit int, timeout time.Duration) (*Report, error) { + nm, err := c.waitForNetMap(ctx) + if err != nil { + return nil, err + } + + // When a prefix is routed by multiple nodes, we probe those nodes. + // There is no point in probing a router when it is the only choice. + // These nodes are referred to as High Availability (HA) routers. + var nodes []tailcfg.NodeView + for _, rs := range c.RoutersByPrefix() { + if len(rs) <= 1 { + continue + } + nodes = append(nodes, rs...) // Note: this introduces duplicates. + } + + // Sort by Node.ID and deduplicate to avoid double-probing. + slices.SortFunc(nodes, func(a, b tailcfg.NodeView) int { + return cmp.Compare(a.ID(), b.ID()) + }) + nodes = slices.CompactFunc(nodes, func(a, b tailcfg.NodeView) bool { + return a.ID() == b.ID() + }) + + // Each node should probe starting with the highest scoring node. + // We use rendezvous hashing to break ties in a consistent manner + // while still preventing swarming.
+ ss := traffic.ScoresFor(nm.SelfNode.ID(), nodes) + ss.SortNodes(nodes) + + return c.Probe(ctx, slices.Values(nodes), limit, timeout) +} + +// ping returns the result of a ping to the peer handling the given IP. +// It returns a [context.DeadlineExceeded] error if the peer doesn’t respond within the timeout. +func (c *Client) ping(ctx context.Context, ip netip.Addr, pingType tailcfg.PingType, timeout time.Duration) (*ipnstate.PingResult, error) { + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + ch := make(chan *ipnstate.PingResult, 1) + c.pinger.Ping(ip, pingType, 0, func(pr *ipnstate.PingResult) { + select { + case ch <- pr: + default: + } + }) + select { + case pr := <-ch: + return pr, nil + case <-ctx.Done(): + return nil, ctx.Err() + } +} + +func supportsIPVersions(n tailcfg.NodeView) (is4, is6 func(netip.Addr) bool) { + if !n.Valid() { + return nil, nil + } + for _, ip := range n.Addresses().All() { + addr := ip.Addr() + if addr.Is4() { + is4 = func(addr netip.Addr) bool { return addr.Is4() } + } else if addr.Is6() { + is6 = func(addr netip.Addr) bool { return addr.Is6() } + } + if is4 != nil && is6 != nil { + break + } + } + return is4, is6 +} + +func addrPicker(is4, is6 func(netip.Addr) bool) func(n tailcfg.NodeView) netip.Addr { + return func(n tailcfg.NodeView) netip.Addr { + var zero netip.Addr + for _, ip := range n.Addresses().All() { + // Find a compatible IP address.
+ addr := ip.Addr() + if is4 != nil && is4(addr) { + return addr + } + if is6 != nil && is6(addr) { + return addr + } + } + return zero + } +} diff --git a/ipn/routecheck/report.go b/ipn/routecheck/report.go new file mode 100644 index 000000000..f80df9179 --- /dev/null +++ b/ipn/routecheck/report.go @@ -0,0 +1,61 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +package routecheck + +import ( + "context" + "net/netip" + "time" + + "tailscale.com/tailcfg" + "tailscale.com/util/clientmetric" +) + +var ( + metricReport = clientmetric.NewCounter("routecheck_report") +) + +// Report returns the latest reachability report. +// Returns nil if a report isn’t available, which happens during initialization. +func (c *Client) Report() *Report { + metricReport.Add(1) + nm := c.nm.NetMapNoPeers() + if nm == nil { + return nil // The report wasn’t available. + } + + // TODO(sfllaw): Return the latest snapshot produced by background probing. + r, err := c.ProbeAllHARouters(context.TODO(), 5, DefaultTimeout) + if err != nil { + c.logf("reachability report error: %v", err) + } + return r +} + +// Report contains the result of a single routecheck. +type Report struct { + // Done is the time when the report was finished. + Done time.Time + + // Reachable is the set of nodes that were reachable from the current host + // when this report was compiled. Missing nodes may or may not be reachable. + Reachable map[tailcfg.NodeID]Node +} + +// Node represents a node in the reachability report. +type Node struct { + ID tailcfg.NodeID + + // Name is the FQDN of the node. + // It is also the MagicDNS name for the node. + // It has a trailing dot. + // e.g. "host.tail-scale.ts.net." + Name string + + // Addr is the IP address that was probed. + Addr netip.Addr + + // Routes are the subnets that the node will route. 
+ Routes []netip.Prefix +} diff --git a/ipn/routecheck/routecheck.go b/ipn/routecheck/routecheck.go new file mode 100644 index 000000000..8d9aba990 --- /dev/null +++ b/ipn/routecheck/routecheck.go @@ -0,0 +1,150 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +// Package routecheck performs status checks for routes from the current host. +package routecheck + +import ( + "context" + "errors" + "net/netip" + "sync" + + "tailscale.com/ipn/ipnstate" + "tailscale.com/tailcfg" + "tailscale.com/types/logger" + "tailscale.com/types/netmap" +) + +// Client generates Reports describing the result of both passive and active +// reachability probing. +type Client struct { + // Verbose enables verbose logging. + Verbose bool + + // Logf optionally specifies where to log to. + // If nil, log.Printf is used. + Logf logger.Logf + + // These elements are read-only after initialization. + nb NodeBackender + nm NetMapper + pinger Pinger + + // netMapAvailable is raised when the first network map is received + // after connecting to the control plane. + netMapAvailable sync.Cond +} + +// NetMapper is the interface that returns the current [netmap.NetworkMap]. +type NetMapper interface { + // NetMapNoPeers returns the latest cached network map received from + // controlclient WITHOUT a freshly-built Peers slice. + // + // On a tailnet with frequent peer churn the cached netmap's Peers slice + // can be stale relative to the live per-node-backend peers map; non-Peers + // fields (SelfNode, DNS, PacketFilter, capabilities, ...) are always + // current. Use this for any caller that does not need to iterate Peers, + // since it's O(1) regardless of tailnet size. + // + // Returns nil if no network map has been received yet. + NetMapNoPeers() *netmap.NetworkMap + + // NetMapWithPeers returns the latest network map with the Peers slice + // populated.
+ // + // Currently this is the same as [LocalBackend.NetMapNoPeers]: the cached + // netmap's Peers slice may be stale relative to the live per-node-backend + // peers map. A follow-up change will switch this method to return a + // freshly-built netmap with up-to-date Peers, at O(N) cost per call. + // Callers that genuinely need the up-to-date peer set should use this + // method (and document why) so the upcoming change reaches them. + // + // Returns nil if no network map has been received yet. + NetMapWithPeers() *netmap.NetworkMap +} + +// NodeBackender is the interface that returns the current [NodeBackend]. +type NodeBackender interface { + NodeBackend() NodeBackend +} + +// NodeBackend is an interface to query the current node and its peers. +// +// It is not a snapshot in time but is locked to a particular node. +type NodeBackend interface { + // Self returns the current node. + Self() tailcfg.NodeView + + // Peers returns all the current peers. + Peers() []tailcfg.NodeView +} + +// Pinger is the interface that wraps the [tailscale.com/ipn/ipnlocal.LocalBackend.Ping] method. +type Pinger interface { + Ping(ip netip.Addr, pingType tailcfg.PingType, size int, cb func(*ipnstate.PingResult)) +} + +// NewClient returns a client that probes its peers using this LocalBackend.
+func NewClient(logf logger.Logf, nb NodeBackender, nm NetMapper, pinger Pinger) (*Client, error) { + if nb == nil { + return nil, errors.New("NodeBackender must be set") + } + if nm == nil { + return nil, errors.New("NetMapper must be set") + } + if pinger == nil { + return nil, errors.New("Pinger must be set") + } + c := &Client{ + Logf: logf, + nb: nb, + nm: nm, + pinger: pinger, + } + c.netMapAvailable.L = new(sync.Mutex) + return c, nil +} + +// NetMapAvailable notifies the client that a network map has been received, +// waking any probes that are waiting for one. +// It does nothing if nm is nil, which means that the client has disconnected. +func (c *Client) NetMapAvailable(nm *netmap.NetworkMap) { + if nm == nil { + return // client disconnected + } + // Hold the lock while broadcasting so that this wakeup cannot land between + // the netmap check and the Wait in waitForNetMap, where it would be lost. + // NOTE(review): callers may hold LocalBackend.mu here, so confirm that + // NetMapNoPeers never needs that mutex while waitForNetMap holds this lock. + c.netMapAvailable.L.Lock() + defer c.netMapAvailable.L.Unlock() + c.netMapAvailable.Broadcast() +} + +// waitForNetMap blocks until a network map is available and returns it, +// or returns the error from ctx if ctx is done before one arrives. +func (c *Client) waitForNetMap(ctx context.Context) (*netmap.NetworkMap, error) { + cond := &c.netMapAvailable + + stopf := context.AfterFunc(ctx, func() { + // Lock cond to ensure that Broadcast is called after the Wait below. + cond.L.Lock() + defer cond.L.Unlock() + cond.Broadcast() + }) + defer stopf() + + cond.L.Lock() + defer cond.L.Unlock() + for { + nm := c.nm.NetMapNoPeers() + if nm != nil { + return nm, nil + } + cond.Wait() + if err := ctx.Err(); err != nil { + return nil, err + } + } +} diff --git a/ipn/routecheck/routecheck_test.go b/ipn/routecheck/routecheck_test.go new file mode 100644 index 000000000..562991a0f --- /dev/null +++ b/ipn/routecheck/routecheck_test.go @@ -0,0 +1,487 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +package routecheck_test + +import ( + "fmt" + "maps" + "net/netip" + "slices" + "testing" + "testing/synctest" + "time" + + gcmp "github.com/google/go-cmp/cmp" + gcmpopts "github.com/google/go-cmp/cmp/cmpopts" + + "tailscale.com/ipn/ipnstate" + "tailscale.com/ipn/routecheck" + "tailscale.com/net/tsaddr" + "tailscale.com/tailcfg" + "tailscale.com/types/netmap" + "tailscale.com/util/mak" + "tailscale.com/util/set" +) + +func TestReport(t *testing.T) { + for _, tc := range []struct { + name string + init bool // true before the netmap has been loaded + peers []tailcfg.NodeView + gone
[]tailcfg.NodeID // cannot ping these nodes + want []tailcfg.NodeID // Report.Reachable nodes + }{ + { + name: "before-netmap", + init: true, + want: nil, + }, + { + name: "no-peers", + peers: []tailcfg.NodeView{}, + want: []tailcfg.NodeID{}, + }, + { + name: "no-routers", + peers: []tailcfg.NodeView{ + makeNode(1, withName("peer1")), + }, + want: []tailcfg.NodeID{}, + }, + { + name: "no-choice", + peers: []tailcfg.NodeView{ + makeNode(11, withName("exit11"), withExitRoutes()), + makeNode(21, withName("subnet21"), + withRoutes(netip.MustParsePrefix("192.168.1.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0100::/48"))), + }, + want: []tailcfg.NodeID{}, + }, + { + name: "all-good", + peers: []tailcfg.NodeView{ + makeNode(11, withName("exit11"), withExitRoutes()), + makeNode(12, withName("exit12"), withExitRoutes()), + makeNode(21, withName("subnet21"), + withRoutes(netip.MustParsePrefix("192.168.1.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0100::/48"))), + makeNode(22, withName("subnet22"), + withRoutes(netip.MustParsePrefix("192.168.1.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0100::/48"))), + }, + want: []tailcfg.NodeID{11, 12, 21, 22}, + }, + { + name: "none-good", + peers: []tailcfg.NodeView{ + makeNode(11, withName("exit11"), withExitRoutes()), + makeNode(12, withName("exit12"), withExitRoutes()), + makeNode(21, withName("subnet21"), + withRoutes(netip.MustParsePrefix("192.168.1.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0100::/48"))), + makeNode(22, withName("subnet22"), + withRoutes(netip.MustParsePrefix("192.168.1.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0100::/48"))), + }, + gone: []tailcfg.NodeID{11, 12, 21, 22}, + want: []tailcfg.NodeID{}, + }, + { + name: "some-good", + peers: []tailcfg.NodeView{ + makeNode(11, withName("exit11"), withExitRoutes()), + makeNode(12, withName("exit12"), withExitRoutes()), + makeNode(21, withName("subnet21"), + withRoutes(netip.MustParsePrefix("192.168.1.0/24")), + 
withRoutes(netip.MustParsePrefix("2002:c000:0100::/48"))), + makeNode(22, withName("subnet22"), + withRoutes(netip.MustParsePrefix("192.168.1.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0100::/48"))), + }, + gone: []tailcfg.NodeID{11, 22}, + want: []tailcfg.NodeID{12, 21}, + }, + } { + makeDB := func(nodes []tailcfg.NodeView) map[tailcfg.NodeID]routecheck.Node { + if len(nodes) == 0 { + return nil + } + db := make(map[tailcfg.NodeID]routecheck.Node) + for _, n := range tc.peers { + db[n.ID()] = routecheck.Node{ + ID: n.ID(), + Name: n.Name(), + Addr: n.Addresses().At(0).Addr(), + Routes: n.AllowedIPs().AsSlice()[2:], + } + } + return db + } + cmpDiff := func(want, got any) string { + return gcmp.Diff(want, got, + gcmpopts.EquateComparable(netip.Addr{}, netip.Prefix{})) + } + + t.Run(tc.name, func(t *testing.T) { + synctest.Test(t, func(t *testing.T) { + // The backend is initialized without a NetMap. + b := newStubBackend(tailcfg.NodeView{}, nil, withGone(tc.gone...)) + if !tc.init { + self := makeNode(99, withName("self")) + b = newStubBackend(self, tc.peers, withGone(tc.gone...)) + } + c, err := routecheck.NewClient(t.Logf, b, b, b) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + got := c.Report() + now := time.Now() // synctest will freeze time. 
+ + var want *routecheck.Report + peers := makeDB(tc.peers) + if !tc.init { + want = &routecheck.Report{ + Done: now, + } + for _, nid := range tc.want { + mak.Set(&want.Reachable, nid, peers[nid]) + } + } + + if diff := cmpDiff(want, got); diff != "" { + t.Errorf("-want +got:\n%s", diff) + } + }) + }) + } +} + +func TestRoutersByPrefix(t *testing.T) { + type routersByPrefix map[netip.Prefix][]tailcfg.NodeID + simplify := func(rs routecheck.RoutersByPrefix) routersByPrefix { + out := make(routersByPrefix, len(rs)) + for p, ns := range rs { + for _, n := range ns { + out[p] = append(out[p], n.ID()) + } + slices.Sort(out[p]) + } + return out + } + + for _, tc := range []struct { + name string + peers []tailcfg.NodeView + want routersByPrefix + }{ + { + name: "no-peers", + peers: []tailcfg.NodeView{}, + want: routersByPrefix{}, + }, + { + name: "no-routers", + peers: []tailcfg.NodeView{ + makeNode(1, withName("peer1")), + }, + want: routersByPrefix{}, + }, + { + name: "one-exit-node", + peers: []tailcfg.NodeView{ + makeNode(1, withName("peer1")), + makeNode(11, withName("exit11"), withExitRoutes()), + }, + want: routersByPrefix{ + netip.MustParsePrefix("0.0.0.0/0"): {11}, + netip.MustParsePrefix("::/0"): {11}, + }, + }, + { + name: "overlapping-exit-nodes", + peers: []tailcfg.NodeView{ + makeNode(1, withName("peer1")), + makeNode(11, withName("exit11"), withExitRoutes()), + makeNode(12, withName("exit12"), withExitRoutes()), + }, + want: routersByPrefix{ + netip.MustParsePrefix("0.0.0.0/0"): {11, 12}, + netip.MustParsePrefix("::/0"): {11, 12}, + }, + }, + { + name: "one-subnet-router", + peers: []tailcfg.NodeView{ + makeNode(1, withName("peer1")), + makeNode(21, withName("subnet21"), + withRoutes(netip.MustParsePrefix("192.168.1.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0100::/48"))), + }, + want: routersByPrefix{ + netip.MustParsePrefix("192.168.1.0/24"): {21}, + netip.MustParsePrefix("2002:c000:0100::/48"): {21}, + }, + }, + { + name: 
"overlapping-subnet-routers", + peers: []tailcfg.NodeView{ + makeNode(1, withName("peer1")), + makeNode(21, withName("subnet21"), + withRoutes(netip.MustParsePrefix("192.168.1.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0100::/48"))), + makeNode(22, withName("subnet22"), + withRoutes(netip.MustParsePrefix("192.168.1.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0100::/48"))), + }, + want: routersByPrefix{ + netip.MustParsePrefix("192.168.1.0/24"): {21, 22}, + netip.MustParsePrefix("2002:c000:0100::/48"): {21, 22}, + }, + }, + { + name: "disjoint-subnet-routers", + peers: []tailcfg.NodeView{ + makeNode(1, withName("peer1")), + makeNode(21, withName("subnet21"), + withRoutes(netip.MustParsePrefix("192.168.1.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0100::/48"))), + makeNode(22, withName("subnet22"), + withRoutes(netip.MustParsePrefix("192.168.2.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0200::/48"))), + }, + want: routersByPrefix{ + netip.MustParsePrefix("192.168.1.0/24"): {21}, + netip.MustParsePrefix("2002:c000:0100::/48"): {21}, + netip.MustParsePrefix("192.168.2.0/24"): {22}, + netip.MustParsePrefix("2002:c000:0200::/48"): {22}, + }, + }, + { + name: "multiple-routes", + peers: []tailcfg.NodeView{ + makeNode(1, withName("peer1")), + makeNode(21, withName("subnet21"), + withRoutes(netip.MustParsePrefix("192.168.1.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0100::/48")), + withRoutes(netip.MustParsePrefix("192.168.2.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0200::/48"))), + makeNode(22, withName("subnet22"), + withRoutes(netip.MustParsePrefix("192.168.2.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0200::/48")), + withRoutes(netip.MustParsePrefix("192.168.3.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0300::/48"))), + makeNode(23, withName("subnet23"), + withRoutes(netip.MustParsePrefix("192.168.3.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0300::/48")), + 
withRoutes(netip.MustParsePrefix("192.168.4.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0400::/48"))), + }, + want: routersByPrefix{ + netip.MustParsePrefix("192.168.1.0/24"): {21}, + netip.MustParsePrefix("2002:c000:0100::/48"): {21}, + netip.MustParsePrefix("192.168.2.0/24"): {21, 22}, + netip.MustParsePrefix("2002:c000:0200::/48"): {21, 22}, + netip.MustParsePrefix("192.168.3.0/24"): {22, 23}, + netip.MustParsePrefix("2002:c000:0300::/48"): {22, 23}, + netip.MustParsePrefix("192.168.4.0/24"): {23}, + netip.MustParsePrefix("2002:c000:0400::/48"): {23}, + }, + }, + { + name: "both-exit-nodes-and-routers", + peers: []tailcfg.NodeView{ + makeNode(1, withName("peer1")), + makeNode(11, withName("exit11"), withExitRoutes()), + makeNode(12, withName("exit12"), withExitRoutes()), + makeNode(21, withName("subnet21"), + withRoutes(netip.MustParsePrefix("192.168.1.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0100::/48")), + withRoutes(netip.MustParsePrefix("192.168.2.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0200::/48"))), + makeNode(22, withName("subnet22"), + withRoutes(netip.MustParsePrefix("192.168.2.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0200::/48")), + withRoutes(netip.MustParsePrefix("192.168.3.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0300::/48"))), + }, + want: routersByPrefix{ + netip.MustParsePrefix("0.0.0.0/0"): {11, 12}, + netip.MustParsePrefix("::/0"): {11, 12}, + netip.MustParsePrefix("192.168.1.0/24"): {21}, + netip.MustParsePrefix("2002:c000:0100::/48"): {21}, + netip.MustParsePrefix("192.168.2.0/24"): {21, 22}, + netip.MustParsePrefix("2002:c000:0200::/48"): {21, 22}, + netip.MustParsePrefix("192.168.3.0/24"): {22}, + netip.MustParsePrefix("2002:c000:0300::/48"): {22}, + }, + }, + { + name: "mixed-nodes", + peers: []tailcfg.NodeView{ + makeNode(1, withName("peer1")), + makeNode(31, withName("router31"), + withExitRoutes(), + withRoutes(netip.MustParsePrefix("192.168.1.0/24")), + 
withRoutes(netip.MustParsePrefix("2002:c000:0100::/48")), + withRoutes(netip.MustParsePrefix("192.168.2.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0200::/48"))), + makeNode(32, withName("router32"), + withExitRoutes(), + withRoutes(netip.MustParsePrefix("192.168.2.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0200::/48")), + withRoutes(netip.MustParsePrefix("192.168.3.0/24")), + withRoutes(netip.MustParsePrefix("2002:c000:0300::/48"))), + }, + want: routersByPrefix{ + netip.MustParsePrefix("0.0.0.0/0"): {31, 32}, + netip.MustParsePrefix("::/0"): {31, 32}, + netip.MustParsePrefix("192.168.1.0/24"): {31}, + netip.MustParsePrefix("2002:c000:0100::/48"): {31}, + netip.MustParsePrefix("192.168.2.0/24"): {31, 32}, + netip.MustParsePrefix("2002:c000:0200::/48"): {31, 32}, + netip.MustParsePrefix("192.168.3.0/24"): {32}, + netip.MustParsePrefix("2002:c000:0300::/48"): {32}, + }, + }, + } { + t.Run(tc.name, func(t *testing.T) { + self := makeNode(99, withName("self")) + b := newStubBackend(self, tc.peers) + c, err := routecheck.NewClient(t.Logf, b, b, b) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + got := simplify(c.RoutersByPrefix()) + if !maps.EqualFunc(got, tc.want, slices.Equal) { + t.Errorf("got %+v, want %+v", got, tc.want) + } + }) + } + +} + +type nodeOptFunc func(*tailcfg.Node) + +func makeNode(id tailcfg.NodeID, opts ...nodeOptFunc) tailcfg.NodeView { + addresses := []netip.Prefix{ + netip.MustParsePrefix(fmt.Sprintf("192.168.0.%d/32", id)), + netip.MustParsePrefix(fmt.Sprintf("fd7a:115c:a1e0::%d/128", id)), + } + node := &tailcfg.Node{ + ID: id, + StableID: tailcfg.StableNodeID(fmt.Sprintf("stable%d", id)), + Name: fmt.Sprintf("node%d", id), + Online: new(true), + MachineAuthorized: true, + HomeDERP: int(id), + Addresses: addresses, + AllowedIPs: addresses, + } + for _, opt := range opts { + opt(node) + } + return node.View() +} + +func withExitRoutes() nodeOptFunc { + return withRoutes(tsaddr.ExitRoutes()...) 
+} + +func withName(name string) nodeOptFunc { + return func(n *tailcfg.Node) { + n.Name = name + } +} + +func withRoutes(routes ...netip.Prefix) nodeOptFunc { + return func(n *tailcfg.Node) { + n.AllowedIPs = append(n.AllowedIPs, routes...) + } +} + +var _ routecheck.NodeBackender = &stubBackend{} +var _ routecheck.NodeBackend = &stubBackend{} +var _ routecheck.NetMapper = &stubBackend{} +var _ routecheck.Pinger = &stubBackend{} + +type stubBackend struct { + self tailcfg.NodeView + peers []tailcfg.NodeView + gone set.Set[tailcfg.NodeID] +} + +type backendOptFunc func(*stubBackend) + +func newStubBackend(self tailcfg.NodeView, peers []tailcfg.NodeView, opts ...backendOptFunc) *stubBackend { + b := &stubBackend{ + self: self, + peers: slices.Clone(peers), + } + for _, opt := range opts { + opt(b) + } + return b +} + +func (b *stubBackend) NetMapNoPeers() *netmap.NetworkMap { + if !b.self.Valid() { + return nil + } + return &netmap.NetworkMap{ + SelfNode: b.self, + Peers: nil, // No peers. + } +} + +func (b *stubBackend) NetMapWithPeers() *netmap.NetworkMap { + nm := b.NetMapNoPeers() + if nm != nil { + nm.Peers = b.peers + } + return nm +} + +func (nb *stubBackend) NodeBackend() routecheck.NodeBackend { + return nb +} + +func (nb *stubBackend) Self() tailcfg.NodeView { + return nb.self +} + +func (nb *stubBackend) Peers() []tailcfg.NodeView { + return nb.peers +} + +func (b *stubBackend) Ping(ip netip.Addr, pingType tailcfg.PingType, size int, cb func(*ipnstate.PingResult)) { + // Does the IP address match one of the peers’ addresses? 
+ for _, n := range b.peers { + for _, a := range n.Addresses().All() { + if a.Addr() != ip { + continue + } + + if b.gone.Contains(n.ID()) { + continue + } + + go cb(&ipnstate.PingResult{ + IP: ip.String(), + NodeIP: ip.String(), + NodeName: n.Name(), + LatencySeconds: 0.01, + }) + } + } +} + +func withGone(gone ...tailcfg.NodeID) backendOptFunc { + return func(b *stubBackend) { + b.gone = set.SetOf(gone) + } + +} diff --git a/ipn/routecheck/routes.go b/ipn/routecheck/routes.go new file mode 100644 index 000000000..b14d67db8 --- /dev/null +++ b/ipn/routecheck/routes.go @@ -0,0 +1,51 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +package routecheck + +import ( + "net/netip" + + "tailscale.com/tailcfg" + "tailscale.com/util/mak" +) + +// RoutersByPrefix represents a map of nodes grouped by the subnet that they route. +type RoutersByPrefix map[netip.Prefix][]tailcfg.NodeView + +// RoutersByPrefix returns a map of nodes grouped by the subnet that they route. +// Nodes that route for /0 prefixes are exit nodes, their subnet is the Internet. +// The result omits any prefix that is one of a node’s local addresses. +// +// Note: Fallback routes are not supported by design. If a subnet prefix +// contained within another more general prefix has no reachable routers, +// traffic is still sent to one of those unreachable routers. +// Routers for the general prefix aren’t candidates. See tailscale/tailscale#18550. +func (c *Client) RoutersByPrefix() RoutersByPrefix { + var routers RoutersByPrefix + for _, n := range c.nb.NodeBackend().Peers() { + for _, pfx := range routes(n) { + mak.Set(&routers, pfx, append(routers[pfx], n)) + } + } + return routers +} + +// Routes returns a slice of subnets that the given node will route. +// If the node is an exit node, the result will contain at least one /0 prefix. +// If the node is a subnet router, the result will contain a smaller prefix. 
+// The result omits any prefix that is one of the node’s local addresses.
+func routes(n tailcfg.NodeView) []netip.Prefix {
+	var kept []netip.Prefix
+nextPrefix:
+	for _, candidate := range n.AllowedIPs().All() {
+		for _, own := range n.Addresses().All() {
+			if own == candidate {
+				// Routers never forward their own local addresses.
+				continue nextPrefix
+			}
+		}
+		kept = append(kept, candidate)
+	}
+	return kept
+}