mirror of
https://github.com/tailscale/tailscale.git
synced 2026-04-03 22:25:27 -04:00
ipn/routecheck: introduce new package for checking peer reachability
The routecheck package parallels the netcheck package, where the former checks routes and routers while the latter checks networks. Like netcheck, it compiles reports for other systems to consume. Historically, the client has never known whether a peer is actually reachable. Most of the time this doesn’t matter, since the client will want to establish a WireGuard tunnel to any given destination. However, if the client needs to choose between two or more nodes, then it should only choose a node that it can reach. Suggested exit nodes are one such example, where the client filters out any nodes that aren’t connected to the control plane. Sometimes an exit node will get disconnected from the control plane: when the network between the two is unreliable or when the exit node is too busy to keep its control connection alive. In these cases, Control disables the Node.Online flag for the exit node and broadcasts this across the tailnet. Arguably, the client should never have relied on this flag, since it only makes sense in the admin console. This PR implements an initial routecheck client that can probe every node that your client knows about. You should not ping scan your visible tailnet; this method is for debugging only. Updates #17366 Updates tailscale/corp#33033 Signed-off-by: Simon Law <sfllaw@tailscale.com>
This commit is contained in:
@@ -1570,6 +1570,11 @@ func (b *LocalBackend) PeerCaps(src netip.Addr) tailcfg.PeerCapMap {
|
||||
return b.currentNode().PeerCaps(src)
|
||||
}
|
||||
|
||||
// Peers returns all the current peers in an undefined order.
// It delegates to the current node backend's view of the netmap peers.
func (b *LocalBackend) Peers() []tailcfg.NodeView {
	return b.currentNode().Peers()
}
|
||||
|
||||
func (b *LocalBackend) GetFilterForTest() *filter.Filter {
|
||||
testenv.AssertInTest()
|
||||
nb := b.currentNode()
|
||||
|
||||
22
ipn/routecheck/log.go
Normal file
22
ipn/routecheck/log.go
Normal file
@@ -0,0 +1,22 @@
|
||||
// Copyright (c) Tailscale Inc & contributors
|
||||
// SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
package routecheck
|
||||
|
||||
import (
|
||||
"log"
|
||||
)
|
||||
|
||||
func (c *Client) logf(format string, a ...any) {
|
||||
if c.Logf != nil {
|
||||
c.Logf(format, a...)
|
||||
} else {
|
||||
log.Printf(format, a...)
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Client) vlogf(format string, a ...any) {
|
||||
if c.Verbose || debugRoutecheck() {
|
||||
c.logf(format, a...)
|
||||
}
|
||||
}
|
||||
252
ipn/routecheck/routecheck.go
Normal file
252
ipn/routecheck/routecheck.go
Normal file
@@ -0,0 +1,252 @@
|
||||
// Copyright (c) Tailscale Inc & contributors
|
||||
// SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
// Package routecheck performs status checks for routes from the current host.
|
||||
package routecheck
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"context"
|
||||
"errors"
|
||||
"iter"
|
||||
"math/rand/v2"
|
||||
"net/netip"
|
||||
"slices"
|
||||
"time"
|
||||
|
||||
"golang.org/x/sync/errgroup"
|
||||
"tailscale.com/envknob"
|
||||
"tailscale.com/ipn"
|
||||
"tailscale.com/ipn/ipnstate"
|
||||
"tailscale.com/syncs"
|
||||
"tailscale.com/tailcfg"
|
||||
"tailscale.com/types/logger"
|
||||
"tailscale.com/types/netmap"
|
||||
"tailscale.com/util/mak"
|
||||
"tailscale.com/util/set"
|
||||
)
|
||||
|
||||
// Debugging and experimentation tweakables.
var (
	// debugRoutecheck reports whether the TS_DEBUG_ROUTECHECK
	// environment knob is set; it forces verbose logging in vlogf.
	debugRoutecheck = envknob.RegisterBool("TS_DEBUG_ROUTECHECK")
)
|
||||
|
||||
// Report contains the result of a single routecheck.
type Report struct {
	// Now is the time when the report was finished.
	Now time.Time

	// reachable is the set of nodes that were reachable from the current host
	// when this report was compiled. Missing nodes may or may not be reachable.
	reachable set.Set[tailcfg.NodeID]
}
|
||||
|
||||
// Client generates Reports describing the result of both passive and active
// reachability probing.
type Client struct {
	// Verbose enables verbose logging.
	Verbose bool

	// Logf optionally specifies where to log to.
	// If nil, log.Printf is used.
	Logf logger.Logf

	// These elements are read-only after initialization.

	// b is the backend used to read the netmap, enumerate peers,
	// and send probes.
	b LocalBackend
}
|
||||
|
||||
// LocalBackend describes the subset of backend operations that a
// Client needs: reading the network map, listing peers, sending pings,
// and watching for IPN notifications.
type LocalBackend interface {
	NetMap() *netmap.NetworkMap
	Peers() []tailcfg.NodeView
	Ping(ctx context.Context, ip netip.Addr, pingType tailcfg.PingType, size int) (*ipnstate.PingResult, error)
	WatchNotifications(ctx context.Context, mask ipn.NotifyWatchOpt, onWatchAdded func(), fn func(roNotify *ipn.Notify) (keepGoing bool))
}
|
||||
|
||||
// NewClient returns a client that probes using this [ipnlocal.LocalBackend].
|
||||
func NewClient(b LocalBackend) (*Client, error) {
|
||||
if b == nil {
|
||||
return nil, errors.New("LocalBackend must be set")
|
||||
}
|
||||
return &Client{b: b}, nil
|
||||
}
|
||||
|
||||
// Probe actively probes the sequence of nodes and returns a reachability [Report].
|
||||
// If limit is positive, it limits the number of concurrent active probes;
|
||||
// a limit of zero will ping every node at once.
|
||||
// This function tries both the IPv4 and IPv6 addresses
|
||||
func (c *Client) Probe(ctx context.Context, nodes iter.Seq[tailcfg.NodeView], limit int) (*Report, error) {
|
||||
var canIPv4, canIPv6 bool
|
||||
for _, ip := range c.b.NetMap().SelfNode.Addresses().All() {
|
||||
addr := ip.Addr()
|
||||
if addr.Is4() {
|
||||
canIPv4 = true
|
||||
} else if addr.Is6() {
|
||||
canIPv6 = true
|
||||
}
|
||||
}
|
||||
|
||||
g, ctx := errgroup.WithContext(ctx)
|
||||
if limit > 0 {
|
||||
g.SetLimit(limit)
|
||||
}
|
||||
|
||||
var (
|
||||
mu syncs.Mutex
|
||||
r = &Report{
|
||||
reachable: make(set.Set[tailcfg.NodeID]),
|
||||
}
|
||||
)
|
||||
|
||||
for n := range nodes {
|
||||
// Ping one of the tailnet addresses.
|
||||
for _, ip := range n.Addresses().All() {
|
||||
// Skip this probe if there is an IP version mismatch.
|
||||
addr := ip.Addr()
|
||||
if addr.Is4() && !canIPv4 {
|
||||
continue
|
||||
}
|
||||
if addr.Is6() && !canIPv6 {
|
||||
continue
|
||||
}
|
||||
|
||||
g.Go(func() error {
|
||||
nid := n.ID()
|
||||
pong, err := c.b.Ping(ctx, addr, tailcfg.PingTSMP, 0)
|
||||
if err != nil {
|
||||
// Returning an error would cancel the errgroup.
|
||||
c.vlogf("ping %s (%s): error: %v", addr, nid, err)
|
||||
} else {
|
||||
c.vlogf("ping %s (%s): result: %f ms (err: %v)", addr, nid, pong.LatencySeconds*1000, pong.Err)
|
||||
}
|
||||
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
r.reachable.Add(nid)
|
||||
return nil
|
||||
})
|
||||
break
|
||||
}
|
||||
}
|
||||
g.Wait()
|
||||
r.Now = time.Now()
|
||||
return r, nil
|
||||
}
|
||||
|
||||
// ProbeAllPeers actively probes all peers in parallel and returns a [Report]
|
||||
// that identifies which nodes are reachable. If limit is positive, it limits
|
||||
// the number of concurrent active probes; a limit of zero will ping every
|
||||
// candidate at once.
|
||||
func (c *Client) ProbeAllPeers(ctx context.Context, limit int) (*Report, error) {
|
||||
nm := c.waitForInitialNetMap(ctx)
|
||||
return c.Probe(ctx, slices.Values(nm.Peers), limit)
|
||||
}
|
||||
|
||||
// ProbeAllHARouters actively probes all High Availability routers in parallel
|
||||
// and returns a [Report] that identifies which of these routers are reachable.
|
||||
// If limit is positive, it limits the number of concurrent active probes;
|
||||
// a limit of zero will ping every candidate at once.
|
||||
func (c *Client) ProbeAllHARouters(ctx context.Context, limit int) (*Report, error) {
|
||||
nm := c.waitForInitialNetMap(ctx)
|
||||
|
||||
// When a prefix is routed by multiple nodes, we probe those nodes.
|
||||
// There is no point to probing a router when it is the only choice.
|
||||
// These nodes are referred to a High Availability (HA) routers.
|
||||
var nodes []tailcfg.NodeView
|
||||
for _, rs := range c.RoutersByPrefix() {
|
||||
if len(rs) <= 1 {
|
||||
continue
|
||||
}
|
||||
nodes = append(nodes, rs...) // Note: this introduces duplicates.
|
||||
}
|
||||
|
||||
// Sort by Node.ID and deduplicate to avoid double-probing.
|
||||
slices.SortFunc(nodes, func(a, b tailcfg.NodeView) int {
|
||||
return cmp.Compare(a.ID(), b.ID())
|
||||
})
|
||||
slices.CompactFunc(nodes, func(a, b tailcfg.NodeView) bool {
|
||||
return a.ID() == b.ID()
|
||||
})
|
||||
|
||||
// To prevent swarming, each node should probe in a different order.
|
||||
seed := uint64(nm.SelfNode.ID())
|
||||
rnd := rand.New(rand.NewPCG(seed, seed))
|
||||
rnd.Shuffle(len(nodes), func(i, j int) {
|
||||
nodes[i], nodes[j] = nodes[j], nodes[i]
|
||||
})
|
||||
|
||||
return c.Probe(ctx, slices.Values(nodes), limit)
|
||||
}
|
||||
|
||||
// waitForInitialNetMap returns the current [netmap.NetworkMap], if present.
// If the network map is missing because the client just started,
// this function will wait for the control plane to send it before returning.
//
// NOTE(review): it looks like this may return nil if ctx is done before
// the initial netmap arrives — callers should tolerate that; confirm.
func (c *Client) waitForInitialNetMap(ctx context.Context) *netmap.NetworkMap {
	nm := c.b.NetMap()
	if nm != nil {
		return nm
	}

	// Wait for the initial NetworkMap to arrive:
	c.b.WatchNotifications(ctx, ipn.NotifyInitialNetMap, nil, func(n *ipn.Notify) (keepGoing bool) {
		nm = n.NetMap
		return nm == nil // Keep going until nm contains a network map.
	})
	return nm
}
|
||||
|
||||
// Routers returns a sequence of nodes that are routers, which will advertise
|
||||
// more [tailcfg.Node.AllowedIPs] than the node’s own [tailcfg.Node.Addresses].
|
||||
func (c *Client) Routers() iter.Seq[tailcfg.NodeView] {
|
||||
return func(yield func(tailcfg.NodeView) bool) {
|
||||
for _, n := range c.b.Peers() {
|
||||
AllowedIPs:
|
||||
for _, pfx := range n.AllowedIPs().All() {
|
||||
// Routers never forward their own local addresses.
|
||||
for _, addr := range n.Addresses().All() {
|
||||
if pfx == addr {
|
||||
continue AllowedIPs
|
||||
}
|
||||
}
|
||||
if !yield(n) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// RoutersByPrefix returns a map of nodes that route for a particular subnet.
|
||||
// Nodes that route for /0 prefixes are exit nodes, their subnet is the Internet.
|
||||
func (c *Client) RoutersByPrefix() map[netip.Prefix][]tailcfg.NodeView {
|
||||
var routers map[netip.Prefix][]tailcfg.NodeView
|
||||
for _, n := range c.b.Peers() {
|
||||
for _, pfx := range n.AllowedIPs().All() {
|
||||
mak.Set(&routers, pfx, append(routers[pfx], n))
|
||||
}
|
||||
continue
|
||||
}
|
||||
return routers
|
||||
}
|
||||
|
||||
// routes returns a slice of subnets that the given node will route.
// If the node is an exit node, the result will contain at least one /0 prefix.
// If the node is a subnet router, the result will contain a smaller prefix.
// The result omits any prefix that is one of the node’s local addresses.
func routes(n tailcfg.NodeView) []netip.Prefix {
	var routes []netip.Prefix
AllowedIPs:
	for _, pfx := range n.AllowedIPs().All() {
		// Routers never forward their own local addresses.
		for _, addr := range n.Addresses().All() {
			if pfx == addr {
				continue AllowedIPs
			}
		}
		routes = append(routes, pfx)
	}
	return routes
}
|
||||
|
||||
// earlyExit is used to exit early out of a [ipnext.NodeBackend.AppendMatchingPeers] loop.
// It is a sentinel type used by panic and recover.
type earlyExit struct{}
|
||||
Reference in New Issue
Block a user