Files
tailscale/wgengine/userspace_test.go
Claus Lensbøl be7cce74ba wgengine/userspace: do not fall back to old key on tsmpLearned mismatch (#19575)
The mismatch behaviour of falling back to a previous key could end up
breaking connections when the netmap update took longer than the 2
seconds allowed in controlClient.auto for netmap updates, or if the
controlClient context was canceled. This could end up breaking
legitimate updates to the netmap for disco keys coming from control.

Instead, log the event and let the connection be reset to that of the
new key, as that is the safer behavior.

Issue found by @bradfitz.

Updates #19574

Signed-off-by: Claus Lensbøl <claus@tailscale.com>
2026-04-29 13:23:04 -04:00

729 lines
18 KiB
Go

// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
package wgengine
import (
"fmt"
"math/rand"
"net/netip"
"os"
"reflect"
"runtime"
"testing"
"go4.org/mem"
"tailscale.com/cmd/testwrapper/flakytest"
"tailscale.com/control/controlknobs"
"tailscale.com/envknob"
"tailscale.com/health"
"tailscale.com/net/dns"
"tailscale.com/net/netaddr"
"tailscale.com/net/tstun"
"tailscale.com/tailcfg"
"tailscale.com/tstest"
"tailscale.com/tstime/mono"
"tailscale.com/types/key"
"tailscale.com/types/netmap"
"tailscale.com/types/opt"
"tailscale.com/types/views"
"tailscale.com/util/eventbus/eventbustest"
"tailscale.com/util/usermetric"
"tailscale.com/wgengine/router"
"tailscale.com/wgengine/wgcfg"
)
// TestNoteReceiveActivity checks noteRecvActivity's three states for a
// peer key: untracked (ignored entirely), tracked but not trimmed
// (timestamp refreshed, no reconfig), and tracked+trimmed (timestamp
// refreshed and a reconfig triggered).
func TestNoteReceiveActivity(t *testing.T) {
	fakeNow := mono.Time(123456)
	var logs tstest.MemLogger
	reconfigCh := make(chan bool, 1)
	sawReconfig := func() bool {
		select {
		case <-reconfigCh:
			return true
		default:
			return false
		}
	}
	e := &userspaceEngine{
		timeNow:               func() mono.Time { return fakeNow },
		recvActivityAt:        map[key.NodePublic]mono.Time{},
		logf:                  logs.Logf,
		tundev:                new(tstun.Wrapper),
		testMaybeReconfigHook: func() { reconfigCh <- true },
		trimmedNodes:          map[key.NodePublic]bool{},
	}
	activity := e.recvActivityAt
	peer := key.NewNode().Public()

	// An untracked key must be ignored: no map growth, no log output.
	e.noteRecvActivity(peer)
	if n := len(activity); n != 0 {
		t.Fatalf("unexpected growth in map: now has %d keys; want 0", n)
	}
	if logs.Len() != 0 {
		t.Fatalf("unexpected log write (and thus activity): %s", logs.Bytes())
	}

	// Tracked but not trimmed: timestamp updates, but no reconfig fires.
	activity[peer] = 0
	e.noteRecvActivity(peer)
	if n := len(activity); n != 1 {
		t.Fatalf("unexpected growth in map: now has %d keys; want 1", n)
	}
	if got := activity[peer]; got != fakeNow {
		t.Fatalf("time in map = %v; want %v", got, fakeNow)
	}
	if sawReconfig() {
		t.Fatalf("unexpected reconfig")
	}

	// Tracked and trimmed: activity should also trigger a reconfig.
	e.trimmedNodes[peer] = true
	e.noteRecvActivity(peer)
	if n := len(activity); n != 1 {
		t.Fatalf("unexpected growth in map: now has %d keys; want 1", n)
	}
	if got := activity[peer]; got != fakeNow {
		t.Fatalf("time in map = %v; want %v", got, fakeNow)
	}
	if !sawReconfig() {
		t.Fatalf("didn't get expected reconfig")
	}
}
// nodeViews converts a slice of *tailcfg.Node into the corresponding
// slice of read-only tailcfg.NodeView values.
func nodeViews(v []*tailcfg.Node) []tailcfg.NodeView {
	out := make([]tailcfg.NodeView, 0, len(v))
	for _, node := range v {
		out = append(out, node.View())
	}
	return out
}
// TestUserspaceEngineReconfig verifies that SetNetworkMap+Reconfig
// rebuild the engine's lazy-peer bookkeeping for each new peer set:
// after a reconfig, the configured peer appears in recvActivityAt with
// zero recorded activity and is marked in trimmedNodes.
func TestUserspaceEngineReconfig(t *testing.T) {
	bus := eventbustest.NewBus(t)
	ht := health.NewTracker(bus)
	reg := new(usermetric.Registry)
	e, err := NewFakeUserspaceEngine(t.Logf, 0, ht, reg, bus)
	if err != nil {
		t.Fatal(err)
	}
	t.Cleanup(e.Close)
	ue := e.(*userspaceEngine)
	routerCfg := &router.Config{}
	// Run the same flow twice with different peer keys to verify the
	// state is rebuilt (not merely appended to) per peer set.
	for _, nodeHex := range []string{
		"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
		"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb",
	} {
		nm := &netmap.NetworkMap{
			Peers: nodeViews([]*tailcfg.Node{
				{
					ID:  1,
					Key: nkFromHex(nodeHex),
				},
			}),
		}
		nk, err := key.ParseNodePublicUntyped(mem.S(nodeHex))
		if err != nil {
			t.Fatal(err)
		}
		cfg := &wgcfg.Config{
			Peers: []wgcfg.Peer{
				{
					PublicKey: nk,
					AllowedIPs: []netip.Prefix{
						netip.PrefixFrom(netaddr.IPv4(100, 100, 99, 1), 32),
					},
				},
			},
		}
		e.SetNetworkMap(nm)
		err = e.Reconfig(cfg, routerCfg, &dns.Config{})
		if err != nil {
			t.Fatal(err)
		}
		// The peer should be tracked with no receive activity yet
		// (zero mono.Time) and only this iteration's key present.
		wantRecvAt := map[key.NodePublic]mono.Time{
			nkFromHex(nodeHex): 0,
		}
		if got := ue.recvActivityAt; !reflect.DeepEqual(got, wantRecvAt) {
			t.Errorf("wrong recvActivityAt\n got: %v\nwant: %v\n", got, wantRecvAt)
		}
		// The peer should also start out trimmed (see
		// TestNoteReceiveActivity for how activity un-trims it).
		wantTrimmedNodes := map[key.NodePublic]bool{
			nkFromHex(nodeHex): true,
		}
		if got := ue.trimmedNodes; !reflect.DeepEqual(got, wantTrimmedNodes) {
			t.Errorf("wrong wantTrimmedNodes\n got: %v\nwant: %v\n", got, wantTrimmedNodes)
		}
	}
}
// TestUserspaceEngineTSMPLearned exercises the interaction between
// disco keys patched in via PatchDiscoKey (keys learned from TSMP
// advertisements) and Reconfig's disco-change detection, observed via
// testDiscoChangedHook.
func TestUserspaceEngineTSMPLearned(t *testing.T) {
	bus := eventbustest.NewBus(t)
	ht := health.NewTracker(bus)
	reg := new(usermetric.Registry)
	e, err := NewFakeUserspaceEngine(t.Logf, 0, ht, reg, bus)
	if err != nil {
		t.Fatal(err)
	}
	t.Cleanup(e.Close)
	ue := e.(*userspaceEngine)
	// Capture the set of peers whose disco key Reconfig considered
	// changed; the hook delivers one map per Reconfig call.
	discoChangedChan := make(chan map[key.NodePublic]bool, 1)
	ue.testDiscoChangedHook = func(m map[key.NodePublic]bool) {
		discoChangedChan <- m
	}
	routerCfg := &router.Config{}
	// tsmp: whether the new disco key is pre-patched via PatchDiscoKey
	// before Reconfig. inMap: whether the peer is then expected in the
	// disco-changed set. Note iterations share engine state, so each
	// case's expectation depends on the previous iteration's config.
	keyChanges := []struct {
		tsmp  bool
		inMap bool
	}{
		{tsmp: false, inMap: false},
		{tsmp: true, inMap: false},
		{tsmp: false, inMap: true},
	}
	nkHex := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
	for _, change := range keyChanges {
		// The netmap carries oldDisco while the wgcfg carries
		// newDisco, so the peer presents a fresh disco key each time.
		oldDisco := key.NewDisco()
		nm := &netmap.NetworkMap{
			Peers: nodeViews([]*tailcfg.Node{
				{
					ID:       1,
					Key:      nkFromHex(nkHex),
					DiscoKey: oldDisco.Public(),
				},
			}),
		}
		nk, err := key.ParseNodePublicUntyped(mem.S(nkHex))
		if err != nil {
			t.Fatal(err)
		}
		e.SetNetworkMap(nm)
		newDisco := key.NewDisco()
		cfg := &wgcfg.Config{
			Peers: []wgcfg.Peer{
				{
					PublicKey: nk,
					DiscoKey:  newDisco.Public(),
				},
			},
		}
		if change.tsmp {
			// Simulate having already learned the new key via TSMP
			// before control delivers it.
			ue.PatchDiscoKey(nk, newDisco.Public())
		}
		err = e.Reconfig(cfg, routerCfg, &dns.Config{})
		if err != nil {
			t.Fatal(err)
		}
		changeMap := <-discoChangedChan
		if _, ok := changeMap[nk]; ok != change.inMap {
			t.Fatalf("expect key %v in map %v to be %t, got %t", nk, changeMap,
				change.inMap, ok)
		}
	}
}
// TestUserspaceEngineTSMPLearnedMismatch verifies that when a disco
// key learned over TSMP disagrees with the key later delivered by
// control, the engine reports the peer as disco-changed (resetting to
// control's key) and bumps metricTSMPLearnedKeyMismatch, rather than
// keeping the stale learned key. See #19574/#19575.
func TestUserspaceEngineTSMPLearnedMismatch(t *testing.T) {
	bus := eventbustest.NewBus(t)
	ht := health.NewTracker(bus)
	reg := new(usermetric.Registry)
	e, err := NewFakeUserspaceEngine(t.Logf, 0, ht, reg, bus)
	if err != nil {
		t.Fatal(err)
	}
	t.Cleanup(e.Close)
	ue := e.(*userspaceEngine)
	// Capture the set of peers whose disco key Reconfig considered
	// changed; one map is delivered per Reconfig call.
	discoChangedChan := make(chan map[key.NodePublic]bool, 1)
	ue.testDiscoChangedHook = func(m map[key.NodePublic]bool) {
		discoChangedChan <- m
	}
	routerCfg := &router.Config{}
	// Running total of the (package-global) mismatch metric so each
	// case can assert on the per-iteration delta.
	var metricValue int64
	// tsmp: pre-patch a disco key via PatchDiscoKey before Reconfig.
	// inMap: expect the peer in the disco-changed set.
	// wrongKey: the patched key deliberately differs from the config's.
	keyChanges := []struct {
		tsmp     bool
		inMap    bool
		wrongKey bool
	}{
		{tsmp: false, inMap: false, wrongKey: false},
		{tsmp: true, inMap: false, wrongKey: false},
		{tsmp: true, inMap: true, wrongKey: true},
		{tsmp: false, inMap: true, wrongKey: false},
	}
	nkHex := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
	for _, change := range keyChanges {
		oldDisco := key.NewDisco()
		nm := &netmap.NetworkMap{
			Peers: nodeViews([]*tailcfg.Node{
				{
					ID:       1,
					Key:      nkFromHex(nkHex),
					DiscoKey: oldDisco.Public(),
				},
			}),
		}
		nk, err := key.ParseNodePublicUntyped(mem.S(nkHex))
		if err != nil {
			t.Fatal(err)
		}
		e.SetNetworkMap(nm)
		newDisco := key.NewDisco()
		cfg := &wgcfg.Config{
			Peers: []wgcfg.Peer{
				{
					PublicKey: nk,
					DiscoKey:  newDisco.Public(),
				},
			},
		}
		tsmpKey := newDisco.Public()
		if change.tsmp {
			if change.wrongKey {
				// Simulate a TSMP-learned key that control later
				// contradicts.
				tsmpKey = key.NewDisco().Public()
			}
			ue.PatchDiscoKey(nk, tsmpKey)
		}
		err = e.Reconfig(cfg, routerCfg, &dns.Config{})
		if err != nil {
			t.Fatal(err)
		}
		changeMap := <-discoChangedChan
		if _, ok := changeMap[nk]; ok != change.inMap {
			t.Fatalf("expect key %v in map %v to be %t, got %t", nk, changeMap,
				change.inMap, ok)
		}
		metric := metricTSMPLearnedKeyMismatch.Value()
		delta := metric - metricValue
		metricValue = metric
		if change.wrongKey && delta != 1 {
			t.Fatalf("expected a delta of 1, got %d", delta)
		}
		// Also assert the converse: matching or unpatched keys must
		// not be counted as mismatches.
		if !change.wrongKey && delta != 0 {
			t.Fatalf("expected a delta of 0, got %d", delta)
		}
	}
}
// TestUserspaceEnginePortReconfig verifies that toggling the
// RandomizeClientPort control knob across Reconfig calls moves the
// magicsock local port away from, and then back to, the explicitly
// requested port.
func TestUserspaceEnginePortReconfig(t *testing.T) {
	flakytest.Mark(t, "https://github.com/tailscale/tailscale/issues/2855")
	const defaultPort = 49983
	var knobs controlknobs.Knobs
	bus := eventbustest.NewBus(t)
	// Keep making a wgengine until we find an unused port; if the
	// requested port was taken, LocalPort won't match the attempt.
	var ue *userspaceEngine
	ht := health.NewTracker(bus)
	reg := new(usermetric.Registry)
	for range 100 {
		attempt := uint16(defaultPort + rand.Intn(1000))
		e, err := NewFakeUserspaceEngine(t.Logf, attempt, &knobs, ht, reg, bus)
		if err != nil {
			t.Fatal(err)
		}
		ue = e.(*userspaceEngine)
		if ue.magicConn.LocalPort() == attempt {
			break
		}
		ue.Close()
		ue = nil
	}
	if ue == nil {
		t.Fatal("could not create a wgengine with a specific port")
	}
	t.Cleanup(ue.Close)
	startingPort := ue.magicConn.LocalPort()
	nodeKey, err := key.ParseNodePublicUntyped(mem.S("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"))
	if err != nil {
		t.Fatal(err)
	}
	cfg := &wgcfg.Config{
		Peers: []wgcfg.Peer{
			{
				PublicKey: nodeKey,
				AllowedIPs: []netip.Prefix{
					netip.PrefixFrom(netaddr.IPv4(100, 100, 99, 1), 32),
				},
			},
		},
	}
	routerCfg := &router.Config{}
	// With the knob off, a Reconfig must not move the port.
	if err := ue.Reconfig(cfg, routerCfg, &dns.Config{}); err != nil {
		t.Fatal(err)
	}
	if got := ue.magicConn.LocalPort(); got != startingPort {
		t.Errorf("no debug setting changed local port to %d from %d", got, startingPort)
	}
	// With the knob on, a Reconfig should rebind to a random port.
	knobs.RandomizeClientPort.Store(true)
	if err := ue.Reconfig(cfg, routerCfg, &dns.Config{}); err != nil {
		t.Fatal(err)
	}
	if got := ue.magicConn.LocalPort(); got == startingPort {
		t.Errorf("debug setting did not change local port from %d", startingPort)
	}
	lastPort := ue.magicConn.LocalPort()
	// Turning the knob back off should rebind again, away from the
	// randomized port.
	knobs.RandomizeClientPort.Store(false)
	if err := ue.Reconfig(cfg, routerCfg, &dns.Config{}); err != nil {
		t.Fatal(err)
	}
	if startingPort == defaultPort {
		// Only try this if we managed to bind defaultPort the first time.
		// Otherwise, assume someone else on the computer is using defaultPort
		// and so Reconfig would have caused magicsock to bind some other port.
		if got := ue.magicConn.LocalPort(); got != defaultPort {
			t.Errorf("debug setting did not change local port from %d to %d", startingPort, defaultPort)
		}
	}
	if got := ue.magicConn.LocalPort(); got == lastPort {
		t.Errorf("Reconfig did not change local port from %d", lastPort)
	}
}
// TestUserspaceEnginePeerMTUReconfig tests that enabling and disabling
// peer path MTU discovery (via the PeerMTUEnable control knob) takes
// effect on Reconfig, updating both the magicsock peer-MTU state and
// the socket don't-fragment bit(s).
func TestUserspaceEnginePeerMTUReconfig(t *testing.T) {
	if runtime.GOOS != "linux" && runtime.GOOS != "darwin" {
		t.Skipf("skipping on %q; peer MTU not supported", runtime.GOOS)
	}
	// Clear the env override so only the control knob drives PMTUD,
	// restoring the previous value when the test finishes.
	defer os.Setenv("TS_DEBUG_ENABLE_PMTUD", os.Getenv("TS_DEBUG_ENABLE_PMTUD"))
	envknob.Setenv("TS_DEBUG_ENABLE_PMTUD", "")
	// Turn on debugging to help diagnose problems.
	defer os.Setenv("TS_DEBUG_PMTUD", os.Getenv("TS_DEBUG_PMTUD"))
	envknob.Setenv("TS_DEBUG_PMTUD", "true")
	var knobs controlknobs.Knobs
	bus := eventbustest.NewBus(t)
	ht := health.NewTracker(bus)
	reg := new(usermetric.Registry)
	e, err := NewFakeUserspaceEngine(t.Logf, 0, &knobs, ht, reg, bus)
	if err != nil {
		t.Fatal(err)
	}
	t.Cleanup(e.Close)
	ue := e.(*userspaceEngine)
	if ue.magicConn.PeerMTUEnabled() != false {
		t.Error("peer MTU enabled by default, should not be")
	}
	osDefaultDF, err := ue.magicConn.DontFragSetting()
	if err != nil {
		t.Errorf("get don't fragment bit failed: %v", err)
	}
	t.Logf("Info: OS default don't fragment bit(s) setting: %v", osDefaultDF)
	// Build a set of configs to use as we change the peer MTU settings.
	nodeKey, err := key.ParseNodePublicUntyped(mem.S("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"))
	if err != nil {
		t.Fatal(err)
	}
	cfg := &wgcfg.Config{
		Peers: []wgcfg.Peer{
			{
				PublicKey: nodeKey,
				AllowedIPs: []netip.Prefix{
					netip.PrefixFrom(netaddr.IPv4(100, 100, 99, 1), 32),
				},
			},
		},
	}
	routerCfg := &router.Config{}
	tests := []struct {
		desc    string   // test description
		wantP   bool     // desired value of PMTUD setting
		wantDF  bool     // desired value of don't fragment bits
		shouldP opt.Bool // if set, force peer MTU to this value
	}{
		{desc: "after_first_reconfig", wantP: false, wantDF: osDefaultDF, shouldP: ""},
		{desc: "enabling_PMTUD_first_time", wantP: true, wantDF: true, shouldP: "true"},
		{desc: "disabling_PMTUD", wantP: false, wantDF: false, shouldP: "false"},
		{desc: "enabling_PMTUD_second_time", wantP: true, wantDF: true, shouldP: "true"},
		{desc: "returning_to_default_PMTUD", wantP: false, wantDF: false, shouldP: ""},
	}
	for _, tt := range tests {
		t.Run(tt.desc, func(t *testing.T) {
			if v, ok := tt.shouldP.Get(); ok {
				knobs.PeerMTUEnable.Store(v)
			} else {
				knobs.PeerMTUEnable.Store(false)
			}
			if err := ue.Reconfig(cfg, routerCfg, &dns.Config{}); err != nil {
				t.Fatal(err)
			}
			if v := ue.magicConn.PeerMTUEnabled(); v != tt.wantP {
				t.Errorf("peer MTU set to %v, want %v", v, tt.wantP)
			}
			// BUG FIX: the failure message previously printed
			// tt.wantP even though the comparison is against
			// tt.wantDF, reporting the wrong expected value.
			if v, err := ue.magicConn.DontFragSetting(); v != tt.wantDF || err != nil {
				t.Errorf("don't fragment bit set to %v, want %v, err %v", v, tt.wantDF, err)
			}
		})
	}
}
// TestTSMPKeyAdvertisement configures an engine with a self node and
// one peer, calls sendTSMPDiscoAdvertisement directly, and checks via
// the sent/error usermetrics that an advertisement went out.
func TestTSMPKeyAdvertisement(t *testing.T) {
	var knobs controlknobs.Knobs
	bus := eventbustest.NewBus(t)
	ht := health.NewTracker(bus)
	reg := new(usermetric.Registry)
	e, err := NewFakeUserspaceEngine(t.Logf, 0, &knobs, ht, reg, bus)
	if err != nil {
		t.Fatal(err)
	}
	t.Cleanup(e.Close)
	ue := e.(*userspaceEngine)
	routerCfg := &router.Config{}
	nodeKey := nkFromHex("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb")
	// The netmap includes a SelfNode with Tailscale addresses —
	// presumably the source addresses for the advertisement; verify
	// against sendTSMPDiscoAdvertisement if this changes.
	nm := &netmap.NetworkMap{
		Peers: nodeViews([]*tailcfg.Node{
			{
				ID:  1,
				Key: nodeKey,
			},
		}),
		SelfNode: (&tailcfg.Node{
			StableID:  "TESTCTRL00000001",
			Name:      "test-node.test.ts.net",
			Addresses: []netip.Prefix{netip.MustParsePrefix("100.64.0.1/32"), netip.MustParsePrefix("fd7a:115c:a1e0:ab12:4843:cd96:0:1/128")},
		}).View(),
	}
	cfg := &wgcfg.Config{
		Peers: []wgcfg.Peer{
			{
				PublicKey: nodeKey,
				AllowedIPs: []netip.Prefix{
					netip.PrefixFrom(netaddr.IPv4(100, 100, 99, 1), 32),
				},
			},
		},
	}
	ue.SetNetworkMap(nm)
	err = ue.Reconfig(cfg, routerCfg, &dns.Config{})
	if err != nil {
		t.Fatal(err)
	}
	// Advertise to the peer's AllowedIPs address configured above.
	addr := netip.MustParseAddr("100.100.99.1")
	// The metric is package-global, so compare against its value just
	// before sending rather than assuming it starts at zero.
	previousValue := metricTSMPDiscoKeyAdvertisementSent.Value()
	ue.sendTSMPDiscoAdvertisement(addr)
	if val := metricTSMPDiscoKeyAdvertisementSent.Value(); val <= previousValue {
		errs := metricTSMPDiscoKeyAdvertisementError.Value()
		t.Errorf("Expected 1 disco key advert, got %d, errors %d", val, errs)
	}
	// Remove config to have the engine shut down more consistently
	err = ue.Reconfig(&wgcfg.Config{}, &router.Config{}, &dns.Config{})
	if err != nil {
		t.Fatal(err)
	}
}
// nkFromHex parses a 64-hex-character node public key, panicking on
// any malformed input. For test fixtures only.
func nkFromHex(hex string) key.NodePublic {
	if len(hex) != 64 {
		panic(fmt.Sprintf("%q is len %d; want 64", hex, len(hex)))
	}
	nk, err := key.ParseNodePublicUntyped(mem.S(hex[:64]))
	if err == nil {
		return nk
	}
	panic(fmt.Sprintf("%q is not hex: %v", hex, err))
}
// makeMaybeReconfigInputs builds a maybeReconfigInputs with n peers,
// each with a unique key, disco key, and AllowedIPs entry.
func makeMaybeReconfigInputs(n int) *maybeReconfigInputs {
	var (
		peers      = make([]wgcfg.Peer, 0, n)
		trimmed    = make(map[key.NodePublic]bool, n)
		trackNodes = make([]key.NodePublic, 0, n)
		trackIPs   = make([]netip.Addr, 0, n)
	)
	for i := range n {
		pub := key.NewNode().Public()
		// Deterministic per-peer CGNAT-range address: 100.64.hi.lo.
		addr := netip.AddrFrom4([4]byte{100, 64, byte(i >> 8), byte(i)})
		peers = append(peers, wgcfg.Peer{
			PublicKey:  pub,
			DiscoKey:   key.NewDisco().Public(),
			AllowedIPs: []netip.Prefix{netip.PrefixFrom(addr, 32)},
		})
		trimmed[pub] = true
		trackNodes = append(trackNodes, pub)
		trackIPs = append(trackIPs, addr)
	}
	return &maybeReconfigInputs{
		WGConfig: &wgcfg.Config{
			PrivateKey: key.NewNode(),
			Peers:      peers,
			MTU:        1280,
		},
		TrimmedNodes: trimmed,
		TrackNodes:   views.SliceOf(trackNodes),
		TrackIPs:     views.SliceOf(trackIPs),
	}
}
// TestMaybeReconfigInputsEqual tests maybeReconfigInputs.Equal: nil
// handling, reflexivity, clone equality, and — via a mutator table
// guarded by a reflect field count — that Equal detects a change in
// every field of the struct.
func TestMaybeReconfigInputsEqual(t *testing.T) {
	a := makeMaybeReconfigInputs(100)
	b := a.Clone()
	// nil cases
	if !(*maybeReconfigInputs)(nil).Equal(nil) {
		t.Error("nil.Equal(nil) should be true")
	}
	if a.Equal(nil) {
		t.Error("non-nil.Equal(nil) should be false")
	}
	if (*maybeReconfigInputs)(nil).Equal(a) {
		t.Error("nil.Equal(non-nil) should be false")
	}
	// same pointer
	if !a.Equal(a) {
		t.Error("a.Equal(a) should be true")
	}
	// cloned equal value
	if !a.Equal(b) {
		t.Error("a.Equal(clone) should be true")
	}
	// Verify that every field in the struct is covered by Equal.
	// Each entry mutates exactly one field of a clone and expects
	// Equal to return false. If a new field is added to
	// maybeReconfigInputs without a corresponding entry here, the
	// field count check below will fail.
	type mutator struct {
		field string
		fn    func(c *maybeReconfigInputs)
	}
	mutators := []mutator{
		{"WGConfig", func(c *maybeReconfigInputs) {
			c.WGConfig.MTU = 9999
		}},
		{"TrimmedNodes", func(c *maybeReconfigInputs) {
			c.TrimmedNodes[key.NewNode().Public()] = true
		}},
		{"TrackNodes", func(c *maybeReconfigInputs) {
			ns := c.TrackNodes.AsSlice()
			ns[0] = key.NewNode().Public()
			c.TrackNodes = views.SliceOf(ns)
		}},
		{"TrackIPs", func(c *maybeReconfigInputs) {
			ips := c.TrackIPs.AsSlice()
			ips[0] = netip.MustParseAddr("1.2.3.4")
			c.TrackIPs = views.SliceOf(ips)
		}},
	}
	// Ensure we have a mutator for every field.
	numFields := reflect.TypeOf(maybeReconfigInputs{}).NumField()
	if len(mutators) != numFields {
		t.Fatalf("maybeReconfigInputs has %d fields but test covers %d; update the mutators table", numFields, len(mutators))
	}
	// Each mutation starts from a fresh clone so changes don't stack.
	for _, m := range mutators {
		c := a.Clone()
		m.fn(c)
		if a.Equal(c) {
			t.Errorf("Equal did not detect change in field %s", m.field)
		}
	}
}
// BenchmarkMaybeReconfigInputsEqual measures maybeReconfigInputs.Equal
// on a value and its clone across increasing peer counts.
func BenchmarkMaybeReconfigInputsEqual(b *testing.B) {
	sizes := []int{10, 100, 1000, 5000}
	for _, peers := range sizes {
		b.Run(fmt.Sprintf("peers=%d", peers), func(b *testing.B) {
			x := makeMaybeReconfigInputs(peers)
			y := x.Clone()
			b.ReportAllocs()
			b.ResetTimer()
			for range b.N {
				x.Equal(y)
			}
		})
	}
}
// an experiment to see if genLocalAddrFunc was worth it. As of Go
// 1.16, it still very much is. (30-40x faster)
func BenchmarkGenLocalAddrFunc(b *testing.B) {
la1 := netip.MustParseAddr("1.2.3.4")
la2 := netip.MustParseAddr("::4")
lanot := netip.MustParseAddr("5.5.5.5")
var x bool
b.Run("map1", func(b *testing.B) {
b.ReportAllocs()
b.ResetTimer()
m := map[netip.Addr]bool{
la1: true,
}
for range b.N {
x = m[la1]
x = m[lanot]
}
})
b.Run("map2", func(b *testing.B) {
b.ReportAllocs()
b.ResetTimer()
m := map[netip.Addr]bool{
la1: true,
la2: true,
}
for range b.N {
x = m[la1]
x = m[lanot]
}
})
b.Run("or1", func(b *testing.B) {
b.ReportAllocs()
b.ResetTimer()
f := func(t netip.Addr) bool {
return t == la1
}
for range b.N {
x = f(la1)
x = f(lanot)
}
})
b.Run("or2", func(b *testing.B) {
b.ReportAllocs()
b.ResetTimer()
f := func(t netip.Addr) bool {
return t == la1 || t == la2
}
for range b.N {
x = f(la1)
x = f(lanot)
}
})
b.Logf("x = %v", x)
}