mirror of
https://github.com/tailscale/tailscale.git
synced 2026-04-05 07:03:43 -04:00
wgengine: delay disco key exchange when control is unavailable
Instead of adding a delay to the exchange of disco keys to avoid a race with the WireGuard handshake, look for the handshake itself and ensure that we do not start the exchange while the handshake is ongoing. We do not have a direct way of knowing that a handshake is ongoing, only how many are currently in flight or have failed. That makes the polling less straightforward. Ideally, we would have wg tell us when a handshake is ongoing so we can avoid polling for it. Updates tailscale/corp#34037 Signed-off-by: Claus Lensbøl <claus@tailscale.com>
This commit is contained in:
@@ -569,16 +569,49 @@ func NewUserspaceEngine(logf logger.Logf, conf Config) (_ Engine, reterr error)
|
||||
var tsmpRequestGroup singleflight.Group[netip.Addr, struct{}]
|
||||
eventbus.SubscribeFunc(ec, func(req magicsock.TSMPDiscoKeyRequest) {
|
||||
go tsmpRequestGroup.Do(req.DstIP, func() (struct{}, error) {
|
||||
// DiscoKeyRequests are triggered by an incoming WireGuard handshake
|
||||
// initiation arriving before a disco ping, which is a likely
|
||||
// indicator that disco pings failed due to a lack of key
|
||||
// synchronization. If the requests are sent immediately, before the
|
||||
// handshake state is accepted in the WireGuard client state
|
||||
// machine, this starts a new session, and the two peer state
|
||||
// machines conflict, causing loss and additional delays. Delaying
|
||||
// the send avoids this, so coalesce duplicate sends, and delay them
|
||||
// by a short time to avoid the state machine conflict.
|
||||
time.Sleep(time.Millisecond)
|
||||
nodePeer, ok := e.PeerForIP(req.DstIP)
|
||||
if !ok {
|
||||
return struct{}{}, fmt.Errorf("did not find peer by IP %q", req.DstIP)
|
||||
}
|
||||
peer, ok := e.PeerByKey(nodePeer.Node.Key())
|
||||
if !ok {
|
||||
return struct{}{}, fmt.Errorf("did not find peer by key %q", nodePeer.Node.Key())
|
||||
}
|
||||
peer.IsValid()
|
||||
|
||||
// Poll for handshake completion with a timeout.
|
||||
const pollInterval = 10 * time.Millisecond
|
||||
const maxWaitStart = 100 * time.Microsecond
|
||||
ctxStart, cancelStart := context.WithTimeout(context.Background(), maxWaitStart)
|
||||
defer cancelStart()
|
||||
|
||||
sawHandshake := true
|
||||
// Wait for the handshake to be in-progress.
|
||||
e.logf("Looking for magicsock handshake")
|
||||
for peer.HandshakeAttempts() == 0 {
|
||||
if ctxStart.Err() != nil {
|
||||
// Timeout waiting for handshake to start, send TSMP package.
|
||||
sawHandshake = false
|
||||
break
|
||||
}
|
||||
time.Sleep(pollInterval)
|
||||
}
|
||||
e.logf("Found magicsock handshake: %t", sawHandshake)
|
||||
|
||||
const maxWaitComplete = 2 * time.Second
|
||||
ctx, cancel := context.WithTimeout(context.Background(), maxWaitComplete)
|
||||
defer cancel()
|
||||
// Wait for the in-progress handshake to complete.
|
||||
e.logf("Waiting for magicsock handshake to complete")
|
||||
for sawHandshake && peer.HandshakeAttempts() > 0 {
|
||||
if ctx.Err() != nil {
|
||||
// Timeout waiting for completion. The handshake is stuck. Abort.
|
||||
e.logf("Timed out waiting for magicsock handshake to complete")
|
||||
return struct{}{}, errors.New("timeout waiting for handshake to complete")
|
||||
}
|
||||
time.Sleep(pollInterval)
|
||||
}
|
||||
|
||||
if err := e.sendTSMPDiscoKeyRequest(req.DstIP); err != nil {
|
||||
e.logf("wgengine: failed to send TSMP disco key request: %v", err)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user