From 633e892164596bfc69bf064e00a139538c3e2b91 Mon Sep 17 00:00:00 2001 From: Brad Fitzpatrick Date: Mon, 9 Mar 2026 15:47:33 +0000 Subject: [PATCH] ssh/tailssh: fix race between termination message write and session teardown When a recording upload fails mid-session, killProcessOnContextDone writes the termination message to ss.Stderr() and kills the process. Meanwhile, run() takes the ss.ctx.Done() path and proceeds to ss.Exit(), which tears down the SSH channel. The termination message write races with the channel teardown, so the client sometimes never receives it. Fix by adding an exitHandled channel that killProcessOnContextDone closes when done. run() now waits on this channel after ctx.Done() fires, ensuring the termination message is fully written before the SSH channel is torn down. Fixes #7707 Change-Id: Ib60116c928d3af46d553a4186a72963c2c731e3e Signed-off-by: Brad Fitzpatrick --- ssh/tailssh/tailssh.go | 12 ++++++++++++ ssh/tailssh/tailssh_test.go | 6 ------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/ssh/tailssh/tailssh.go b/ssh/tailssh/tailssh.go index debad2b5c..96f9c826c 100644 --- a/ssh/tailssh/tailssh.go +++ b/ssh/tailssh/tailssh.go @@ -712,6 +712,12 @@ type sshSession struct { // We use this sync.Once to ensure that we only terminate the process once, // either it exits itself or is terminated exitOnce sync.Once + + // exitHandled is closed when killProcessOnContextDone finishes writing any + // termination message to the client. run() waits on this before calling + // ss.Exit to ensure the message is flushed before the SSH channel is torn + // down. It is initialized by run() before starting killProcessOnContextDone. + exitHandled chan struct{} } func (ss *sshSession) vlogf(format string, args ...any) { @@ -807,6 +813,7 @@ func (c *conn) fetchSSHAction(ctx context.Context, url string) (*tailcfg.SSHActi // killProcessOnContextDone waits for ss.ctx to be done and kills the process, // unless the process has already exited. func (ss *sshSession) killProcessOnContextDone() { + defer close(ss.exitHandled) <-ss.ctx.Done() // Either the process has already exited, in which case this does nothing. // Or, the process is still running in which case this will kill it. @@ -987,6 +994,7 @@ func (ss *sshSession) run() { ss.Exit(1) return } + ss.exitHandled = make(chan struct{}) go ss.killProcessOnContextDone() var processDone atomic.Bool @@ -1049,6 +1057,10 @@ func (ss *sshSession) run() { select { case <-outputDone: case <-ss.ctx.Done(): + // Wait for killProcessOnContextDone to finish writing any + // termination message to the client before we call ss.Exit, + // which tears down the SSH channel. + <-ss.exitHandled } if err == nil { diff --git a/ssh/tailssh/tailssh_test.go b/ssh/tailssh/tailssh_test.go index 4d6f2172d..ec5774616 100644 --- a/ssh/tailssh/tailssh_test.go +++ b/ssh/tailssh/tailssh_test.go @@ -37,7 +37,6 @@ gossh "golang.org/x/crypto/ssh" "golang.org/x/net/http2" "golang.org/x/net/http2/h2c" - "tailscale.com/cmd/testwrapper/flakytest" "tailscale.com/ipn/ipnlocal" "tailscale.com/ipn/store/mem" "tailscale.com/net/memnet" @@ -490,14 +489,9 @@ func newSSHRule(action *tailcfg.SSHAction) *tailcfg.SSHRule { } func TestSSHRecordingCancelsSessionsOnUploadFailure(t *testing.T) { - flakytest.Mark(t, "https://github.com/tailscale/tailscale/issues/7707") - if runtime.GOOS != "linux" && runtime.GOOS != "darwin" { t.Skipf("skipping on %q; only runs on linux and darwin", runtime.GOOS) } - if runtime.GOOS == "darwin" && cibuild.On() { - t.Skipf("this fails on CI on macOS; see https://github.com/tailscale/tailscale/issues/7707") - } var handler http.HandlerFunc recordingServer := mockRecordingServer(t, func(w http.ResponseWriter, r *http.Request) {