ssh/tailssh: fix race between termination message write and session teardown

When a recording upload fails mid-session, killProcessOnContextDone
writes the termination message to ss.Stderr() and kills the process.
Meanwhile, run() takes the ss.ctx.Done() path and proceeds to
ss.Exit(), which tears down the SSH channel. The termination message
write races with the channel teardown, so the client sometimes never
receives it.

Fix by adding an exitHandled channel that killProcessOnContextDone
closes when done. run() now waits on this channel after ctx.Done()
fires, ensuring the termination message is fully written before
the SSH channel is torn down.

Fixes #7707

Change-Id: Ib60116c928d3af46d553a4186a72963c2c731e3e
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
This commit is contained in:
Brad Fitzpatrick
2026-03-09 15:47:33 +00:00
committed by Brad Fitzpatrick
parent a4614d7d17
commit 633e892164
2 changed files with 12 additions and 6 deletions

View File

@@ -712,6 +712,12 @@ type sshSession struct {
// We use this sync.Once to ensure that we only terminate the process once,
// either it exits itself or is terminated
exitOnce sync.Once
// exitHandled is closed when killProcessOnContextDone finishes writing any
// termination message to the client. run() waits on this before calling
// ss.Exit to ensure the message is flushed before the SSH channel is torn
// down. It is initialized by run() before starting killProcessOnContextDone.
exitHandled chan struct{}
}
func (ss *sshSession) vlogf(format string, args ...any) {
@@ -807,6 +813,7 @@ func (c *conn) fetchSSHAction(ctx context.Context, url string) (*tailcfg.SSHActi
// killProcessOnContextDone waits for ss.ctx to be done and kills the process,
// unless the process has already exited.
func (ss *sshSession) killProcessOnContextDone() {
defer close(ss.exitHandled)
<-ss.ctx.Done()
// Either the process has already exited, in which case this does nothing.
// Or, the process is still running in which case this will kill it.
@@ -987,6 +994,7 @@ func (ss *sshSession) run() {
ss.Exit(1)
return
}
ss.exitHandled = make(chan struct{})
go ss.killProcessOnContextDone()
var processDone atomic.Bool
@@ -1049,6 +1057,10 @@ func (ss *sshSession) run() {
select {
case <-outputDone:
case <-ss.ctx.Done():
// Wait for killProcessOnContextDone to finish writing any
// termination message to the client before we call ss.Exit,
// which tears down the SSH channel.
<-ss.exitHandled
}
if err == nil {

View File

@@ -37,7 +37,6 @@
gossh "golang.org/x/crypto/ssh"
"golang.org/x/net/http2"
"golang.org/x/net/http2/h2c"
"tailscale.com/cmd/testwrapper/flakytest"
"tailscale.com/ipn/ipnlocal"
"tailscale.com/ipn/store/mem"
"tailscale.com/net/memnet"
@@ -490,14 +489,9 @@ func newSSHRule(action *tailcfg.SSHAction) *tailcfg.SSHRule {
}
func TestSSHRecordingCancelsSessionsOnUploadFailure(t *testing.T) {
flakytest.Mark(t, "https://github.com/tailscale/tailscale/issues/7707")
if runtime.GOOS != "linux" && runtime.GOOS != "darwin" {
t.Skipf("skipping on %q; only runs on linux and darwin", runtime.GOOS)
}
if runtime.GOOS == "darwin" && cibuild.On() {
t.Skipf("this fails on CI on macOS; see https://github.com/tailscale/tailscale/issues/7707")
}
var handler http.HandlerFunc
recordingServer := mockRecordingServer(t, func(w http.ResponseWriter, r *http.Request) {