mirror of
https://github.com/tailscale/tailscale.git
synced 2026-03-25 09:42:39 -04:00
ssh/tailssh: fix race between termination message write and session teardown
When a recording upload fails mid-session, killProcessOnContextDone writes the termination message to ss.Stderr() and kills the process. Meanwhile, run() takes the ss.ctx.Done() path and proceeds to ss.Exit(), which tears down the SSH channel. The termination message write races with the channel teardown, so the client sometimes never receives it. Fix by adding an exitHandled channel that killProcessOnContextDone closes when done. run() now waits on this channel after ctx.Done() fires, ensuring the termination message is fully written before the SSH channel is torn down. Fixes #7707 Change-Id: Ib60116c928d3af46d553a4186a72963c2c731e3e Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
This commit is contained in:
committed by
Brad Fitzpatrick
parent
a4614d7d17
commit
633e892164
@@ -712,6 +712,12 @@ type sshSession struct {
|
||||
// We use this sync.Once to ensure that we only terminate the process once,
|
||||
// either it exits itself or is terminated
|
||||
exitOnce sync.Once
|
||||
|
||||
// exitHandled is closed when killProcessOnContextDone finishes writing any
|
||||
// termination message to the client. run() waits on this before calling
|
||||
// ss.Exit to ensure the message is flushed before the SSH channel is torn
|
||||
// down. It is initialized by run() before starting killProcessOnContextDone.
|
||||
exitHandled chan struct{}
|
||||
}
|
||||
|
||||
func (ss *sshSession) vlogf(format string, args ...any) {
|
||||
@@ -807,6 +813,7 @@ func (c *conn) fetchSSHAction(ctx context.Context, url string) (*tailcfg.SSHActi
|
||||
// killProcessOnContextDone waits for ss.ctx to be done and kills the process,
|
||||
// unless the process has already exited.
|
||||
func (ss *sshSession) killProcessOnContextDone() {
|
||||
defer close(ss.exitHandled)
|
||||
<-ss.ctx.Done()
|
||||
// Either the process has already exited, in which case this does nothing.
|
||||
// Or, the process is still running in which case this will kill it.
|
||||
@@ -987,6 +994,7 @@ func (ss *sshSession) run() {
|
||||
ss.Exit(1)
|
||||
return
|
||||
}
|
||||
ss.exitHandled = make(chan struct{})
|
||||
go ss.killProcessOnContextDone()
|
||||
|
||||
var processDone atomic.Bool
|
||||
@@ -1049,6 +1057,10 @@ func (ss *sshSession) run() {
|
||||
select {
|
||||
case <-outputDone:
|
||||
case <-ss.ctx.Done():
|
||||
// Wait for killProcessOnContextDone to finish writing any
|
||||
// termination message to the client before we call ss.Exit,
|
||||
// which tears down the SSH channel.
|
||||
<-ss.exitHandled
|
||||
}
|
||||
|
||||
if err == nil {
|
||||
|
||||
@@ -37,7 +37,6 @@
|
||||
gossh "golang.org/x/crypto/ssh"
|
||||
"golang.org/x/net/http2"
|
||||
"golang.org/x/net/http2/h2c"
|
||||
"tailscale.com/cmd/testwrapper/flakytest"
|
||||
"tailscale.com/ipn/ipnlocal"
|
||||
"tailscale.com/ipn/store/mem"
|
||||
"tailscale.com/net/memnet"
|
||||
@@ -490,14 +489,9 @@ func newSSHRule(action *tailcfg.SSHAction) *tailcfg.SSHRule {
|
||||
}
|
||||
|
||||
func TestSSHRecordingCancelsSessionsOnUploadFailure(t *testing.T) {
|
||||
flakytest.Mark(t, "https://github.com/tailscale/tailscale/issues/7707")
|
||||
|
||||
if runtime.GOOS != "linux" && runtime.GOOS != "darwin" {
|
||||
t.Skipf("skipping on %q; only runs on linux and darwin", runtime.GOOS)
|
||||
}
|
||||
if runtime.GOOS == "darwin" && cibuild.On() {
|
||||
t.Skipf("this fails on CI on macOS; see https://github.com/tailscale/tailscale/issues/7707")
|
||||
}
|
||||
|
||||
var handler http.HandlerFunc
|
||||
recordingServer := mockRecordingServer(t, func(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
Reference in New Issue
Block a user