From 8829516dfdc868d7df2788a2edc176a6d384fe7c Mon Sep 17 00:00:00 2001 From: chaosinthecrd Date: Wed, 1 Apr 2026 13:16:01 +0100 Subject: [PATCH] cmd/k8s-proxy: add auth key renewal support Add auth key reissue handling to k8s-proxy, mirroring containerboot. When the proxy detects an auth failure (login-state health warning or NeedsLogin state), it disconnects from control, signals the operator via the state Secret, waits for a new key, clears stale state, and exits so Kubernetes restarts the pod with the new key. A health watcher goroutine runs alongside ts.Up() to short-circuit the startup timeout on terminal auth failures. --- cmd/k8s-proxy/k8s-proxy.go | 86 +++++++++++++++++++--- cmd/k8s-proxy/kube.go | 144 +++++++++++++++++++++++++++++++++++++ cmd/k8s-proxy/kube_test.go | 91 +++++++++++++++++++++++ 3 files changed, 313 insertions(+), 8 deletions(-) create mode 100644 cmd/k8s-proxy/kube.go create mode 100644 cmd/k8s-proxy/kube_test.go diff --git a/cmd/k8s-proxy/k8s-proxy.go b/cmd/k8s-proxy/k8s-proxy.go index 38a86a5e0..2770b777f 100644 --- a/cmd/k8s-proxy/k8s-proxy.go +++ b/cmd/k8s-proxy/k8s-proxy.go @@ -31,6 +31,7 @@ "k8s.io/utils/strings/slices" "tailscale.com/client/local" "tailscale.com/cmd/k8s-proxy/internal/config" + "tailscale.com/health" "tailscale.com/hostinfo" "tailscale.com/ipn" "tailscale.com/ipn/store" @@ -41,6 +42,7 @@ "tailscale.com/kube/certs" healthz "tailscale.com/kube/health" "tailscale.com/kube/k8s-proxy/conf" + "tailscale.com/kube/kubeclient" "tailscale.com/kube/kubetypes" klc "tailscale.com/kube/localclient" "tailscale.com/kube/metrics" @@ -171,10 +173,31 @@ func run(logger *zap.SugaredLogger) error { // If Pod UID unset, assume we're running outside of a cluster/not managed // by the operator, so no need to set additional state keys. + var kc kubeclient.Client + var stateSecretName string if podUID != "" { if err := state.SetInitialKeys(st, podUID); err != nil { return fmt.Errorf("error setting initial state: %w", err) } + + if cfg.Parsed.State != nil { + if name, ok := strings.CutPrefix(*cfg.Parsed.State, "kube:"); ok { + stateSecretName = name + + kc, err = kubeclient.New(k8sProxyFieldManager) + if err != nil { + return err + } + + var configAuthKey string + if cfg.Parsed.AuthKey != nil { + configAuthKey = *cfg.Parsed.AuthKey + } + if err := resetState(ctx, kc, stateSecretName, podUID, configAuthKey); err != nil { + return fmt.Errorf("error resetting state: %w", err) + } + } + } } var authKey string @@ -197,23 +220,68 @@ func run(logger *zap.SugaredLogger) error { ts.Hostname = *cfg.Parsed.Hostname } - // Make sure we crash loop if Up doesn't complete in reasonable time. - upCtx, upCancel := context.WithTimeout(ctx, time.Minute) - defer upCancel() - if _, err := ts.Up(upCtx); err != nil { - return fmt.Errorf("error starting tailscale server: %w", err) - } - defer ts.Close() lc, err := ts.LocalClient() if err != nil { return fmt.Errorf("error getting local client: %w", err) } - // Setup for updating state keys. + // Make sure we crash loop if Up doesn't complete in reasonable time. + upCtx, upCancel := context.WithTimeout(ctx, 30*time.Second) + defer upCancel() + + // ts.Up() deliberately ignores NeedsLogin because it fires transiently + // during normal auth-key login. We can watch for the login-state health + // warning here though, which only fires on terminal auth failure, and + // cancel early. + go func() { + w, err := lc.WatchIPNBus(upCtx, ipn.NotifyInitialHealthState) + if err != nil { + return + } + defer w.Close() + for { + n, err := w.Next() + if err != nil { + return + } + if n.Health != nil { + if _, ok := n.Health.Warnings[health.LoginStateWarnable.Code]; ok { + upCancel() + return + } + } + } + }() + + if _, err := ts.Up(upCtx); err != nil { + if kc != nil && stateSecretName != "" { + return handleAuthKeyReissue(ctx, lc, kc, stateSecretName, authKey, cfgChan, logger) + } + return err + } + + defer ts.Close() + + reissueCh := make(chan struct{}, 1) if podUID != "" { group.Go(func() error { return state.KeepKeysUpdated(ctx, st, klc.New(lc)) }) + + if kc != nil && stateSecretName != "" { + needsReissue, err := checkInitialAuthState(ctx, lc) + if err != nil { + return fmt.Errorf("error checking initial auth state: %w", err) + } + if needsReissue { + logger.Info("Auth key missing or invalid after startup, requesting new key from operator") + return handleAuthKeyReissue(ctx, lc, kc, stateSecretName, authKey, cfgChan, logger) + } + + group.Go(func() error { + return monitorAuthHealth(ctx, lc, reissueCh, logger) + }) + } } if cfg.Parsed.HealthCheckEnabled.EqualBool(true) || cfg.Parsed.MetricsEnabled.EqualBool(true) { @@ -362,6 +430,8 @@ func run(logger *zap.SugaredLogger) error { } cfgLogger.Infof("Config reloaded") + case <-reissueCh: + return handleAuthKeyReissue(ctx, lc, kc, stateSecretName, authKey, cfgChan, logger) } } } diff --git a/cmd/k8s-proxy/kube.go b/cmd/k8s-proxy/kube.go new file mode 100644 index 000000000..1b2ace5dc --- /dev/null +++ b/cmd/k8s-proxy/kube.go @@ -0,0 +1,144 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +//go:build !plan9 + +package main + +import ( + "context" + "fmt" + "strings" + "time" + + "go.uber.org/zap" + "tailscale.com/client/local" + "tailscale.com/health" + "tailscale.com/ipn" + "tailscale.com/kube/authkey" + "tailscale.com/kube/k8s-proxy/conf" + "tailscale.com/kube/kubeapi" + "tailscale.com/kube/kubeclient" + "tailscale.com/kube/kubetypes" + "tailscale.com/tailcfg" +) + +const k8sProxyFieldManager = "tailscale-k8s-proxy" + +// resetState clears containerboot/k8s-proxy state from previous runs and sets +// initial values. This ensures the operator doesn't use stale state when a Pod +// is first recreated. +// +// It also clears the reissue_authkey marker if the operator has actioned it +// (i.e., the config now has a different auth key than what was marked for +// reissue). +func resetState(ctx context.Context, kc kubeclient.Client, stateSecretName string, podUID string, configAuthKey string) error { + existingSecret, err := kc.GetSecret(ctx, stateSecretName) + switch { + case kubeclient.IsNotFoundErr(err): + return nil + case err != nil: + return fmt.Errorf("failed to read state Secret %q to reset state: %w", stateSecretName, err) + } + + s := &kubeapi.Secret{ + Data: map[string][]byte{ + kubetypes.KeyCapVer: fmt.Appendf(nil, "%d", tailcfg.CurrentCapabilityVersion), + }, + } + if podUID != "" { + s.Data[kubetypes.KeyPodUID] = []byte(podUID) + } + + // Only clear reissue_authkey if the operator has actioned it. + brokenAuthkey, ok := existingSecret.Data[kubetypes.KeyReissueAuthkey] + if ok && configAuthKey != "" && string(brokenAuthkey) != configAuthKey { + s.Data[kubetypes.KeyReissueAuthkey] = nil + } + + return kc.StrategicMergePatchSecret(ctx, stateSecretName, s, k8sProxyFieldManager) +} + +// checkInitialAuthState checks if the tsnet server is in an auth failure state +// immediately after coming up. Returns true if auth key reissue is needed. +func checkInitialAuthState(ctx context.Context, lc *local.Client) (bool, error) { + status, err := lc.Status(ctx) + if err != nil { + return false, fmt.Errorf("error getting status: %w", err) + } + + if status.BackendState == ipn.NeedsLogin.String() { + return true, nil + } + + // Status.Health is a []string of health warnings. + loginWarnableCode := string(health.LoginStateWarnable.Code) + for _, h := range status.Health { + if strings.Contains(h, loginWarnableCode) { + return true, nil + } + } + + return false, nil +} + +// monitorAuthHealth watches the IPN bus for auth failures and triggers reissue +// when needed. Runs until context is cancelled or auth failure is detected. +func monitorAuthHealth(ctx context.Context, lc *local.Client, reissueCh chan<- struct{}, logger *zap.SugaredLogger) error { + w, err := lc.WatchIPNBus(ctx, ipn.NotifyInitialHealthState) + if err != nil { + return fmt.Errorf("failed to watch IPN bus for auth health: %w", err) + } + defer w.Close() + + for { + n, err := w.Next() + if err != nil { + if err == ctx.Err() { + return nil + } + return err + } + if n.Health != nil { + if _, ok := n.Health.Warnings[health.LoginStateWarnable.Code]; ok { + logger.Info("Auth key failed to authenticate (may be expired or single-use), requesting new key from operator") + select { + case reissueCh <- struct{}{}: + case <-ctx.Done(): + } + return nil + } + } + } +} + +// handleAuthKeyReissue orchestrates the auth key reissue flow: +// 1. Disconnect from control +// 2. Set reissue marker in state Secret +// 3. Wait for operator to provide new key +// 4. Exit cleanly (Kubernetes will restart the pod with the new key) +func handleAuthKeyReissue(ctx context.Context, lc *local.Client, kc kubeclient.Client, stateSecretName string, currentAuthKey string, cfgChan <-chan *conf.Config, logger *zap.SugaredLogger) error { + if err := lc.DisconnectControl(ctx); err != nil { + return fmt.Errorf("error disconnecting from control: %w", err) + } + if err := authkey.SetReissueAuthKey(ctx, kc, stateSecretName, currentAuthKey, k8sProxyFieldManager); err != nil { + return fmt.Errorf("failed to set reissue_authkey in Kubernetes Secret: %w", err) + } + + ctx, cancel := context.WithTimeout(ctx, 10*time.Minute) + defer cancel() + for { + select { + case <-ctx.Done(): + return fmt.Errorf("timeout waiting for auth key reissue") + case cfg := <-cfgChan: + if cfg.Parsed.AuthKey != nil && *cfg.Parsed.AuthKey != currentAuthKey { + if err := authkey.ClearReissueAuthKey(ctx, kc, stateSecretName, k8sProxyFieldManager); err != nil { + logger.Warnf("failed to clear reissue request: %v", err) + } + logger.Info("Successfully received new auth key, restarting to apply configuration") + return nil + } + } + } +} diff --git a/cmd/k8s-proxy/kube_test.go b/cmd/k8s-proxy/kube_test.go new file mode 100644 index 000000000..5a9be86fa --- /dev/null +++ b/cmd/k8s-proxy/kube_test.go @@ -0,0 +1,91 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +//go:build !plan9 + +package main + +import ( + "context" + "fmt" + "testing" + + "github.com/google/go-cmp/cmp" + "tailscale.com/kube/kubeapi" + "tailscale.com/kube/kubeclient" + "tailscale.com/kube/kubetypes" + "tailscale.com/tailcfg" +) + +func TestResetState(t *testing.T) { + tests := []struct { + name string + existingData map[string][]byte + podUID string + configAuthKey string + wantPatched map[string][]byte + }{ + { + name: "sets_capver_and_pod_uid", + existingData: map[string][]byte{ + kubetypes.KeyDeviceID: []byte("device-123"), + kubetypes.KeyDeviceFQDN: []byte("node.tailnet"), + kubetypes.KeyDeviceIPs: []byte(`["100.64.0.1"]`), + }, + podUID: "pod-123", + configAuthKey: "new-key", + wantPatched: map[string][]byte{ + kubetypes.KeyPodUID: []byte("pod-123"), + }, + }, + { + name: "clears_reissue_marker_when_actioned", + existingData: map[string][]byte{ + kubetypes.KeyReissueAuthkey: []byte("old-key"), + }, + podUID: "pod-123", + configAuthKey: "new-key", + wantPatched: map[string][]byte{ + kubetypes.KeyPodUID: []byte("pod-123"), + kubetypes.KeyReissueAuthkey: nil, + }, + }, + { + name: "keeps_reissue_marker_when_not_actioned", + existingData: map[string][]byte{ + kubetypes.KeyReissueAuthkey: []byte("old-key"), + }, + podUID: "pod-123", + configAuthKey: "old-key", + wantPatched: map[string][]byte{ + kubetypes.KeyPodUID: []byte("pod-123"), + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tt.wantPatched[kubetypes.KeyCapVer] = fmt.Appendf(nil, "%d", tailcfg.CurrentCapabilityVersion) + + var patched map[string][]byte + kc := &kubeclient.FakeClient{ + GetSecretImpl: func(ctx context.Context, name string) (*kubeapi.Secret, error) { + return &kubeapi.Secret{Data: tt.existingData}, nil + }, + StrategicMergePatchSecretImpl: func(ctx context.Context, name string, s *kubeapi.Secret, fm string) error { + patched = s.Data + return nil + }, + } + + err := resetState(context.Background(), kc, "test-secret", tt.podUID, tt.configAuthKey) + if err != nil { + t.Fatalf("resetState() error = %v", err) + } + + if diff := cmp.Diff(tt.wantPatched, patched); diff != "" { + t.Errorf("resetState() mismatch (-want +got):\n%s", diff) + } + }) + } +}