mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-05 15:26:14 -04:00
* feat(distributed): NATS JWT auth, TLS/mTLS options, and e2e coverage Mint per-node NATS user JWTs at registration when LOCALAI_NATS_ACCOUNT_SEED is set, and connect workers with scoped credentials from the register response. Add optional LOCALAI_NATS_TLS_CA/CERT/KEY for private CA and mTLS alongside tls:// URLs, plus test-e2e-distributed and NatsJWT container e2e specs. Document JWT setup (nats-auth-setup.sh) and TLS env vars in distributed-mode. Assisted-by: Grok:grok grok-build Signed-off-by: Richard Palethorpe <io@richiejp.com> * fix(distributed): correct NATS JWT scoping and harden client auth The JWT-auth path added in 46467cc7 had several gaps that fail silently under LOCALAI_NATS_REQUIRE_AUTH: - Agent-worker minted JWTs did not allow the subjects the agent worker actually subscribes to (jobs.mcp-ci.new and nodes.<id>.backend.stop), so MCP-CI jobs and backend-stop session cleanup were silently dropped. Scope the agent permission set to those subjects. - NATS subscription permission violations were swallowed (Subscribe returned a live-but-dead subscription). Confirm subscriptions with a server round-trip so a denial surfaces synchronously, and log async permission errors. - The backend worker connected anonymously when given a JWT without its paired seed; reject the unpaired credential instead. - The documented service-user permissions in nats-auth-setup.sh omitted prefixcache.>, which the frontend publishes and subscribes; add it. Also: add a credential-provider hook to the messaging client (consumed by the follow-up credential-lifecycle change), drop the always-nil error from NatsMessagingOptions, run go mod tidy (jwt/v2 and nkeys are now direct), and gofmt the feature's files. Tests: an agent-JWT e2e spec that connects to the enforcing NATS server and exercises every subscription the agent worker makes, plus permission allow-list coverage unit tests. 
Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> * feat(distributed): acquire and auto-refresh worker NATS credentials Workers fetched NATS credentials once at startup, which broke two cases under JWT auth: a worker that registered while still pending admin approval never received a minted JWT (it connected unauthenticated and gave up), and a long-running worker's 24h JWT expired with no way to renew it. Introduce workerregistry.NATSCredentialManager, built on idempotent re-registration (the frontend preserves the node row and mints a fresh JWT each call): - Acquire re-registers through admin approval until the node is approved and credentials are minted (or returns the first success when auth is not required, preserving anonymous-NATS behavior). - RefreshLoop re-registers before the JWT expires (~75% of its lifetime), updating the credentials served to the connection. - Both are bounded (default 100 attempts / consecutive failures) and return an error on exhaustion, so an unapprovable or unrenewable worker exits non-zero and surfaces the problem instead of hanging or drifting toward an expired credential. The messaging client gains WithUserJWTProvider, fetching credentials on each (re)connect so the connection transparently adopts a refreshed JWT when the server expires the old one. RegisterFull exposes the approval status and full response; Register delegates to it. Both the backend worker and the agent worker are wired to this: explicit env credentials are used as-is, minted credentials are acquired-with-wait and refreshed, and a permanent refresh failure shuts the worker down so it restarts and re-acquires. Tests cover Acquire (wait-through-pending, bounded give-up, context cancel), RefreshLoop (refresh-before-expiry, bounded failure, no-expiry exit) and jwtExpiry decoding. Docs updated in distributed-mode.md. 
Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> --------- Signed-off-by: Richard Palethorpe <io@richiejp.com>
201 lines
7.2 KiB
Go
201 lines
7.2 KiB
Go
package workerregistry
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/mudler/LocalAI/pkg/natsauth"
|
|
"github.com/mudler/xlog"
|
|
)
|
|
|
|
// Package-level constants used by the NATS credential manager.
const (
	// statusPending is the registration status reported while a node is
	// still awaiting admin approval. It mirrors nodes.StatusPending and is
	// duplicated here, rather than imported, so that this lightweight
	// registration client does not depend on the nodes package (and its
	// gorm/DB dependencies).
	statusPending = "pending"

	// defaultMaxAttempts is the default bound on how many times Acquire
	// registers, and on how many consecutive failures RefreshLoop tolerates,
	// before giving up. It is large enough to ride out a slow admin approval
	// or a transient frontend outage, yet finite, so that an unauthorized or
	// unapprovable worker exits (non-zero, triggering a restart) and surfaces
	// the problem instead of waiting forever.
	defaultMaxAttempts = 100
)
|
|
|
|
// RegisterFunc performs one idempotent registration round-trip.
|
|
type RegisterFunc func(ctx context.Context) (*RegisterResponse, error)
|
|
|
|
// NATSCredentialManager acquires NATS credentials at startup — waiting through
|
|
// admin approval when required — and refreshes them before the minted JWT
|
|
// expires, by re-registering (which mints a fresh JWT). The live NATS
|
|
// connection adopts a refreshed JWT on its next reconnect via Provider. Safe
|
|
// for concurrent use.
|
|
//
|
|
// It addresses two failure modes: a worker that needs credentials but registers
|
|
// while still pending approval (it would otherwise give up and never connect),
|
|
// and a long-running worker whose 24h JWT expires with no way to renew it.
|
|
type NATSCredentialManager struct {
|
|
register RegisterFunc
|
|
requireCreds bool // block until credentials are present (frontend minting in use)
|
|
|
|
// Tunables; defaults set by NewNATSCredentialManager, overridable in tests.
|
|
initialBackoff time.Duration
|
|
maxBackoff time.Duration
|
|
maxAttempts int // bound on Acquire attempts / consecutive refresh failures (<=0 = unlimited)
|
|
refreshLead float64 // refresh once this fraction of the JWT lifetime has elapsed
|
|
refreshRetry time.Duration
|
|
expiryOf func(jwt string) (time.Time, bool)
|
|
|
|
mu sync.RWMutex
|
|
jwt string
|
|
seed string
|
|
nodeID string
|
|
}
|
|
|
|
// NewNATSCredentialManager builds a manager over register. When requireCreds is
|
|
// true, Acquire blocks until the node is approved and credentials are minted.
|
|
func NewNATSCredentialManager(register RegisterFunc, requireCreds bool) *NATSCredentialManager {
|
|
return &NATSCredentialManager{
|
|
register: register,
|
|
requireCreds: requireCreds,
|
|
initialBackoff: 2 * time.Second,
|
|
maxBackoff: 30 * time.Second,
|
|
maxAttempts: defaultMaxAttempts,
|
|
refreshLead: 0.75,
|
|
refreshRetry: 30 * time.Second,
|
|
expiryOf: jwtExpiry,
|
|
}
|
|
}
|
|
|
|
// jwtExpiry decodes the expiry of a minted user JWT. ok is false when the token
|
|
// is empty/undecodable or carries no expiry (e.g. a non-expiring service JWT).
|
|
func jwtExpiry(token string) (time.Time, bool) {
|
|
if token == "" {
|
|
return time.Time{}, false
|
|
}
|
|
uc, err := natsauth.DecodeUserClaims(token)
|
|
if err != nil || uc.Expires == 0 {
|
|
return time.Time{}, false
|
|
}
|
|
return time.Unix(uc.Expires, 0), true
|
|
}
|
|
|
|
func (m *NATSCredentialManager) store(res *RegisterResponse) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
m.nodeID = res.ID
|
|
if res.NatsJWT != "" && res.NatsUserSeed != "" {
|
|
m.jwt, m.seed = res.NatsJWT, res.NatsUserSeed
|
|
}
|
|
}
|
|
|
|
// Current returns the latest NATS credentials (both empty until acquired).
|
|
func (m *NATSCredentialManager) Current() (jwt, seed string) {
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
return m.jwt, m.seed
|
|
}
|
|
|
|
// NodeID returns the node ID from the most recent registration.
|
|
func (m *NATSCredentialManager) NodeID() string {
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
return m.nodeID
|
|
}
|
|
|
|
// Provider returns a callback compatible with messaging.WithUserJWTProvider,
|
|
// supplying the current credentials on each (re)connect.
|
|
func (m *NATSCredentialManager) Provider() func() (string, string) {
|
|
return m.Current
|
|
}
|
|
|
|
// HasCredentials reports whether complete NATS credentials have been obtained.
|
|
func (m *NATSCredentialManager) HasCredentials() bool {
|
|
jwt, seed := m.Current()
|
|
return jwt != "" && seed != ""
|
|
}
|
|
|
|
// Acquire registers and, when requireCreds is set, keeps re-registering with
|
|
// exponential backoff until the node is approved (status != pending) and
|
|
// credentials are minted. Without requireCreds it returns the first successful
|
|
// response (the historical one-shot behavior, preserved for anonymous NATS).
|
|
func (m *NATSCredentialManager) Acquire(ctx context.Context) (*RegisterResponse, error) {
|
|
backoff := m.initialBackoff
|
|
var lastReason error
|
|
for attempt := 1; m.maxAttempts <= 0 || attempt <= m.maxAttempts; attempt++ {
|
|
res, err := m.register(ctx)
|
|
switch {
|
|
case err != nil:
|
|
lastReason = err
|
|
xlog.Warn("Registration failed, retrying", "attempt", attempt, "next_retry", backoff, "error", err)
|
|
case !m.requireCreds:
|
|
m.store(res)
|
|
return res, nil
|
|
case res.Status == statusPending:
|
|
lastReason = fmt.Errorf("node %s still pending admin approval", res.ID)
|
|
xlog.Info("Node pending admin approval; waiting", "node", res.ID, "attempt", attempt, "next_retry", backoff)
|
|
case res.NatsJWT == "" || res.NatsUserSeed == "":
|
|
lastReason = fmt.Errorf("node %s approved but NATS credentials not minted", res.ID)
|
|
xlog.Info("Node approved but NATS credentials not yet minted; waiting", "node", res.ID, "attempt", attempt, "next_retry", backoff)
|
|
default:
|
|
m.store(res)
|
|
return res, nil
|
|
}
|
|
select {
|
|
case <-ctx.Done():
|
|
return nil, ctx.Err()
|
|
case <-time.After(backoff):
|
|
}
|
|
backoff = min(backoff*2, m.maxBackoff)
|
|
}
|
|
return nil, fmt.Errorf("giving up acquiring NATS credentials after %d attempts: %w", m.maxAttempts, lastReason)
|
|
}
|
|
|
|
// RefreshLoop re-registers to mint a fresh JWT before the current one expires,
|
|
// updating the credentials returned by Current/Provider so the NATS connection
|
|
// adopts them on its next reconnect. It returns nil when ctx is cancelled or
|
|
// when the current credential has no expiry (nothing to refresh), and a non-nil
|
|
// error after maxAttempts consecutive refresh failures — letting the caller
|
|
// exit the worker so it restarts and re-acquires (or surfaces the outage)
|
|
// rather than silently drifting toward an expired, unrenewable JWT.
|
|
func (m *NATSCredentialManager) RefreshLoop(ctx context.Context) error {
|
|
failures := 0
|
|
for {
|
|
jwt, _ := m.Current()
|
|
exp, ok := m.expiryOf(jwt)
|
|
if !ok {
|
|
xlog.Debug("NATS credential has no expiry; refresh loop exiting")
|
|
return nil
|
|
}
|
|
wait := max(time.Duration(float64(time.Until(exp))*m.refreshLead), 0)
|
|
select {
|
|
case <-ctx.Done():
|
|
return nil
|
|
case <-time.After(wait):
|
|
}
|
|
|
|
res, err := m.register(ctx)
|
|
if err == nil && res.NatsJWT != "" && res.NatsUserSeed != "" {
|
|
m.store(res)
|
|
failures = 0
|
|
xlog.Info("Refreshed NATS credentials", "node", res.ID)
|
|
continue
|
|
}
|
|
failures++
|
|
if err != nil {
|
|
xlog.Warn("NATS credential refresh failed; will retry", "attempt", failures, "error", err)
|
|
} else {
|
|
xlog.Warn("NATS credential refresh returned no credentials; will retry", "attempt", failures)
|
|
}
|
|
if m.maxAttempts > 0 && failures >= m.maxAttempts {
|
|
return fmt.Errorf("NATS credential refresh failed %d times in a row", failures)
|
|
}
|
|
// Back off before retrying so a persistent failure near expiry does not spin.
|
|
select {
|
|
case <-ctx.Done():
|
|
return nil
|
|
case <-time.After(m.refreshRetry):
|
|
}
|
|
}
|
|
}
|