mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-07 00:06:51 -04:00
* feat(distributed): NATS JWT auth, TLS/mTLS options, and e2e coverage Mint per-node NATS user JWTs at registration when LOCALAI_NATS_ACCOUNT_SEED is set, and connect workers with scoped credentials from the register response. Add optional LOCALAI_NATS_TLS_CA/CERT/KEY for private CA and mTLS alongside tls:// URLs, plus test-e2e-distributed and NatsJWT container e2e specs. Document JWT setup (nats-auth-setup.sh) and TLS env vars in distributed-mode. Assisted-by: Grok:grok grok-build Signed-off-by: Richard Palethorpe <io@richiejp.com> * fix(distributed): correct NATS JWT scoping and harden client auth The JWT-auth path added in 46467cc7 had several gaps that fail silently under LOCALAI_NATS_REQUIRE_AUTH: - Agent-worker minted JWTs did not allow the subjects the agent worker actually subscribes to (jobs.mcp-ci.new and nodes.<id>.backend.stop), so MCP-CI jobs and backend-stop session cleanup were silently dropped. Scope the agent permission set to those subjects. - NATS subscription permission violations were swallowed (Subscribe returned a live-but-dead subscription). Confirm subscriptions with a server round-trip so a denial surfaces synchronously, and log async permission errors. - The backend worker connected anonymously when given a JWT without its paired seed; reject the unpaired credential instead. - The documented service-user permissions in nats-auth-setup.sh omitted prefixcache.>, which the frontend publishes and subscribes; add it. Also: add a credential-provider hook to the messaging client (consumed by the follow-up credential-lifecycle change), drop the always-nil error from NatsMessagingOptions, run go mod tidy (jwt/v2 and nkeys are now direct), and gofmt the feature's files. Tests: an agent-JWT e2e spec that connects to the enforcing NATS server and exercises every subscription the agent worker makes, plus permission allow-list coverage unit tests. 
Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> * feat(distributed): acquire and auto-refresh worker NATS credentials Workers fetched NATS credentials once at startup, which broke two cases under JWT auth: a worker that registered while still pending admin approval never received a minted JWT (it connected unauthenticated and gave up), and a long-running worker's 24h JWT expired with no way to renew it. Introduce workerregistry.NATSCredentialManager, built on idempotent re-registration (the frontend preserves the node row and mints a fresh JWT each call): - Acquire re-registers through admin approval until the node is approved and credentials are minted (or returns the first success when auth is not required, preserving anonymous-NATS behavior). - RefreshLoop re-registers before the JWT expires (~75% of its lifetime), updating the credentials served to the connection. - Both are bounded (default 100 attempts / consecutive failures) and return an error on exhaustion, so an unapprovable or unrenewable worker exits non-zero and surfaces the problem instead of hanging or drifting toward an expired credential. The messaging client gains WithUserJWTProvider, fetching credentials on each (re)connect so the connection transparently adopts a refreshed JWT when the server expires the old one. RegisterFull exposes the approval status and full response; Register delegates to it. Both the backend worker and the agent worker are wired to this: explicit env credentials are used as-is, minted credentials are acquired-with-wait and refreshed, and a permanent refresh failure shuts the worker down so it restarts and re-acquires. Tests cover Acquire (wait-through-pending, bounded give-up, context cancel), RefreshLoop (refresh-before-expiry, bounded failure, no-expiry exit) and jwtExpiry decoding. Docs updated in distributed-mode.md. 
Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> --------- Signed-off-by: Richard Palethorpe <io@richiejp.com>
135 lines
4.7 KiB
Go
135 lines
4.7 KiB
Go
package natsauth_test
|
|
|
|
import (
|
|
"os"
|
|
"regexp"
|
|
"strings"
|
|
|
|
"github.com/mudler/LocalAI/core/services/messaging"
|
|
"github.com/mudler/LocalAI/pkg/natsauth"
|
|
|
|
. "github.com/onsi/ginkgo/v2"
|
|
. "github.com/onsi/gomega"
|
|
)
|
|
|
|
// subjectMatches reports whether the NATS subject pattern covers the concrete
// subject, using NATS token-wise matching on "."-separated tokens:
//
//   - "*" matches exactly one (non-empty) token;
//   - ">" matches one or more trailing tokens and is only honoured as the final
//     token of the pattern;
//   - any other token must be equal to the subject token at the same position.
//
// It lets these tests assert that a permission allow-list (which uses wildcards)
// actually covers a concrete subject a component publishes/subscribes to.
//
// The helper deliberately errs on the side of "not covered": a subject or
// pattern containing an empty token (for example "a..b", "a." or ""), or a
// pattern with ">" anywhere but last, never matches. Treating such malformed
// input as covered would let a broken allow-list or a badly built subject pass
// the coverage specs.
func subjectMatches(pattern, subject string) bool {
	patToks := strings.Split(pattern, ".")
	subToks := strings.Split(subject, ".")

	// A concrete subject with an empty token is malformed, so no wildcard may
	// be allowed to absorb it.
	for _, tok := range subToks {
		if tok == "" {
			return false
		}
	}

	for i, tok := range patToks {
		switch {
		case tok == "":
			// Malformed pattern token.
			return false
		case tok == ">":
			// ">" must be the last pattern token and must consume at least
			// one remaining subject token.
			return i == len(patToks)-1 && i < len(subToks)
		case i >= len(subToks):
			// Pattern is longer than the subject.
			return false
		case tok != "*" && tok != subToks[i]:
			// Literal token mismatch.
			return false
		}
	}

	// Without a trailing ">", both sides must have the same number of tokens.
	return len(patToks) == len(subToks)
}
|
|
|
|
func anyAllows(allow []string, subject string) bool {
|
|
for _, p := range allow {
|
|
if subjectMatches(p, subject) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
var _ = Describe("WorkerPermissions subject coverage", func() {
|
|
// A node ID containing NATS-reserved characters exercises the (duplicated)
|
|
// sanitizer in pkg/natsauth against the canonical one in core/services/messaging.
|
|
// If the two ever diverge, the minted prefix stops matching the real subject
|
|
// and these assertions fail — guarding the copy noted in the review.
|
|
const nodeID = "host.a 1*b"
|
|
|
|
Context("backend worker", func() {
|
|
pub, sub := natsauth.WorkerPermissions(nodeID, "backend")
|
|
|
|
// Every subject core/services/worker/{lifecycle,file_staging}.go subscribes to.
|
|
subscribed := []string{
|
|
messaging.SubjectNodeBackendInstall(nodeID),
|
|
messaging.SubjectNodeBackendUpgrade(nodeID),
|
|
messaging.SubjectNodeBackendStop(nodeID),
|
|
messaging.SubjectNodeBackendDelete(nodeID),
|
|
messaging.SubjectNodeBackendList(nodeID),
|
|
messaging.SubjectNodeModelUnload(nodeID),
|
|
messaging.SubjectNodeModelDelete(nodeID),
|
|
messaging.SubjectNodeStop(nodeID),
|
|
messaging.SubjectNodeFilesEnsure(nodeID),
|
|
messaging.SubjectNodeFilesStage(nodeID),
|
|
messaging.SubjectNodeFilesTemp(nodeID),
|
|
messaging.SubjectNodeFilesListDir(nodeID),
|
|
}
|
|
for _, subject := range subscribed {
|
|
It("allows subscribing to "+subject, func() {
|
|
Expect(anyAllows(sub, subject)).To(BeTrue(),
|
|
"backend JWT sub allow-list %v does not cover %s", sub, subject)
|
|
})
|
|
}
|
|
|
|
It("allows publishing backend.install progress", func() {
|
|
subject := messaging.SubjectNodeBackendInstallProgress(nodeID, "op-123")
|
|
Expect(anyAllows(pub, subject)).To(BeTrue(),
|
|
"backend JWT pub allow-list %v does not cover %s", pub, subject)
|
|
})
|
|
})
|
|
|
|
Context("agent worker", func() {
|
|
// node_type "agent"; subjects from core/cli/agent_worker.go.
|
|
pub, sub := natsauth.WorkerPermissions(nodeID, "agent")
|
|
_ = pub
|
|
|
|
subscribed := []string{
|
|
messaging.SubjectAgentExecute, // dispatcher (default --agent-subject)
|
|
messaging.SubjectMCPToolExecute, // QueueSubscribeReply
|
|
messaging.SubjectMCPDiscovery, // QueueSubscribeReply
|
|
messaging.SubjectMCPCIJobsNew, // QueueSubscribe — jobs.mcp-ci.new
|
|
messaging.SubjectNodeBackendStop(nodeID), // Subscribe — MCP session cleanup
|
|
}
|
|
for _, subject := range subscribed {
|
|
It("allows subscribing to "+subject, func() {
|
|
Expect(anyAllows(sub, subject)).To(BeTrue(),
|
|
"agent JWT sub allow-list %v does not cover %s — the agent worker subscribes to it", sub, subject)
|
|
})
|
|
}
|
|
})
|
|
})
|
|
|
|
// allowPubRe matches an `--allow-pub` flag followed by a double-quoted value;
// submatch 1 is the text between the quotes (no embedded quotes allowed).
var allowPubRe = regexp.MustCompile(`--allow-pub "([^"]*)"`)
|
|
|
|
var _ = Describe("Documented NATS service-user permissions", func() {
|
|
// scripts/nats-auth-setup.sh ships the recommended service (frontend) JWT
|
|
// permissions. They must cover every subject the frontend actually publishes,
|
|
// or prefix-cache sync (and friends) break once LOCALAI_NATS_REQUIRE_AUTH is on.
|
|
const scriptPath = "../../scripts/nats-auth-setup.sh"
|
|
|
|
// Representative subjects the frontend publishes on the control plane.
|
|
// prefixcache.* is emitted by prefixcache.Sync in core/application/distributed.go.
|
|
frontendPublishes := []string{
|
|
messaging.SubjectPrefixCacheObserve,
|
|
messaging.SubjectPrefixCacheInvalidate,
|
|
messaging.SubjectNodeBackendInstall("node-1"),
|
|
messaging.SubjectGalleryProgress("op-1"),
|
|
}
|
|
|
|
It("cover every subject the frontend publishes", func() {
|
|
raw, err := os.ReadFile(scriptPath)
|
|
Expect(err).ToNot(HaveOccurred(), "cannot read %s", scriptPath)
|
|
m := allowPubRe.FindStringSubmatch(string(raw))
|
|
Expect(m).To(HaveLen(2), "no --allow-pub list found in %s", scriptPath)
|
|
allow := strings.Split(m[1], ",")
|
|
|
|
for _, subject := range frontendPublishes {
|
|
Expect(anyAllows(allow, subject)).To(BeTrue(),
|
|
"service-user --allow-pub %v does not cover %s (frontend publishes it)", allow, subject)
|
|
}
|
|
})
|
|
})
|