feat(distributed): add configurable NATS backend install/upgrade timeouts

Adds BackendInstallTimeout and BackendUpgradeTimeout to DistributedConfig
with 15m defaults, following the existing MCPToolTimeout / WorkerWaitTimeout
pattern. These will replace the hardcoded literals in RemoteUnloaderAdapter
so admin-driven backend installs across the cluster survive long OCI image
pulls that previously timed out at 3m.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2026-05-22 19:39:54 +00:00
parent 61bf34ea2f
commit 0e2b84d8e3
2 changed files with 65 additions and 0 deletions

View File

@@ -42,6 +42,9 @@ type DistributedConfig struct {
MCPCIJobTimeout time.Duration // MCP CI job execution timeout (default 10m)
BackendInstallTimeout time.Duration // NATS round-trip timeout for backend.install (default 15m)
BackendUpgradeTimeout time.Duration // NATS round-trip timeout for backend.upgrade (default 15m)
MaxUploadSize int64 // Maximum upload body size in bytes (default 50 GB)
AgentWorkerConcurrency int `yaml:"agent_worker_concurrency" json:"agent_worker_concurrency" env:"LOCALAI_AGENT_WORKER_CONCURRENCY"`
@@ -75,6 +78,8 @@ func (c DistributedConfig) Validate() error {
"health-check-interval": c.HealthCheckInterval,
"stale-node-threshold": c.StaleNodeThreshold,
"mcp-ci-job-timeout": c.MCPCIJobTimeout,
"backend-install-timeout": c.BackendInstallTimeout,
"backend-upgrade-timeout": c.BackendUpgradeTimeout,
} {
if d < 0 {
return fmt.Errorf("%s must not be negative", name)
@@ -137,6 +142,18 @@ func WithStorageSecretKey(key string) AppOption {
}
}
// WithBackendInstallTimeout returns an AppOption that stores d in
// Distributed.BackendInstallTimeout of the ApplicationConfig it is applied to.
// A zero d leaves the field unset, in which case
// BackendInstallTimeoutOrDefault falls back to DefaultBackendInstallTimeout.
func WithBackendInstallTimeout(d time.Duration) AppOption {
	apply := func(cfg *ApplicationConfig) {
		cfg.Distributed.BackendInstallTimeout = d
	}
	return apply
}
// WithBackendUpgradeTimeout returns an AppOption that stores d in
// Distributed.BackendUpgradeTimeout of the ApplicationConfig it is applied to.
// A zero d leaves the field unset, in which case
// BackendUpgradeTimeoutOrDefault falls back to DefaultBackendUpgradeTimeout.
func WithBackendUpgradeTimeout(d time.Duration) AppOption {
	apply := func(cfg *ApplicationConfig) {
		cfg.Distributed.BackendUpgradeTimeout = d
	}
	return apply
}
// EnableAutoApproveNodes is an option function that switches on
// Distributed.AutoApproveNodes for the ApplicationConfig it receives.
var EnableAutoApproveNodes = func(cfg *ApplicationConfig) {
	cfg.Distributed.AutoApproveNodes = true
}
@@ -150,11 +167,23 @@ const (
DefaultHealthCheckInterval = 15 * time.Second
DefaultStaleNodeThreshold = 60 * time.Second
DefaultMCPCIJobTimeout = 10 * time.Minute
DefaultBackendInstallTimeout = 15 * time.Minute
DefaultBackendUpgradeTimeout = 15 * time.Minute
)
// DefaultMaxUploadSize caps the upload body size, in bytes, when no explicit
// limit is configured: 50 GiB-sized units of 1<<30 bytes each (documented
// elsewhere in this file as "50 GB").
const DefaultMaxUploadSize int64 = 50 * (1 << 30)
// BackendInstallTimeoutOrDefault resolves the timeout to use for
// backend.install requests. A non-zero BackendInstallTimeout is returned
// as-is; the zero value is treated as "unset" and yields
// DefaultBackendInstallTimeout.
func (c DistributedConfig) BackendInstallTimeoutOrDefault() time.Duration {
	if configured := c.BackendInstallTimeout; configured != 0 {
		return configured
	}
	return DefaultBackendInstallTimeout
}
// BackendUpgradeTimeoutOrDefault resolves the timeout to use for
// backend.upgrade requests. A non-zero BackendUpgradeTimeout is returned
// as-is; the zero value is treated as "unset" and yields
// DefaultBackendUpgradeTimeout.
func (c DistributedConfig) BackendUpgradeTimeoutOrDefault() time.Duration {
	if configured := c.BackendUpgradeTimeout; configured != 0 {
		return configured
	}
	return DefaultBackendUpgradeTimeout
}
// MCPToolTimeoutOrDefault returns the configured timeout or the default.
func (c DistributedConfig) MCPToolTimeoutOrDefault() time.Duration {
return cmp.Or(c.MCPToolTimeout, DefaultMCPToolTimeout)

View File

@@ -0,0 +1,36 @@
package config_test
import (
"time"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/mudler/LocalAI/core/config"
)
// Specs for the backend install/upgrade timeout accessors on
// config.DistributedConfig. Each accessor is checked twice: once on a
// zero-valued config (expecting the 15 minute default) and once with the
// corresponding field set explicitly (expecting that value back unchanged).
var _ = Describe("DistributedConfig backend NATS timeouts", func() {
	Context("BackendInstallTimeoutOrDefault", func() {
		It("returns 15 minutes when unset", func() {
			var unset config.DistributedConfig
			got := unset.BackendInstallTimeoutOrDefault()
			Expect(got).To(Equal(15 * time.Minute))
		})

		It("returns the configured value when set", func() {
			want := 42 * time.Minute
			cfg := config.DistributedConfig{BackendInstallTimeout: want}
			Expect(cfg.BackendInstallTimeoutOrDefault()).To(Equal(want))
		})
	})

	Context("BackendUpgradeTimeoutOrDefault", func() {
		It("returns 15 minutes when unset", func() {
			var unset config.DistributedConfig
			got := unset.BackendUpgradeTimeoutOrDefault()
			Expect(got).To(Equal(15 * time.Minute))
		})

		It("returns the configured value when set", func() {
			want := 30 * time.Minute
			cfg := config.DistributedConfig{BackendUpgradeTimeout: want}
			Expect(cfg.BackendUpgradeTimeoutOrDefault()).To(Equal(want))
		})
	})
})