mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-17 04:56:52 -04:00
* fix(distributed): cascade-clean stale node_models on drain and filter routing by healthy status Stale node_models rows (state="loaded") were surviving past the healthy state of their owning node, causing /embeddings (and other inference paths) to dispatch to a backend whose process was gone or drained. The downstream symptom in a live cluster was pgvector rejecting inserts with "vector cannot have more than 16000 dimensions (SQLSTATE 54000)" because the misbehaving backend silently returned a malformed (oversized) tensor; the Models page showed the model as "running" without an associated node, like a stale entry, even though the node was no longer visible in the Nodes view. Two changes here, plus a third in a follow-up commit: - MarkDraining now cascade-deletes node_models rows for the affected node, mirroring MarkOffline. Drains are explicit operator actions — the box has been intentionally taken out of rotation — so clearing the rows stops the Models UI from misreporting and prevents the routing layer from picking those rows if scheduling logic is ever relaxed. In-flight requests already hold their gRPC client through Route() and finish normally; the only observable effect is a non-fatal IncrementInFlight warning, acceptable for a drain. MarkUnhealthy is deliberately left status-only: it fires from managers_distributed / reconciler on a single nats.ErrNoResponders with no retry, so a transient NATS hiccup must not nuke every loaded model and force a full reload on recovery. - FindAndLockNodeWithModel's inner JOIN now filters on backend_nodes.status = healthy in addition to node_models.state = loaded. The previous version relied on the second node-fetch step to reject non-healthy nodes, but a concurrent reader could still pick the same stale row in the same window. Belt-and-braces. - DistributedConfig.PerModelHealthCheck renamed to DisablePerModelHealthCheck and inverted at the call site so per-model gRPC probing is on by default. 
The probe (now made consecutive-miss aware in a follow-up commit) independently health- checks each model's gRPC address and removes stale node_models rows when the backend has crashed even though the worker's node-level heartbeat is still arriving. Migration: the field had no CLI flag, env var binding, or YAML key in tree (only the bare struct field), so there is no user-facing migration. Anything constructing DistributedConfig in code needs to drop the assignment (default now does the right thing) or invert it. Assisted-by: Claude:claude-opus-4-7 go-vet go-test golangci-lint Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(distributed): require consecutive misses before per-model probe removes a row The per-model gRPC probe used to remove a node_models row on a single failed health check. With the per-model probe now on by default, that made any 5-second gRPC blip (network jitter, a long-running request hogging the worker's gRPC server thread, brief GC pause) trigger a full reload of the affected model — too eager for production. Require perModelMissThreshold (3) consecutive failed probes before removal. At the default 15s tick a model must be unreachable for ~45s before reap; a single successful probe in between resets the streak. Per-(node, model, replica) state tracked under a mutex on the monitor. If the removal call itself fails, the miss counter is left in place so the next tick retries rather than starting the streak over. 
Tests: - removes stale model via per-model health check after consecutive failures (replaces the single-shot expectation) - preserves model row when an intermittent failure is followed by a success (covers the reset-on-success path and verifies the counter reset by failing twice more without crossing threshold) - newTestHealthMonitor initializes the misses map so direct-construct test helpers don't nil-map-panic in the probe path Assisted-by: Claude:claude-opus-4-7 go-vet go-test golangci-lint Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
197 lines
7.2 KiB
Go
197 lines
7.2 KiB
Go
package config
|
|
|
|
import (
|
|
"cmp"
|
|
"fmt"
|
|
"time"
|
|
|
|
"github.com/mudler/xlog"
|
|
)
|
|
|
|
// DistributedConfig holds configuration for horizontal scaling mode.
// When Enabled is true, PostgreSQL and NATS are required.
type DistributedConfig struct {
	Enabled           bool   // --distributed / LOCALAI_DISTRIBUTED
	InstanceID        string // --instance-id / LOCALAI_INSTANCE_ID (auto-generated UUID if empty)
	NatsURL           string // --nats-url / LOCALAI_NATS_URL
	StorageURL        string // --storage-url / LOCALAI_STORAGE_URL (S3 endpoint)
	RegistrationToken string // --registration-token / LOCALAI_REGISTRATION_TOKEN (required token for node registration)
	AutoApproveNodes  bool   // --auto-approve-nodes / LOCALAI_AUTO_APPROVE_NODES (skip admin approval for new workers)

	// S3 configuration (used when StorageURL is set)
	StorageBucket    string // --storage-bucket / LOCALAI_STORAGE_BUCKET
	StorageRegion    string // --storage-region / LOCALAI_STORAGE_REGION
	StorageAccessKey string // --storage-access-key / LOCALAI_STORAGE_ACCESS_KEY
	StorageSecretKey string // --storage-secret-key / LOCALAI_STORAGE_SECRET_KEY

	// Timeout configuration (all have sensible defaults — zero means use default)
	MCPToolTimeout      time.Duration // MCP tool execution timeout (default 360s)
	MCPDiscoveryTimeout time.Duration // MCP discovery timeout (default 60s)
	WorkerWaitTimeout   time.Duration // Max wait for healthy worker at startup (default 5m)
	DrainTimeout        time.Duration // Time to wait for in-flight requests during drain (default 30s)
	HealthCheckInterval time.Duration // Health monitor check interval (default 15s)
	StaleNodeThreshold  time.Duration // Time before a node is considered stale (default 60s)

	// DisablePerModelHealthCheck turns off the health monitor's per-model
	// gRPC probe. When enabled (the default), the monitor pings each model's
	// gRPC address and removes stale node_models rows whose backend has
	// crashed even though the worker's node-level heartbeat is still arriving.
	// Without per-model probing, /embeddings and /completions can be dispatched
	// to a backend that silently returns garbage (see also the cascading
	// model-row cleanup on MarkUnhealthy / MarkDraining).
	// Note the inverted sense: the zero value keeps probing ON.
	DisablePerModelHealthCheck bool

	MCPCIJobTimeout time.Duration // MCP CI job execution timeout (default 10m)

	MaxUploadSize int64 // Maximum upload body size in bytes (default 50 << 30, i.e. 50 GiB)

	// Worker-pool sizes. Unlike the fields above these carry yaml/json/env
	// bindings directly on the struct. Zero presumably means "use the worker
	// pool's own default" — confirm against the consuming worker code.
	AgentWorkerConcurrency int `yaml:"agent_worker_concurrency" json:"agent_worker_concurrency" env:"LOCALAI_AGENT_WORKER_CONCURRENCY"`
	JobWorkerConcurrency   int `yaml:"job_worker_concurrency" json:"job_worker_concurrency" env:"LOCALAI_JOB_WORKER_CONCURRENCY"`
}
|
|
|
|
// Validate checks that the distributed configuration is internally consistent.
|
|
// It returns nil if distributed mode is disabled.
|
|
func (c DistributedConfig) Validate() error {
|
|
if !c.Enabled {
|
|
return nil
|
|
}
|
|
if c.NatsURL == "" {
|
|
return fmt.Errorf("distributed mode requires --nats-url / LOCALAI_NATS_URL")
|
|
}
|
|
// S3 credentials must be paired
|
|
if (c.StorageAccessKey != "" && c.StorageSecretKey == "") ||
|
|
(c.StorageAccessKey == "" && c.StorageSecretKey != "") {
|
|
return fmt.Errorf("storage-access-key and storage-secret-key must both be set or both empty")
|
|
}
|
|
// Warn about missing registration token (not an error)
|
|
if c.RegistrationToken == "" {
|
|
xlog.Warn("distributed mode running without registration token — node endpoints are unprotected")
|
|
}
|
|
// Check for negative durations
|
|
for name, d := range map[string]time.Duration{
|
|
"mcp-tool-timeout": c.MCPToolTimeout,
|
|
"mcp-discovery-timeout": c.MCPDiscoveryTimeout,
|
|
"worker-wait-timeout": c.WorkerWaitTimeout,
|
|
"drain-timeout": c.DrainTimeout,
|
|
"health-check-interval": c.HealthCheckInterval,
|
|
"stale-node-threshold": c.StaleNodeThreshold,
|
|
"mcp-ci-job-timeout": c.MCPCIJobTimeout,
|
|
} {
|
|
if d < 0 {
|
|
return fmt.Errorf("%s must not be negative", name)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Distributed config options
|
|
|
|
var EnableDistributed = func(o *ApplicationConfig) {
|
|
o.Distributed.Enabled = true
|
|
}
|
|
|
|
func WithDistributedInstanceID(id string) AppOption {
|
|
return func(o *ApplicationConfig) {
|
|
o.Distributed.InstanceID = id
|
|
}
|
|
}
|
|
|
|
func WithNatsURL(url string) AppOption {
|
|
return func(o *ApplicationConfig) {
|
|
o.Distributed.NatsURL = url
|
|
}
|
|
}
|
|
|
|
func WithRegistrationToken(token string) AppOption {
|
|
return func(o *ApplicationConfig) {
|
|
o.Distributed.RegistrationToken = token
|
|
}
|
|
}
|
|
|
|
func WithStorageURL(url string) AppOption {
|
|
return func(o *ApplicationConfig) {
|
|
o.Distributed.StorageURL = url
|
|
}
|
|
}
|
|
|
|
func WithStorageBucket(bucket string) AppOption {
|
|
return func(o *ApplicationConfig) {
|
|
o.Distributed.StorageBucket = bucket
|
|
}
|
|
}
|
|
|
|
func WithStorageRegion(region string) AppOption {
|
|
return func(o *ApplicationConfig) {
|
|
o.Distributed.StorageRegion = region
|
|
}
|
|
}
|
|
|
|
func WithStorageAccessKey(key string) AppOption {
|
|
return func(o *ApplicationConfig) {
|
|
o.Distributed.StorageAccessKey = key
|
|
}
|
|
}
|
|
|
|
func WithStorageSecretKey(key string) AppOption {
|
|
return func(o *ApplicationConfig) {
|
|
o.Distributed.StorageSecretKey = key
|
|
}
|
|
}
|
|
|
|
var EnableAutoApproveNodes = func(o *ApplicationConfig) {
|
|
o.Distributed.AutoApproveNodes = true
|
|
}
|
|
|
|
// Defaults for distributed timeouts. Each constant backs the
// corresponding zero-valued DistributedConfig field via the
// *OrDefault accessors below.
const (
	DefaultMCPToolTimeout      = 360 * time.Second // MCPToolTimeout fallback
	DefaultMCPDiscoveryTimeout = 60 * time.Second  // MCPDiscoveryTimeout fallback
	DefaultWorkerWaitTimeout   = 5 * time.Minute   // WorkerWaitTimeout fallback
	DefaultDrainTimeout        = 30 * time.Second  // DrainTimeout fallback
	DefaultHealthCheckInterval = 15 * time.Second  // HealthCheckInterval fallback
	DefaultStaleNodeThreshold  = 60 * time.Second  // StaleNodeThreshold fallback
	DefaultMCPCIJobTimeout     = 10 * time.Minute  // MCPCIJobTimeout fallback
)
|
|
|
|
// DefaultMaxUploadSize is the default maximum upload body size
// (50 << 30 bytes, i.e. 50 GiB).
const DefaultMaxUploadSize int64 = 50 << 30
|
|
|
|
// MCPToolTimeoutOrDefault returns the configured timeout or the default.
|
|
func (c DistributedConfig) MCPToolTimeoutOrDefault() time.Duration {
|
|
return cmp.Or(c.MCPToolTimeout, DefaultMCPToolTimeout)
|
|
}
|
|
|
|
// MCPDiscoveryTimeoutOrDefault returns the configured timeout or the default.
|
|
func (c DistributedConfig) MCPDiscoveryTimeoutOrDefault() time.Duration {
|
|
return cmp.Or(c.MCPDiscoveryTimeout, DefaultMCPDiscoveryTimeout)
|
|
}
|
|
|
|
// WorkerWaitTimeoutOrDefault returns the configured timeout or the default.
|
|
func (c DistributedConfig) WorkerWaitTimeoutOrDefault() time.Duration {
|
|
return cmp.Or(c.WorkerWaitTimeout, DefaultWorkerWaitTimeout)
|
|
}
|
|
|
|
// DrainTimeoutOrDefault returns the configured timeout or the default.
|
|
func (c DistributedConfig) DrainTimeoutOrDefault() time.Duration {
|
|
return cmp.Or(c.DrainTimeout, DefaultDrainTimeout)
|
|
}
|
|
|
|
// HealthCheckIntervalOrDefault returns the configured interval or the default.
|
|
func (c DistributedConfig) HealthCheckIntervalOrDefault() time.Duration {
|
|
return cmp.Or(c.HealthCheckInterval, DefaultHealthCheckInterval)
|
|
}
|
|
|
|
// StaleNodeThresholdOrDefault returns the configured threshold or the default.
|
|
func (c DistributedConfig) StaleNodeThresholdOrDefault() time.Duration {
|
|
return cmp.Or(c.StaleNodeThreshold, DefaultStaleNodeThreshold)
|
|
}
|
|
|
|
// MCPCIJobTimeoutOrDefault returns the configured MCP CI job timeout or the default.
|
|
func (c DistributedConfig) MCPCIJobTimeoutOrDefault() time.Duration {
|
|
return cmp.Or(c.MCPCIJobTimeout, DefaultMCPCIJobTimeout)
|
|
}
|
|
|
|
// MaxUploadSizeOrDefault returns the configured max upload size or the default.
|
|
func (c DistributedConfig) MaxUploadSizeOrDefault() int64 {
|
|
return cmp.Or(c.MaxUploadSize, DefaultMaxUploadSize)
|
|
}
|