mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-16 12:49:08 -04:00
* fix(distributed): cascade-clean stale node_models on drain and filter routing by healthy status Stale node_models rows (state="loaded") were surviving past the healthy state of their owning node, causing /embeddings (and other inference paths) to dispatch to a backend whose process was gone or drained. The downstream symptom in a live cluster was pgvector rejecting inserts with "vector cannot have more than 16000 dimensions (SQLSTATE 54000)" because the misbehaving backend silently returned a malformed (oversized) tensor; the Models page showed the model as "running" without an associated node, like a stale entry, even though the node was no longer visible in the Nodes view. Two changes here, plus a third in a follow-up commit: - MarkDraining now cascade-deletes node_models rows for the affected node, mirroring MarkOffline. Drains are explicit operator actions — the box has been intentionally taken out of rotation — so clearing the rows stops the Models UI from misreporting and prevents the routing layer from picking those rows if scheduling logic is ever relaxed. In-flight requests already hold their gRPC client through Route() and finish normally; the only observable effect is a non-fatal IncrementInFlight warning, acceptable for a drain. MarkUnhealthy is deliberately left status-only: it fires from managers_distributed / reconciler on a single nats.ErrNoResponders with no retry, so a transient NATS hiccup must not nuke every loaded model and force a full reload on recovery. - FindAndLockNodeWithModel's inner JOIN now filters on backend_nodes.status = healthy in addition to node_models.state = loaded. The previous version relied on the second node-fetch step to reject non-healthy nodes, but a concurrent reader could still pick the same stale row in the same window. Belt-and-braces. - DistributedConfig.PerModelHealthCheck renamed to DisablePerModelHealthCheck and inverted at the call site so per-model gRPC probing is on by default. 
The probe (now made consecutive-miss aware in a follow-up commit) independently health-checks each model's gRPC address and removes stale node_models rows when the backend has crashed even though the worker's node-level heartbeat is still arriving. Migration: the field had no CLI flag, env var binding, or YAML key in tree (only the bare struct field), so there is no user-facing migration. Anything constructing DistributedConfig in code needs to drop the assignment (default now does the right thing) or invert it. Assisted-by: Claude:claude-opus-4-7 go-vet go-test golangci-lint Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(distributed): require consecutive misses before per-model probe removes a row The per-model gRPC probe used to remove a node_models row on a single failed health check. With the per-model probe now on by default, that made any 5-second gRPC blip (network jitter, a long-running request hogging the worker's gRPC server thread, brief GC pause) trigger a full reload of the affected model — too eager for production. Require perModelMissThreshold (3) consecutive failed probes before removal. At the default 15s tick a model must be unreachable for ~45s before reap; a single successful probe in between resets the streak. Per-(node, model, replica) state is tracked under a mutex on the monitor. If the removal call itself fails, the miss counter is left in place so the next tick retries rather than starting the streak over. 
Tests: - removes stale model via per-model health check after consecutive failures (replaces the single-shot expectation) - preserves model row when an intermittent failure is followed by a success (covers the reset-on-success path and verifies the counter reset by failing twice more without crossing threshold) - newTestHealthMonitor initializes the misses map so direct-construct test helpers don't nil-map-panic in the probe path Assisted-by: Claude:claude-opus-4-7 go-vet go-test golangci-lint Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
230 lines
8.0 KiB
Go
230 lines
8.0 KiB
Go
package nodes
|
|
|
|
import (
|
|
"cmp"
|
|
"context"
|
|
"io"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/mudler/LocalAI/core/services/advisorylock"
|
|
"github.com/mudler/xlog"
|
|
"gorm.io/gorm"
|
|
)
|
|
|
|
// perModelMissThreshold is how many back-to-back failed gRPC probes a model's
// backend must accumulate before its row is dropped from the registry.
//
// One-off failures are frequently just noise — a network blip, a short GC
// pause on the worker, or a long request monopolising the gRPC server
// thread — so a streak is required rather than a single miss. With the
// default 15s check interval a model therefore has to stay unreachable for
// roughly 45s before it is reaped.
const perModelMissThreshold = 3
|
|
|
|
// modelKey names one (node, model, replica) combination. Miss streaks are
// kept per combination rather than per model name, since a single node may
// host several replicas of the same model.
type modelKey struct {
	NodeID, ModelName string
	ReplicaIndex      int
}
|
|
|
|
// HealthMonitor periodically checks the health of registered backend nodes.
|
|
type HealthMonitor struct {
|
|
registry NodeHealthStore
|
|
db *gorm.DB // if non-nil, use advisory lock so only one frontend runs checks
|
|
checkInterval time.Duration
|
|
staleThreshold time.Duration
|
|
autoOffline bool // mark stale nodes as offline (preserves approval status)
|
|
clientFactory BackendClientFactory // creates gRPC backend clients
|
|
perModelHealthCheck bool // check each model's backend process individually
|
|
missesMu sync.Mutex
|
|
misses map[modelKey]int // consecutive failed-probe counts; reset on success or model removal
|
|
cancel context.CancelFunc
|
|
cancelMu sync.Mutex
|
|
}
|
|
|
|
// NewHealthMonitor creates a new HealthMonitor.
|
|
// If db is non-nil (PostgreSQL), an advisory lock is used so that only one
|
|
// frontend instance runs health checks at a time in distributed mode.
|
|
// If clientFactory is nil, a default factory using the given authToken is used.
|
|
func NewHealthMonitor(registry NodeHealthStore, db *gorm.DB, checkInterval, staleThreshold time.Duration, authToken string, perModelHealthCheck bool, clientFactory ...BackendClientFactory) *HealthMonitor {
|
|
checkInterval = cmp.Or(checkInterval, 15*time.Second)
|
|
staleThreshold = cmp.Or(staleThreshold, 60*time.Second)
|
|
var factory BackendClientFactory
|
|
if len(clientFactory) > 0 && clientFactory[0] != nil {
|
|
factory = clientFactory[0]
|
|
} else {
|
|
factory = &tokenClientFactory{token: authToken}
|
|
}
|
|
return &HealthMonitor{
|
|
registry: registry,
|
|
db: db,
|
|
checkInterval: checkInterval,
|
|
staleThreshold: staleThreshold,
|
|
autoOffline: true,
|
|
clientFactory: factory,
|
|
perModelHealthCheck: perModelHealthCheck,
|
|
misses: make(map[modelKey]int),
|
|
}
|
|
}
|
|
|
|
// Start begins the health monitoring loop in a background goroutine.
|
|
// If a previous instance is running, it is stopped first.
|
|
func (hm *HealthMonitor) Start(ctx context.Context) {
|
|
hm.cancelMu.Lock()
|
|
if hm.cancel != nil {
|
|
hm.cancel() // stop previous instance
|
|
}
|
|
ctx, hm.cancel = context.WithCancel(ctx)
|
|
hm.cancelMu.Unlock()
|
|
go hm.run(ctx)
|
|
}
|
|
|
|
// Stop stops the health monitoring loop.
|
|
func (hm *HealthMonitor) Stop() {
|
|
hm.cancelMu.Lock()
|
|
defer hm.cancelMu.Unlock()
|
|
if hm.cancel != nil {
|
|
hm.cancel()
|
|
hm.cancel = nil
|
|
}
|
|
}
|
|
|
|
func (hm *HealthMonitor) run(ctx context.Context) {
|
|
ticker := time.NewTicker(hm.checkInterval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
hm.checkAll(ctx)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (hm *HealthMonitor) checkAll(ctx context.Context) {
|
|
// In distributed mode, use an advisory lock so only one frontend runs checks
|
|
if hm.db != nil {
|
|
acquired, err := advisorylock.TryWithLockCtx(ctx, hm.db, advisorylock.KeyHealthCheck, func() error {
|
|
hm.doCheckAll(ctx)
|
|
return nil
|
|
})
|
|
if err != nil {
|
|
xlog.Error("Health monitor advisory lock error", "error", err)
|
|
}
|
|
_ = acquired
|
|
return
|
|
}
|
|
|
|
hm.doCheckAll(ctx)
|
|
}
|
|
|
|
// doCheckAll performs the actual health check logic for all nodes.
|
|
// Node liveness is determined by heartbeat freshness — both backend and agent
|
|
// workers send periodic HTTP heartbeats to the frontend, so a stale heartbeat
|
|
// means the worker supervisor is down. This is simpler and more reliable than
|
|
// probing individual gRPC backend processes (which can crash independently).
|
|
//
|
|
// Per-model health checks (opt-in) separately probe each model's gRPC address
|
|
// and remove stale model records without affecting the node's overall status.
|
|
func (hm *HealthMonitor) doCheckAll(ctx context.Context) {
|
|
nodes, err := hm.registry.List(ctx)
|
|
if err != nil {
|
|
xlog.Error("Health monitor: failed to list nodes", "error", err)
|
|
return
|
|
}
|
|
|
|
for _, node := range nodes {
|
|
if node.Status == StatusDraining {
|
|
continue
|
|
}
|
|
|
|
// Node liveness: heartbeat staleness check.
|
|
// Workers (both backend and agent) send HTTP heartbeats to the frontend.
|
|
// If the heartbeat is stale, the worker is presumed down.
|
|
if time.Since(node.LastHeartbeat) > hm.staleThreshold {
|
|
// Skip nodes already marked offline/unhealthy — re-marking them
|
|
// every cycle floods the log with the same WARN+INFO pair for
|
|
// nodes the operator has intentionally taken down.
|
|
if node.Status == StatusOffline || node.Status == StatusUnhealthy {
|
|
continue
|
|
}
|
|
xlog.Warn("Node heartbeat stale", "node", node.Name, "lastHeartbeat", node.LastHeartbeat)
|
|
if hm.autoOffline {
|
|
xlog.Info("Marking stale node offline", "node", node.Name)
|
|
if err := hm.registry.MarkOffline(ctx, node.ID); err != nil {
|
|
xlog.Error("Failed to mark stale node offline", "node", node.Name, "error", err)
|
|
}
|
|
} else {
|
|
hm.registry.MarkUnhealthy(ctx, node.ID)
|
|
}
|
|
continue
|
|
}
|
|
|
|
// Heartbeat is fresh — node is alive
|
|
if node.Status == StatusUnhealthy || node.Status == StatusOffline {
|
|
xlog.Info("Node recovered", "node", node.Name)
|
|
if err := hm.registry.MarkHealthy(ctx, node.ID); err != nil {
|
|
xlog.Error("Failed to mark node healthy", "node", node.Name, "error", err)
|
|
}
|
|
}
|
|
|
|
// Per-model backend health check: probe each model's gRPC address and
|
|
// remove stale model records. This does NOT affect the node's status —
|
|
// a crashed backend process is a model-level issue, not a node-level
|
|
// one. A model is only removed after perModelMissThreshold consecutive
|
|
// failed probes so a single network/GC blip doesn't force a reload.
|
|
if hm.perModelHealthCheck {
|
|
models, _ := hm.registry.GetNodeModels(ctx, node.ID)
|
|
for _, m := range models {
|
|
if m.Address == "" || m.Address == node.Address {
|
|
continue
|
|
}
|
|
mClient := hm.clientFactory.NewClient(m.Address, false)
|
|
mCheckCtx, mCancel := context.WithTimeout(ctx, 5*time.Second)
|
|
ok, _ := mClient.HealthCheck(mCheckCtx)
|
|
mCancel()
|
|
if closer, ok := mClient.(io.Closer); ok {
|
|
closer.Close()
|
|
}
|
|
|
|
key := modelKey{NodeID: node.ID, ModelName: m.ModelName, ReplicaIndex: m.ReplicaIndex}
|
|
hm.missesMu.Lock()
|
|
if ok {
|
|
// Probe succeeded — wipe any previous miss streak.
|
|
delete(hm.misses, key)
|
|
hm.missesMu.Unlock()
|
|
continue
|
|
}
|
|
hm.misses[key]++
|
|
misses := hm.misses[key]
|
|
hm.missesMu.Unlock()
|
|
|
|
if misses < perModelMissThreshold {
|
|
xlog.Debug("Model backend probe failed, awaiting threshold before removal",
|
|
"node", node.ID, "model", m.ModelName, "replica", m.ReplicaIndex,
|
|
"address", m.Address, "misses", misses, "threshold", perModelMissThreshold)
|
|
continue
|
|
}
|
|
xlog.Warn("Model backend unhealthy after consecutive misses, removing from registry",
|
|
"node", node.ID, "model", m.ModelName, "replica", m.ReplicaIndex,
|
|
"address", m.Address, "misses", misses)
|
|
if err := hm.registry.RemoveNodeModel(ctx, node.ID, m.ModelName, m.ReplicaIndex); err != nil {
|
|
xlog.Warn("Failed to remove unhealthy model from registry",
|
|
"node", node.ID, "model", m.ModelName, "replica", m.ReplicaIndex, "error", err)
|
|
// Leave the miss counter in place so the next tick retries
|
|
// the removal rather than starting the streak over.
|
|
continue
|
|
}
|
|
hm.missesMu.Lock()
|
|
delete(hm.misses, key)
|
|
hm.missesMu.Unlock()
|
|
}
|
|
}
|
|
}
|
|
}
|