Files
LocalAI/core/services/nodes/health_test.go
LocalAI [bot] b4fdb41dcc fix(distributed): cascade-clean stale node_models rows + filter routing by healthy status (#9754)
* fix(distributed): cascade-clean stale node_models on drain and filter routing by healthy status

Stale node_models rows (state="loaded") were surviving past the healthy
state of their owning node, causing /embeddings (and other inference
paths) to dispatch to a backend whose process was gone or drained. The
downstream symptom in a live cluster was pgvector rejecting inserts
with "vector cannot have more than 16000 dimensions (SQLSTATE 54000)"
because the misbehaving backend silently returned a malformed
(oversized) tensor; the Models page showed the model as "running"
without an associated node, like a stale entry, even though the node
was no longer visible in the Nodes view.

Two changes here, plus a third in a follow-up commit:

- MarkDraining now cascade-deletes node_models rows for the affected
  node, mirroring MarkOffline. Drains are explicit operator actions —
  the box has been intentionally taken out of rotation — so clearing
  the rows stops the Models UI from misreporting and prevents the
  routing layer from picking those rows if scheduling logic is ever
  relaxed. In-flight requests already hold their gRPC client through
  Route() and finish normally; the only observable effect is a
  non-fatal IncrementInFlight warning, acceptable for a drain.

  MarkUnhealthy is deliberately left status-only: it fires from
  managers_distributed / reconciler on a single nats.ErrNoResponders
  with no retry, so a transient NATS hiccup must not nuke every loaded
  model and force a full reload on recovery.

- FindAndLockNodeWithModel's inner JOIN now filters on
  backend_nodes.status = healthy in addition to node_models.state =
  loaded. The previous version relied on the second node-fetch step to
  reject non-healthy nodes, but a concurrent reader could still pick
  the same stale row in the same window. Belt-and-braces.

- DistributedConfig.PerModelHealthCheck renamed to
  DisablePerModelHealthCheck and inverted at the call site so
  per-model gRPC probing is on by default. The probe (now made
  consecutive-miss aware in a follow-up commit) independently health-
  checks each model's gRPC address and removes stale node_models rows
  when the backend has crashed even though the worker's node-level
  heartbeat is still arriving.

  Migration: the field had no CLI flag, env var binding, or YAML key
  in tree (only the bare struct field), so there is no user-facing
  migration. Anything constructing DistributedConfig in code needs to
  drop the assignment (default now does the right thing) or invert it.

Assisted-by: Claude:claude-opus-4-7 go-vet go-test golangci-lint
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(distributed): require consecutive misses before per-model probe removes a row

The per-model gRPC probe used to remove a node_models row on a single
failed health check. With the per-model probe now on by default, that
made any 5-second gRPC blip (network jitter, a long-running request
hogging the worker's gRPC server thread, brief GC pause) trigger a
full reload of the affected model — too eager for production.

Require perModelMissThreshold (3) consecutive failed probes before
removal. At the default 15s tick a model must be unreachable for ~45s
before reap; a single successful probe in between resets the streak.
Per-(node, model, replica) state tracked under a mutex on the monitor.

If the removal call itself fails, the miss counter is left in place
so the next tick retries rather than starting the streak over.

Tests:
- removes stale model via per-model health check after consecutive
  failures (replaces the single-shot expectation)
- preserves model row when an intermittent failure is followed by a
  success (covers the reset-on-success path and verifies the counter
  reset by failing twice more without crossing threshold)
- newTestHealthMonitor initializes the misses map so direct-construct
  test helpers don't nil-map-panic in the probe path

Assisted-by: Claude:claude-opus-4-7 go-vet go-test golangci-lint
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2026-05-13 21:57:50 +02:00

320 lines
12 KiB
Go

package nodes
import (
"context"
"fmt"
"runtime"
"time"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/mudler/LocalAI/core/services/testutil"
"gorm.io/gorm"
)
// Database-backed specs for HealthMonitor.doCheckAll. Each spec registers a
// node through a real NodeRegistry, adjusts its row directly via gorm where
// needed, runs one check pass and reads the node back to inspect its status.
var _ = Describe("HealthMonitor", func() {
	var (
		db       *gorm.DB
		registry *NodeRegistry
		hm       *HealthMonitor
	)

	ctx := context.Background()

	BeforeEach(func() {
		if runtime.GOOS == "darwin" {
			Skip("testcontainers requires Docker, not available on macOS CI")
		}

		db = testutil.SetupTestDB()

		reg, err := NewNodeRegistry(db)
		Expect(err).ToNot(HaveOccurred())
		registry = reg

		// The monitor gets a 30-second stale threshold. The db argument is
		// nil on purpose so the advisory lock path is not taken (no
		// distributed mode in tests).
		hm = NewHealthMonitor(registry, nil, 15*time.Second, 30*time.Second, "", false)
	})

	// newWorker builds an unregistered backend node whose available VRAM
	// equals its total VRAM.
	newWorker := func(name, address string, vram uint64) *BackendNode {
		worker := &BackendNode{
			Name:     name,
			NodeType: NodeTypeBackend,
			Address:  address,
		}
		worker.TotalVRAM = vram
		worker.AvailableVRAM = vram
		return worker
	}

	// registerWorker creates an 8 GB worker and registers it.
	registerWorker := func(name, address string) *BackendNode {
		worker := newWorker(name, address, 8_000_000_000)
		Expect(registry.Register(ctx, worker, true)).To(Succeed())
		return worker
	}

	// setHeartbeat overwrites the node's last_heartbeat column directly.
	setHeartbeat := func(worker *BackendNode, at time.Time) {
		Expect(db.Model(&BackendNode{}).Where("id = ?", worker.ID).
			Update("last_heartbeat", at).Error).ToNot(HaveOccurred())
	}

	// expectStatus re-reads the node from the registry and checks its status.
	expectStatus := func(worker *BackendNode, want any) {
		current, err := registry.Get(ctx, worker.ID)
		Expect(err).ToNot(HaveOccurred())
		Expect(current.Status).To(Equal(want))
	}

	Describe("doCheckAll", func() {
		It("marks stale node offline", func() {
			worker := registerWorker("stale-worker", "10.0.0.1:50051")
			Expect(worker.Status).To(Equal(StatusHealthy))

			// Two minutes old is far past the 30s threshold.
			setHeartbeat(worker, time.Now().Add(-2*time.Minute))

			hm.doCheckAll(ctx)

			expectStatus(worker, StatusOffline)
		})

		It("skips draining nodes", func() {
			worker := registerWorker("draining-worker", "10.0.0.2:50051")

			// Flip the row to draining.
			Expect(db.Model(&BackendNode{}).Where("id = ?", worker.ID).
				Update("status", StatusDraining).Error).ToNot(HaveOccurred())

			// Then let its heartbeat go stale as well.
			setHeartbeat(worker, time.Now().Add(-2*time.Minute))

			hm.doCheckAll(ctx)

			expectStatus(worker, StatusDraining)
		})

		It("skips idle nodes with no loaded models", func() {
			worker := registerWorker("idle-worker", "10.0.0.3:50051")

			// Freshly registered, so the heartbeat is recent, and nothing is
			// loaded: the pass is expected to leave the status untouched
			// (no gRPC check attempted).
			hm.doCheckAll(ctx)

			expectStatus(worker, StatusHealthy)
		})

		It("recovers unhealthy node when heartbeat is fresh", func() {
			worker := registerWorker("unhealthy-worker", "10.0.0.5:50051")
			Expect(worker.Status).To(Equal(StatusHealthy))

			// Force the node into the unhealthy state first.
			Expect(registry.MarkUnhealthy(ctx, worker.ID)).To(Succeed())
			expectStatus(worker, StatusUnhealthy)

			// Stand in for the worker sending a new heartbeat.
			setHeartbeat(worker, time.Now())

			hm.doCheckAll(ctx)

			expectStatus(worker, StatusHealthy)
		})

		It("does not change healthy nodes with fresh heartbeat", func() {
			worker := registerWorker("fresh-worker", "10.0.0.4:50051")

			// Stamp the heartbeat with the current time so it cannot be stale.
			setHeartbeat(worker, time.Now())

			hm.doCheckAll(ctx)

			expectStatus(worker, StatusHealthy)
		})
	})
})
// --- Mock-based tests (no DB required) ---
var _ = Describe("HealthMonitor (mock-based)", func() {
	// Every spec below builds its own fake store and fake client factory, so
	// no state is shared between specs. Assertions inspect both the node
	// status held by the fake store and the list of calls it recorded.
	const staleThreshold = 30 * time.Second

	Describe("doCheckAll", func() {
		It("marks stale node offline when autoOffline=true", func() {
			store := newFakeNodeHealthStore()
			factory := newFakeBackendClientFactory()
			hm := newTestHealthMonitor(store, factory, true, staleThreshold)

			node := makeTestNode("node-1", "stale-worker", "10.0.0.1:50051", StatusHealthy, staleTime(staleThreshold))
			store.addNode(node)

			hm.doCheckAll(context.Background())

			Expect(store.getNode("node-1").Status).To(Equal(StatusOffline))
			Expect(store.getCalls()).To(ContainElement("MarkOffline:node-1"))
		})

		It("marks stale node unhealthy when autoOffline=false", func() {
			store := newFakeNodeHealthStore()
			factory := newFakeBackendClientFactory()
			hm := newTestHealthMonitor(store, factory, false, staleThreshold)

			node := makeTestNode("node-2", "stale-worker-2", "10.0.0.2:50051", StatusHealthy, staleTime(staleThreshold))
			store.addNode(node)

			hm.doCheckAll(context.Background())

			Expect(store.getNode("node-2").Status).To(Equal(StatusUnhealthy))
			Expect(store.getCalls()).To(ContainElement("MarkUnhealthy:node-2"))
		})

		It("skips draining nodes", func() {
			store := newFakeNodeHealthStore()
			factory := newFakeBackendClientFactory()
			hm := newTestHealthMonitor(store, factory, true, staleThreshold)

			node := makeTestNode("node-3", "draining-worker", "10.0.0.3:50051", StatusDraining, staleTime(staleThreshold))
			store.addNode(node)

			hm.doCheckAll(context.Background())

			// Should remain draining -- no MarkOffline or MarkUnhealthy
			Expect(store.getNode("node-3").Status).To(Equal(StatusDraining))
			calls := store.getCalls()
			Expect(calls).NotTo(ContainElement(ContainSubstring("MarkOffline")))
			Expect(calls).NotTo(ContainElement(ContainSubstring("MarkUnhealthy")))
		})

		It("skips idle nodes with no models", func() {
			store := newFakeNodeHealthStore()
			factory := newFakeBackendClientFactory()
			hm := newTestHealthMonitor(store, factory, true, staleThreshold)

			node := makeTestNode("node-4", "idle-worker", "10.0.0.4:50051", StatusHealthy, freshTime())
			store.addNode(node)
			// No models added for this node

			hm.doCheckAll(context.Background())

			// Should remain healthy -- no gRPC check attempted
			Expect(store.getNode("node-4").Status).To(Equal(StatusHealthy))
			calls := store.getCalls()
			Expect(calls).NotTo(ContainElement(ContainSubstring("MarkUnhealthy")))
			Expect(calls).NotTo(ContainElement(ContainSubstring("MarkOffline")))
		})

		It("keeps node healthy when heartbeat is fresh (with models loaded)", func() {
			store := newFakeNodeHealthStore()
			factory := newFakeBackendClientFactory()
			hm := newTestHealthMonitor(store, factory, true, staleThreshold)

			node := makeTestNode("node-5", "active-worker", "10.0.0.5:50051", StatusHealthy, freshTime())
			store.addNode(node)
			store.addNodeModel("node-5", NodeModel{NodeID: "node-5", ModelName: "llama-7b"})

			// No gRPC client needed — health is determined by heartbeat, not gRPC probe
			hm.doCheckAll(context.Background())

			Expect(store.getNode("node-5").Status).To(Equal(StatusHealthy))
			calls := store.getCalls()
			Expect(calls).NotTo(ContainElement(ContainSubstring("MarkUnhealthy")))
		})

		It("recovers unhealthy node when heartbeat is fresh", func() {
			store := newFakeNodeHealthStore()
			factory := newFakeBackendClientFactory()
			hm := newTestHealthMonitor(store, factory, true, staleThreshold)

			node := makeTestNode("node-6", "recovering-worker", "10.0.0.6:50051", StatusUnhealthy, freshTime())
			store.addNode(node)

			hm.doCheckAll(context.Background())

			Expect(store.getCalls()).To(ContainElement("MarkHealthy:node-6"))
			Expect(store.getNode("node-6").Status).To(Equal(StatusHealthy))
		})

		It("node stays healthy when gRPC backend crashes but heartbeat is fresh", func() {
			store := newFakeNodeHealthStore()
			factory := newFakeBackendClientFactory()
			hm := newTestHealthMonitor(store, factory, true, staleThreshold)

			// Worker has a model loaded but the backend process crashed —
			// node should remain healthy because heartbeat is fresh
			node := makeTestNode("node-crash", "crash-worker", "10.0.0.9:50051", StatusHealthy, freshTime())
			store.addNode(node)
			store.addNodeModel("node-crash", NodeModel{NodeID: "node-crash", ModelName: "piper-model", Address: "10.0.0.9:50053"})

			// gRPC backend is dead — but health is heartbeat-based, not gRPC-based
			factory.setClient("10.0.0.9:50051", &fakeBackendClient{healthy: false, err: fmt.Errorf("connection refused")})

			hm.doCheckAll(context.Background())

			Expect(store.getNode("node-crash").Status).To(Equal(StatusHealthy))
			calls := store.getCalls()
			Expect(calls).NotTo(ContainElement(ContainSubstring("MarkUnhealthy")))
		})

		It("removes stale model via per-model health check after consecutive failures", func() {
			store := newFakeNodeHealthStore()
			factory := newFakeBackendClientFactory()
			hm := newTestHealthMonitor(store, factory, true, staleThreshold)
			hm.perModelHealthCheck = true

			node := makeTestNode("node-model", "model-worker", "10.0.0.10:50051", StatusHealthy, freshTime())
			store.addNode(node)
			store.addNodeModel("node-model", NodeModel{NodeID: "node-model", ModelName: "piper-model", Address: "10.0.0.10:50053"})

			// Model backend is dead
			factory.setClient("10.0.0.10:50053", &fakeBackendClient{healthy: false, err: fmt.Errorf("connection refused")})

			// First (perModelMissThreshold-1) probes must NOT remove the row —
			// a single failure could be a transient blip.
			for i := 0; i < perModelMissThreshold-1; i++ {
				hm.doCheckAll(context.Background())
				Expect(store.getCalls()).NotTo(ContainElement(ContainSubstring("RemoveNodeModel")),
					"removed too early at miss %d", i+1)
			}

			// Threshold-th consecutive miss triggers removal.
			hm.doCheckAll(context.Background())

			// Node should remain healthy — only the specific replica record is removed.
			Expect(store.getNode("node-model").Status).To(Equal(StatusHealthy))
			Expect(store.getCalls()).To(ContainElement("RemoveNodeModel:node-model:piper-model:0"))
			Expect(store.getCalls()).NotTo(ContainElement(ContainSubstring("MarkUnhealthy")))
		})

		It("preserves model row when an intermittent failure is followed by a success", func() {
			store := newFakeNodeHealthStore()
			factory := newFakeBackendClientFactory()
			hm := newTestHealthMonitor(store, factory, true, staleThreshold)
			hm.perModelHealthCheck = true

			node := makeTestNode("node-flap", "flap-worker", "10.0.0.11:50051", StatusHealthy, freshTime())
			store.addNode(node)
			store.addNodeModel("node-flap", NodeModel{NodeID: "node-flap", ModelName: "piper-model", Address: "10.0.0.11:50053"})

			deadClient := &fakeBackendClient{healthy: false, err: fmt.Errorf("connection refused")}
			liveClient := &fakeBackendClient{healthy: true}

			// missJustBelowThreshold points the model address at the dead
			// client and runs one probe fewer than the removal threshold.
			// The streak length is derived from perModelMissThreshold, like
			// the consecutive-failures spec, so this spec keeps testing the
			// boundary if the constant is retuned.
			missJustBelowThreshold := func() {
				factory.setClient("10.0.0.11:50053", deadClient)
				for i := 0; i < perModelMissThreshold-1; i++ {
					hm.doCheckAll(context.Background())
				}
			}

			// A streak just short of the threshold, then a recovery — should
			// NOT remove the row, and should reset the miss counter so an
			// equally long second streak doesn't tip it over.
			missJustBelowThreshold()
			factory.setClient("10.0.0.11:50053", liveClient)
			hm.doCheckAll(context.Background())
			Expect(store.getCalls()).NotTo(ContainElement(ContainSubstring("RemoveNodeModel")))

			// Counter is reset; another below-threshold streak must not be
			// enough to remove.
			missJustBelowThreshold()
			Expect(store.getCalls()).NotTo(ContainElement(ContainSubstring("RemoveNodeModel")))
		})
	})
})