mirror of
https://github.com/mudler/LocalAI.git
synced 2026-07-02 20:37:03 -04:00
* fix(distributed): cascade-clean stale node_models on drain and filter routing by healthy status Stale node_models rows (state="loaded") were surviving past the healthy state of their owning node, causing /embeddings (and other inference paths) to dispatch to a backend whose process was gone or drained. The downstream symptom in a live cluster was pgvector rejecting inserts with "vector cannot have more than 16000 dimensions (SQLSTATE 54000)" because the misbehaving backend silently returned a malformed (oversized) tensor; the Models page showed the model as "running" without an associated node, like a stale entry, even though the node was no longer visible in the Nodes view. Two changes here, plus a third in a follow-up commit: - MarkDraining now cascade-deletes node_models rows for the affected node, mirroring MarkOffline. Drains are explicit operator actions — the box has been intentionally taken out of rotation — so clearing the rows stops the Models UI from misreporting and prevents the routing layer from picking those rows if scheduling logic is ever relaxed. In-flight requests already hold their gRPC client through Route() and finish normally; the only observable effect is a non-fatal IncrementInFlight warning, acceptable for a drain. MarkUnhealthy is deliberately left status-only: it fires from managers_distributed / reconciler on a single nats.ErrNoResponders with no retry, so a transient NATS hiccup must not nuke every loaded model and force a full reload on recovery. - FindAndLockNodeWithModel's inner JOIN now filters on backend_nodes.status = healthy in addition to node_models.state = loaded. The previous version relied on the second node-fetch step to reject non-healthy nodes, but a concurrent reader could still pick the same stale row in the same window. Belt-and-braces. - DistributedConfig.PerModelHealthCheck renamed to DisablePerModelHealthCheck and inverted at the call site so per-model gRPC probing is on by default. 
The probe (now made consecutive-miss aware in a follow-up commit) independently health-checks each model's gRPC address and removes stale node_models rows when the backend has crashed even though the worker's node-level heartbeat is still arriving. Migration: the field had no CLI flag, env var binding, or YAML key in tree (only the bare struct field), so there is no user-facing migration. Anything constructing DistributedConfig in code needs to drop the assignment (default now does the right thing) or invert it. Assisted-by: Claude:claude-opus-4-7 go-vet go-test golangci-lint Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(distributed): require consecutive misses before per-model probe removes a row The per-model gRPC probe used to remove a node_models row on a single failed health check. With the per-model probe now on by default, that made any 5-second gRPC blip (network jitter, a long-running request hogging the worker's gRPC server thread, brief GC pause) trigger a full reload of the affected model — too eager for production. Require perModelMissThreshold (3) consecutive failed probes before removal. At the default 15s tick a model must be unreachable for ~45s before reap; a single successful probe in between resets the streak. Per-(node, model, replica) state tracked under a mutex on the monitor. If the removal call itself fails, the miss counter is left in place so the next tick retries rather than starting the streak over. 
Tests: - removes stale model via per-model health check after consecutive failures (replaces the single-shot expectation) - preserves model row when an intermittent failure is followed by a success (covers the reset-on-success path and verifies the counter reset by failing twice more without crossing threshold) - newTestHealthMonitor initializes the misses map so direct-construct test helpers don't nil-map-panic in the probe path Assisted-by: Claude:claude-opus-4-7 go-vet go-test golangci-lint Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
320 lines
12 KiB
Go
320 lines
12 KiB
Go
package nodes
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"runtime"
|
|
"time"
|
|
|
|
. "github.com/onsi/ginkgo/v2"
|
|
. "github.com/onsi/gomega"
|
|
|
|
"github.com/mudler/LocalAI/core/services/testutil"
|
|
"gorm.io/gorm"
|
|
)
|
|
|
|
var _ = Describe("HealthMonitor", func() {
|
|
var (
|
|
db *gorm.DB
|
|
registry *NodeRegistry
|
|
hm *HealthMonitor
|
|
)
|
|
|
|
BeforeEach(func() {
|
|
if runtime.GOOS == "darwin" {
|
|
Skip("testcontainers requires Docker, not available on macOS CI")
|
|
}
|
|
db = testutil.SetupTestDB()
|
|
var err error
|
|
registry, err = NewNodeRegistry(db)
|
|
Expect(err).ToNot(HaveOccurred())
|
|
|
|
// Use a 30-second stale threshold for tests.
|
|
// Pass nil db to avoid advisory lock path (no distributed mode in tests).
|
|
hm = NewHealthMonitor(registry, nil, 15*time.Second, 30*time.Second, "", false)
|
|
})
|
|
|
|
makeNode := func(name, address string, vram uint64) *BackendNode {
|
|
return &BackendNode{
|
|
Name: name,
|
|
NodeType: NodeTypeBackend,
|
|
Address: address,
|
|
TotalVRAM: vram,
|
|
AvailableVRAM: vram,
|
|
}
|
|
}
|
|
|
|
Describe("doCheckAll", func() {
|
|
It("marks stale node offline", func() {
|
|
node := makeNode("stale-worker", "10.0.0.1:50051", 8_000_000_000)
|
|
Expect(registry.Register(context.Background(), node, true)).To(Succeed())
|
|
Expect(node.Status).To(Equal(StatusHealthy))
|
|
|
|
// Set LastHeartbeat to 2 minutes ago (well beyond 30s threshold)
|
|
staleTime := time.Now().Add(-2 * time.Minute)
|
|
Expect(db.Model(&BackendNode{}).Where("id = ?", node.ID).
|
|
Update("last_heartbeat", staleTime).Error).ToNot(HaveOccurred())
|
|
|
|
hm.doCheckAll(context.Background())
|
|
|
|
fetched, err := registry.Get(context.Background(), node.ID)
|
|
Expect(err).ToNot(HaveOccurred())
|
|
Expect(fetched.Status).To(Equal(StatusOffline))
|
|
})
|
|
|
|
It("skips draining nodes", func() {
|
|
node := makeNode("draining-worker", "10.0.0.2:50051", 8_000_000_000)
|
|
Expect(registry.Register(context.Background(), node, true)).To(Succeed())
|
|
|
|
// Set status to draining
|
|
Expect(db.Model(&BackendNode{}).Where("id = ?", node.ID).
|
|
Update("status", StatusDraining).Error).ToNot(HaveOccurred())
|
|
|
|
// Make heartbeat stale
|
|
staleTime := time.Now().Add(-2 * time.Minute)
|
|
Expect(db.Model(&BackendNode{}).Where("id = ?", node.ID).
|
|
Update("last_heartbeat", staleTime).Error).ToNot(HaveOccurred())
|
|
|
|
hm.doCheckAll(context.Background())
|
|
|
|
fetched, err := registry.Get(context.Background(), node.ID)
|
|
Expect(err).ToNot(HaveOccurred())
|
|
Expect(fetched.Status).To(Equal(StatusDraining))
|
|
})
|
|
|
|
It("skips idle nodes with no loaded models", func() {
|
|
node := makeNode("idle-worker", "10.0.0.3:50051", 8_000_000_000)
|
|
Expect(registry.Register(context.Background(), node, true)).To(Succeed())
|
|
|
|
// Heartbeat is fresh (just registered), no models loaded.
|
|
// doCheckAll should not change status (no gRPC check attempted).
|
|
hm.doCheckAll(context.Background())
|
|
|
|
fetched, err := registry.Get(context.Background(), node.ID)
|
|
Expect(err).ToNot(HaveOccurred())
|
|
Expect(fetched.Status).To(Equal(StatusHealthy))
|
|
})
|
|
|
|
It("recovers unhealthy node when heartbeat is fresh", func() {
|
|
node := makeNode("unhealthy-worker", "10.0.0.5:50051", 8_000_000_000)
|
|
Expect(registry.Register(context.Background(), node, true)).To(Succeed())
|
|
Expect(node.Status).To(Equal(StatusHealthy))
|
|
|
|
// Mark unhealthy
|
|
Expect(registry.MarkUnhealthy(context.Background(), node.ID)).To(Succeed())
|
|
fetched, err := registry.Get(context.Background(), node.ID)
|
|
Expect(err).ToNot(HaveOccurred())
|
|
Expect(fetched.Status).To(Equal(StatusUnhealthy))
|
|
|
|
// Refresh heartbeat (simulates the worker sending a heartbeat)
|
|
Expect(db.Model(&BackendNode{}).Where("id = ?", node.ID).
|
|
Update("last_heartbeat", time.Now()).Error).ToNot(HaveOccurred())
|
|
|
|
hm.doCheckAll(context.Background())
|
|
|
|
fetched, err = registry.Get(context.Background(), node.ID)
|
|
Expect(err).ToNot(HaveOccurred())
|
|
Expect(fetched.Status).To(Equal(StatusHealthy))
|
|
})
|
|
|
|
It("does not change healthy nodes with fresh heartbeat", func() {
|
|
node := makeNode("fresh-worker", "10.0.0.4:50051", 8_000_000_000)
|
|
Expect(registry.Register(context.Background(), node, true)).To(Succeed())
|
|
|
|
// Update heartbeat to now so it is definitely fresh
|
|
Expect(db.Model(&BackendNode{}).Where("id = ?", node.ID).
|
|
Update("last_heartbeat", time.Now()).Error).ToNot(HaveOccurred())
|
|
|
|
hm.doCheckAll(context.Background())
|
|
|
|
fetched, err := registry.Get(context.Background(), node.ID)
|
|
Expect(err).ToNot(HaveOccurred())
|
|
Expect(fetched.Status).To(Equal(StatusHealthy))
|
|
})
|
|
})
|
|
})
|
|
|
|
// --- Mock-based tests (no DB required) ---
|
|
|
|
var _ = Describe("HealthMonitor (mock-based)", func() {
|
|
const staleThreshold = 30 * time.Second
|
|
|
|
Describe("doCheckAll", func() {
|
|
It("marks stale node offline when autoOffline=true", func() {
|
|
store := newFakeNodeHealthStore()
|
|
factory := newFakeBackendClientFactory()
|
|
hm := newTestHealthMonitor(store, factory, true, staleThreshold)
|
|
|
|
node := makeTestNode("node-1", "stale-worker", "10.0.0.1:50051", StatusHealthy, staleTime(staleThreshold))
|
|
store.addNode(node)
|
|
|
|
hm.doCheckAll(context.Background())
|
|
|
|
Expect(store.getNode("node-1").Status).To(Equal(StatusOffline))
|
|
Expect(store.getCalls()).To(ContainElement("MarkOffline:node-1"))
|
|
})
|
|
|
|
It("marks stale node unhealthy when autoOffline=false", func() {
|
|
store := newFakeNodeHealthStore()
|
|
factory := newFakeBackendClientFactory()
|
|
hm := newTestHealthMonitor(store, factory, false, staleThreshold)
|
|
|
|
node := makeTestNode("node-2", "stale-worker-2", "10.0.0.2:50051", StatusHealthy, staleTime(staleThreshold))
|
|
store.addNode(node)
|
|
|
|
hm.doCheckAll(context.Background())
|
|
|
|
Expect(store.getNode("node-2").Status).To(Equal(StatusUnhealthy))
|
|
Expect(store.getCalls()).To(ContainElement("MarkUnhealthy:node-2"))
|
|
})
|
|
|
|
It("skips draining nodes", func() {
|
|
store := newFakeNodeHealthStore()
|
|
factory := newFakeBackendClientFactory()
|
|
hm := newTestHealthMonitor(store, factory, true, staleThreshold)
|
|
|
|
node := makeTestNode("node-3", "draining-worker", "10.0.0.3:50051", StatusDraining, staleTime(staleThreshold))
|
|
store.addNode(node)
|
|
|
|
hm.doCheckAll(context.Background())
|
|
|
|
// Should remain draining -- no MarkOffline or MarkUnhealthy
|
|
Expect(store.getNode("node-3").Status).To(Equal(StatusDraining))
|
|
calls := store.getCalls()
|
|
Expect(calls).NotTo(ContainElement(ContainSubstring("MarkOffline")))
|
|
Expect(calls).NotTo(ContainElement(ContainSubstring("MarkUnhealthy")))
|
|
})
|
|
|
|
It("skips idle nodes with no models", func() {
|
|
store := newFakeNodeHealthStore()
|
|
factory := newFakeBackendClientFactory()
|
|
hm := newTestHealthMonitor(store, factory, true, staleThreshold)
|
|
|
|
node := makeTestNode("node-4", "idle-worker", "10.0.0.4:50051", StatusHealthy, freshTime())
|
|
store.addNode(node)
|
|
// No models added for this node
|
|
|
|
hm.doCheckAll(context.Background())
|
|
|
|
// Should remain healthy -- no gRPC check attempted
|
|
Expect(store.getNode("node-4").Status).To(Equal(StatusHealthy))
|
|
calls := store.getCalls()
|
|
Expect(calls).NotTo(ContainElement(ContainSubstring("MarkUnhealthy")))
|
|
Expect(calls).NotTo(ContainElement(ContainSubstring("MarkOffline")))
|
|
})
|
|
|
|
It("keeps node healthy when heartbeat is fresh (with models loaded)", func() {
|
|
store := newFakeNodeHealthStore()
|
|
factory := newFakeBackendClientFactory()
|
|
hm := newTestHealthMonitor(store, factory, true, staleThreshold)
|
|
|
|
node := makeTestNode("node-5", "active-worker", "10.0.0.5:50051", StatusHealthy, freshTime())
|
|
store.addNode(node)
|
|
store.addNodeModel("node-5", NodeModel{NodeID: "node-5", ModelName: "llama-7b"})
|
|
|
|
// No gRPC client needed — health is determined by heartbeat, not gRPC probe
|
|
hm.doCheckAll(context.Background())
|
|
|
|
Expect(store.getNode("node-5").Status).To(Equal(StatusHealthy))
|
|
calls := store.getCalls()
|
|
Expect(calls).NotTo(ContainElement(ContainSubstring("MarkUnhealthy")))
|
|
})
|
|
|
|
It("recovers unhealthy node when heartbeat is fresh", func() {
|
|
store := newFakeNodeHealthStore()
|
|
factory := newFakeBackendClientFactory()
|
|
hm := newTestHealthMonitor(store, factory, true, staleThreshold)
|
|
|
|
node := makeTestNode("node-6", "recovering-worker", "10.0.0.6:50051", StatusUnhealthy, freshTime())
|
|
store.addNode(node)
|
|
|
|
hm.doCheckAll(context.Background())
|
|
|
|
Expect(store.getCalls()).To(ContainElement("MarkHealthy:node-6"))
|
|
Expect(store.getNode("node-6").Status).To(Equal(StatusHealthy))
|
|
})
|
|
|
|
It("node stays healthy when gRPC backend crashes but heartbeat is fresh", func() {
|
|
store := newFakeNodeHealthStore()
|
|
factory := newFakeBackendClientFactory()
|
|
hm := newTestHealthMonitor(store, factory, true, staleThreshold)
|
|
|
|
// Worker has a model loaded but the backend process crashed —
|
|
// node should remain healthy because heartbeat is fresh
|
|
node := makeTestNode("node-crash", "crash-worker", "10.0.0.9:50051", StatusHealthy, freshTime())
|
|
store.addNode(node)
|
|
store.addNodeModel("node-crash", NodeModel{NodeID: "node-crash", ModelName: "piper-model", Address: "10.0.0.9:50053"})
|
|
|
|
// gRPC backend is dead — but health is heartbeat-based, not gRPC-based
|
|
factory.setClient("10.0.0.9:50051", &fakeBackendClient{healthy: false, err: fmt.Errorf("connection refused")})
|
|
|
|
hm.doCheckAll(context.Background())
|
|
|
|
Expect(store.getNode("node-crash").Status).To(Equal(StatusHealthy))
|
|
calls := store.getCalls()
|
|
Expect(calls).NotTo(ContainElement(ContainSubstring("MarkUnhealthy")))
|
|
})
|
|
|
|
It("removes stale model via per-model health check after consecutive failures", func() {
|
|
store := newFakeNodeHealthStore()
|
|
factory := newFakeBackendClientFactory()
|
|
hm := newTestHealthMonitor(store, factory, true, staleThreshold)
|
|
hm.perModelHealthCheck = true
|
|
|
|
node := makeTestNode("node-model", "model-worker", "10.0.0.10:50051", StatusHealthy, freshTime())
|
|
store.addNode(node)
|
|
store.addNodeModel("node-model", NodeModel{NodeID: "node-model", ModelName: "piper-model", Address: "10.0.0.10:50053"})
|
|
|
|
// Model backend is dead
|
|
factory.setClient("10.0.0.10:50053", &fakeBackendClient{healthy: false, err: fmt.Errorf("connection refused")})
|
|
|
|
// First (perModelMissThreshold-1) probes must NOT remove the row —
|
|
// a single failure could be a transient blip.
|
|
for i := 0; i < perModelMissThreshold-1; i++ {
|
|
hm.doCheckAll(context.Background())
|
|
Expect(store.getCalls()).NotTo(ContainElement(ContainSubstring("RemoveNodeModel")),
|
|
"removed too early at miss %d", i+1)
|
|
}
|
|
|
|
// Threshold-th consecutive miss triggers removal.
|
|
hm.doCheckAll(context.Background())
|
|
|
|
// Node should remain healthy — only the specific replica record is removed.
|
|
Expect(store.getNode("node-model").Status).To(Equal(StatusHealthy))
|
|
Expect(store.getCalls()).To(ContainElement("RemoveNodeModel:node-model:piper-model:0"))
|
|
Expect(store.getCalls()).NotTo(ContainElement(ContainSubstring("MarkUnhealthy")))
|
|
})
|
|
|
|
It("preserves model row when an intermittent failure is followed by a success", func() {
|
|
store := newFakeNodeHealthStore()
|
|
factory := newFakeBackendClientFactory()
|
|
hm := newTestHealthMonitor(store, factory, true, staleThreshold)
|
|
hm.perModelHealthCheck = true
|
|
|
|
node := makeTestNode("node-flap", "flap-worker", "10.0.0.11:50051", StatusHealthy, freshTime())
|
|
store.addNode(node)
|
|
store.addNodeModel("node-flap", NodeModel{NodeID: "node-flap", ModelName: "piper-model", Address: "10.0.0.11:50053"})
|
|
|
|
deadClient := &fakeBackendClient{healthy: false, err: fmt.Errorf("connection refused")}
|
|
liveClient := &fakeBackendClient{healthy: true}
|
|
|
|
// Two failing probes then a recovery — should NOT remove the row,
|
|
// and should reset the miss counter so two more failures don't tip
|
|
// it over.
|
|
factory.setClient("10.0.0.11:50053", deadClient)
|
|
hm.doCheckAll(context.Background())
|
|
hm.doCheckAll(context.Background())
|
|
factory.setClient("10.0.0.11:50053", liveClient)
|
|
hm.doCheckAll(context.Background())
|
|
|
|
Expect(store.getCalls()).NotTo(ContainElement(ContainSubstring("RemoveNodeModel")))
|
|
|
|
// Counter is reset; two more failures must not be enough to remove.
|
|
factory.setClient("10.0.0.11:50053", deadClient)
|
|
hm.doCheckAll(context.Background())
|
|
hm.doCheckAll(context.Background())
|
|
Expect(store.getCalls()).NotTo(ContainElement(ContainSubstring("RemoveNodeModel")))
|
|
})
|
|
})
|
|
})
|