Files
LocalAI/core/services/nodes/reconciler_test.go
LocalAI [bot] 7637f8cf1b feat(distributed): declarative per-model scheduling via env/args (#10308)
* feat(distributed): add SpreadAll column and authoritative scheduling seeding

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(distributed): parse declarative model scheduling config (env/file)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(distributed): reconcile spread_all to one replica per matching node

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(distributed): wire LOCALAI_MODEL_SCHEDULING env/args and startup seeding

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(distributed): expose spread_all on the scheduling API endpoint

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(distributed): add spread-to-all-nodes mode to the scheduling UI

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* docs(distributed): document LOCALAI_MODEL_SCHEDULING env/args

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* docs(distributed): clarify replica modes and all-nodes spread in scheduling config

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-13 18:31:06 +02:00

896 lines
34 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package nodes
import (
"context"
"errors"
"runtime"
"time"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/mudler/LocalAI/core/services/nodes/prefixcache"
"github.com/mudler/LocalAI/core/services/testutil"
"gorm.io/gorm"
)
// ---------------------------------------------------------------------------
// Fake ModelScheduler
// ---------------------------------------------------------------------------
type fakeScheduler struct {
scheduleNode *BackendNode
scheduleErr error
scheduleCalls []scheduleCall
}
type scheduleCall struct {
modelName string
candidateIDs []string
}
func (f *fakeScheduler) ScheduleAndLoadModel(_ context.Context, modelName string, candidateNodeIDs []string) (*BackendNode, error) {
f.scheduleCalls = append(f.scheduleCalls, scheduleCall{modelName, candidateNodeIDs})
return f.scheduleNode, f.scheduleErr
}
func mustGetSched(r *NodeRegistry, model string) ModelSchedulingConfig {
cfg, err := r.GetModelScheduling(context.Background(), model)
Expect(err).ToNot(HaveOccurred())
Expect(cfg).ToNot(BeNil())
return *cfg
}
var _ = Describe("ReplicaReconciler", func() {
var (
db *gorm.DB
registry *NodeRegistry
)
BeforeEach(func() {
if runtime.GOOS == "darwin" {
Skip("testcontainers requires Docker, not available on macOS CI")
}
db = testutil.SetupTestDB()
var err error
registry, err = NewNodeRegistry(db)
Expect(err).ToNot(HaveOccurred())
})
// Helper to register a healthy node with enough replica capacity for
// most tests. Pre-PR4 the reconciler ignored capacity, so existing
// fixtures didn't bother setting MaxReplicasPerModel — bumping the
// default here keeps the test intent ("scale up enough") working under
// the new capacity-aware logic. Tests that specifically exercise the
// circuit breaker should register nodes with a tighter cap.
registerNode := func(name, address string) *BackendNode {
node := &BackendNode{
Name: name,
NodeType: NodeTypeBackend,
Address: address,
MaxReplicasPerModel: 4,
}
Expect(registry.Register(context.Background(), node, true)).To(Succeed())
return node
}
// Helper to set up a scheduling config.
setSchedulingConfig := func(modelName string, minReplicas, maxReplicas int, nodeSelector string) {
cfg := &ModelSchedulingConfig{
ModelName: modelName,
MinReplicas: minReplicas,
MaxReplicas: maxReplicas,
NodeSelector: nodeSelector,
}
Expect(registry.SetModelScheduling(context.Background(), cfg)).To(Succeed())
}
Context("spread_all mode", func() {
It("targets one replica per matching node (empty selector = all nodes)", func() {
n1 := registerNode("s1", "10.1.0.1:50051")
registerNode("s2", "10.1.0.2:50051")
// spread config, no selector -> all healthy backend nodes (2)
Expect(registry.SetModelScheduling(context.Background(), &ModelSchedulingConfig{
ModelName: "spread-model", SpreadAll: true,
})).To(Succeed())
scheduler := &fakeScheduler{scheduleNode: n1}
reconciler := NewReplicaReconciler(ReplicaReconcilerOptions{
Registry: registry,
Scheduler: scheduler,
})
reconciler.reconcileModel(context.Background(), mustGetSched(registry, "spread-model"))
// With current==0 and a target of 2, the MinReplicas floor path
// schedules up to cluster capacity (2 nodes).
Expect(len(scheduler.scheduleCalls)).To(Equal(2))
})
It("is a no-op when no nodes match", func() {
Expect(registry.SetModelScheduling(context.Background(), &ModelSchedulingConfig{
ModelName: "spread-model", SpreadAll: true,
NodeSelector: `{"tier":"nope"}`,
})).To(Succeed())
scheduler := &fakeScheduler{}
reconciler := NewReplicaReconciler(ReplicaReconcilerOptions{
Registry: registry,
Scheduler: scheduler,
})
reconciler.reconcileModel(context.Background(), mustGetSched(registry, "spread-model"))
Expect(scheduler.scheduleCalls).To(BeEmpty())
})
})
Context("model below min_replicas", func() {
It("scales up to min_replicas", func() {
node := registerNode("node-1", "10.0.0.1:50051")
setSchedulingConfig("model-a", 2, 4, "")
scheduler := &fakeScheduler{
scheduleNode: node,
}
reconciler := NewReplicaReconciler(ReplicaReconcilerOptions{
Registry: registry,
Scheduler: scheduler,
DB: db,
})
// No replicas loaded — should schedule 2
reconciler.reconcile(context.Background())
Expect(scheduler.scheduleCalls).To(HaveLen(2))
Expect(scheduler.scheduleCalls[0].modelName).To(Equal("model-a"))
Expect(scheduler.scheduleCalls[1].modelName).To(Equal("model-a"))
})
})
Context("all replicas busy and below max_replicas", func() {
It("scales up by 1", func() {
node := registerNode("node-busy", "10.0.0.2:50051")
setSchedulingConfig("model-b", 1, 4, "")
// Load 2 replicas, both busy (in_flight > 0)
Expect(registry.SetNodeModel(context.Background(), node.ID, "model-b", 0, "loaded", "addr1", 0)).To(Succeed())
Expect(registry.IncrementInFlight(context.Background(), node.ID, "model-b", 0)).To(Succeed())
node2 := registerNode("node-busy-2", "10.0.0.3:50051")
Expect(registry.SetNodeModel(context.Background(), node2.ID, "model-b", 0, "loaded", "addr2", 0)).To(Succeed())
Expect(registry.IncrementInFlight(context.Background(), node2.ID, "model-b", 0)).To(Succeed())
scheduler := &fakeScheduler{
scheduleNode: node,
}
reconciler := NewReplicaReconciler(ReplicaReconcilerOptions{
Registry: registry,
Scheduler: scheduler,
DB: db,
})
reconciler.reconcile(context.Background())
Expect(scheduler.scheduleCalls).To(HaveLen(1))
Expect(scheduler.scheduleCalls[0].modelName).To(Equal("model-b"))
})
})
Context("all replicas busy and at max_replicas", func() {
It("does not scale up", func() {
node := registerNode("node-max", "10.0.0.4:50051")
setSchedulingConfig("model-c", 1, 2, "")
// Load 2 replicas (at max), both busy
Expect(registry.SetNodeModel(context.Background(), node.ID, "model-c", 0, "loaded", "addr1", 0)).To(Succeed())
Expect(registry.IncrementInFlight(context.Background(), node.ID, "model-c", 0)).To(Succeed())
node2 := registerNode("node-max-2", "10.0.0.5:50051")
Expect(registry.SetNodeModel(context.Background(), node2.ID, "model-c", 0, "loaded", "addr2", 0)).To(Succeed())
Expect(registry.IncrementInFlight(context.Background(), node2.ID, "model-c", 0)).To(Succeed())
scheduler := &fakeScheduler{
scheduleNode: node,
}
reconciler := NewReplicaReconciler(ReplicaReconcilerOptions{
Registry: registry,
Scheduler: scheduler,
DB: db,
})
reconciler.reconcile(context.Background())
Expect(scheduler.scheduleCalls).To(BeEmpty())
})
})
Context("idle replicas above min_replicas", func() {
It("scales down after idle delay", func() {
node1 := registerNode("node-idle-1", "10.0.0.6:50051")
node2 := registerNode("node-idle-2", "10.0.0.7:50051")
node3 := registerNode("node-idle-3", "10.0.0.8:50051")
setSchedulingConfig("model-d", 1, 4, "")
// Load 3 replicas, all idle with last_used in the past
pastTime := time.Now().Add(-10 * time.Minute)
for _, n := range []*BackendNode{node1, node2, node3} {
Expect(registry.SetNodeModel(context.Background(), n.ID, "model-d", 0, "loaded", "", 0)).To(Succeed())
// Set last_used to past time to trigger scale-down
db.Model(&NodeModel{}).Where("node_id = ? AND model_name = ?", n.ID, "model-d").
Update("last_used", pastTime)
}
unloader := &fakeUnloader{}
reconciler := NewReplicaReconciler(ReplicaReconcilerOptions{
Registry: registry,
Unloader: unloader,
DB: db,
ScaleDownDelay: 1 * time.Minute, // short delay for test
})
reconciler.reconcile(context.Background())
// Should scale down 2 replicas (3 - floor of 1)
Expect(unloader.unloadCalls).To(HaveLen(2))
})
})
Context("idle replicas at min_replicas", func() {
It("does not scale down", func() {
node1 := registerNode("node-keep-1", "10.0.0.9:50051")
node2 := registerNode("node-keep-2", "10.0.0.10:50051")
setSchedulingConfig("model-e", 2, 4, "")
// Load exactly 2 replicas (at min), both idle with past last_used
pastTime := time.Now().Add(-10 * time.Minute)
for _, n := range []*BackendNode{node1, node2} {
Expect(registry.SetNodeModel(context.Background(), n.ID, "model-e", 0, "loaded", "", 0)).To(Succeed())
db.Model(&NodeModel{}).Where("node_id = ? AND model_name = ?", n.ID, "model-e").
Update("last_used", pastTime)
}
unloader := &fakeUnloader{}
reconciler := NewReplicaReconciler(ReplicaReconcilerOptions{
Registry: registry,
Unloader: unloader,
DB: db,
ScaleDownDelay: 1 * time.Minute,
})
reconciler.reconcile(context.Background())
Expect(unloader.unloadCalls).To(BeEmpty())
})
})
Context("model with node_selector", func() {
It("passes candidate node IDs to scheduler", func() {
node1 := registerNode("gpu-node", "10.0.0.11:50051")
node2 := registerNode("cpu-node", "10.0.0.12:50051")
// Add labels — only node1 matches the selector
Expect(registry.SetNodeLabel(context.Background(), node1.ID, "gpu.vendor", "nvidia")).To(Succeed())
Expect(registry.SetNodeLabel(context.Background(), node2.ID, "gpu.vendor", "none")).To(Succeed())
setSchedulingConfig("model-f", 1, 2, `{"gpu.vendor":"nvidia"}`)
scheduler := &fakeScheduler{
scheduleNode: node1,
}
reconciler := NewReplicaReconciler(ReplicaReconcilerOptions{
Registry: registry,
Scheduler: scheduler,
DB: db,
})
// No replicas loaded — should schedule 1 with candidate node IDs
reconciler.reconcile(context.Background())
Expect(scheduler.scheduleCalls).To(HaveLen(1))
Expect(scheduler.scheduleCalls[0].modelName).To(Equal("model-f"))
Expect(scheduler.scheduleCalls[0].candidateIDs).To(ContainElement(node1.ID))
Expect(scheduler.scheduleCalls[0].candidateIDs).ToNot(ContainElement(node2.ID))
})
})
Describe("Forced-disturb pressure autoscale (Phase 6)", func() {
It("scales up when pressure exceeds threshold, replicas<max, and capacity exists", func() {
// One node with spare slots, one loaded idle replica (so the
// all-busy path does not fire). Pressure for the model is above the
// threshold, which is the only reason to scale here.
node := registerNode("pressure-node", "10.0.0.60:50051")
Expect(registry.SetNodeModel(context.Background(), node.ID, "pressure-model", 0, "loaded", "addr1", 0)).To(Succeed())
setSchedulingConfig("pressure-model", 1, 4, "")
pressure := prefixcache.NewPressure(time.Minute)
pressure.Record("pressure-model", time.Now())
scheduler := &fakeScheduler{scheduleNode: node}
reconciler := NewReplicaReconciler(ReplicaReconcilerOptions{
Registry: registry,
Scheduler: scheduler,
DB: db,
Pressure: pressure,
})
reconciler.reconcile(context.Background())
Expect(scheduler.scheduleCalls).To(HaveLen(1),
"forced-disturb pressure above threshold must trigger a scale-up")
Expect(scheduler.scheduleCalls[0].modelName).To(Equal("pressure-model"))
})
It("does not scale up on pressure when already at max_replicas", func() {
// Two nodes, both loaded (idle), MaxReplicas=2 → at max. Pressure is
// high but MaxReplicas must never be overridden.
node1 := registerNode("pmax-1", "10.0.0.61:50051")
node2 := registerNode("pmax-2", "10.0.0.62:50051")
Expect(registry.SetNodeModel(context.Background(), node1.ID, "pmax-model", 0, "loaded", "addr1", 0)).To(Succeed())
Expect(registry.SetNodeModel(context.Background(), node2.ID, "pmax-model", 0, "loaded", "addr2", 0)).To(Succeed())
setSchedulingConfig("pmax-model", 1, 2, "")
pressure := prefixcache.NewPressure(time.Minute)
pressure.Record("pmax-model", time.Now())
pressure.Record("pmax-model", time.Now())
scheduler := &fakeScheduler{scheduleNode: node1}
reconciler := NewReplicaReconciler(ReplicaReconcilerOptions{
Registry: registry,
Scheduler: scheduler,
DB: db,
Pressure: pressure,
})
reconciler.reconcile(context.Background())
Expect(scheduler.scheduleCalls).To(BeEmpty(),
"pressure must never override MaxReplicas")
})
It("consumes the pressure signal so a single burst scales up only once", func() {
// A single burst of forced-disturbs (well within the window) must
// trigger exactly ONE pressure scale-up. A subsequent tick, with the
// SAME events still in-window, must NOT scale again: the first
// scale-up consumed (Reset) the signal. Without the fix, the
// non-draining Count keeps returning >= threshold every tick and
// drives the model toward MaxReplicas off a single burst.
node := registerNode("consume-node", "10.0.0.64:50051")
Expect(registry.SetNodeModel(context.Background(), node.ID, "consume-model", 0, "loaded", "addr1", 0)).To(Succeed())
setSchedulingConfig("consume-model", 1, 4, "")
pressure := prefixcache.NewPressure(time.Minute)
now := time.Now()
pressure.Record("consume-model", now)
pressure.Record("consume-model", now)
pressure.Record("consume-model", now)
scheduler := &fakeScheduler{scheduleNode: node}
reconciler := NewReplicaReconciler(ReplicaReconcilerOptions{
Registry: registry,
Scheduler: scheduler,
DB: db,
Pressure: pressure,
})
// First tick: pressure above threshold → one scale-up.
reconciler.reconcile(context.Background())
Expect(scheduler.scheduleCalls).To(HaveLen(1),
"first tick must scale up once on the burst")
// Second tick: the burst's events are still inside the window, but
// the first scale-up Reset them, so no further scale-up occurs.
reconciler.reconcile(context.Background())
Expect(scheduler.scheduleCalls).To(HaveLen(1),
"a single burst must not re-trigger scale-up on the next in-window tick")
})
It("does not consume the pressure signal when scaleUp fails", func() {
// Pressure above threshold and capacity exists, but the scheduler
// errors so no replica is actually added. The forced-disturb signal
// must be preserved (NOT Reset) so the next tick retries the
// scale-up off the same accumulated pressure, instead of having to
// re-accumulate a full window of forced-disturbs from scratch.
node := registerNode("fail-node", "10.0.0.66:50051")
Expect(registry.SetNodeModel(context.Background(), node.ID, "fail-model", 0, "loaded", "addr1", 0)).To(Succeed())
setSchedulingConfig("fail-model", 1, 4, "")
pressure := prefixcache.NewPressure(time.Minute)
now := time.Now()
pressure.Record("fail-model", now)
pressure.Record("fail-model", now)
pressure.Record("fail-model", now)
Expect(pressure.Count("fail-model", time.Now())).To(BeNumerically(">=", 1))
// Scheduler errors: scaleUp attempts but adds nothing.
scheduler := &fakeScheduler{scheduleErr: errors.New("schedule boom")}
reconciler := NewReplicaReconciler(ReplicaReconcilerOptions{
Registry: registry,
Scheduler: scheduler,
DB: db,
Pressure: pressure,
})
reconciler.reconcile(context.Background())
Expect(scheduler.scheduleCalls).To(HaveLen(1),
"scaleUp must have attempted exactly one schedule call")
Expect(pressure.Count("fail-model", time.Now())).To(BeNumerically(">=", 1),
"a failed scaleUp must NOT consume (Reset) the pressure signal — next tick should retry")
})
It("consumes the pressure signal only when scaleUp succeeds", func() {
// Mirror of the failure case: when the scheduler succeeds and a
// replica is actually added, the forced-disturb signal IS consumed
// (Reset to 0) so a single burst scales up only once.
node := registerNode("ok-node", "10.0.0.67:50051")
Expect(registry.SetNodeModel(context.Background(), node.ID, "ok-model", 0, "loaded", "addr1", 0)).To(Succeed())
setSchedulingConfig("ok-model", 1, 4, "")
pressure := prefixcache.NewPressure(time.Minute)
now := time.Now()
pressure.Record("ok-model", now)
pressure.Record("ok-model", now)
pressure.Record("ok-model", now)
scheduler := &fakeScheduler{scheduleNode: node}
reconciler := NewReplicaReconciler(ReplicaReconcilerOptions{
Registry: registry,
Scheduler: scheduler,
DB: db,
Pressure: pressure,
})
reconciler.reconcile(context.Background())
Expect(scheduler.scheduleCalls).To(HaveLen(1),
"successful scaleUp must have scheduled one replica")
Expect(pressure.Count("ok-model", time.Now())).To(Equal(0),
"a successful scaleUp must consume (Reset) the pressure signal to 0")
})
It("performs at most one scale-up per tick when both busy and over pressure", func() {
// The single loaded replica is busy (all-replicas-busy fires) AND
// pressure is above threshold. Both scale-up paths are eligible in
// the same tick. The invariant is at-most-one scaleUp(+1) per tick,
// so exactly one schedule call must happen, not two.
node := registerNode("dual-node", "10.0.0.65:50051")
Expect(registry.SetNodeModel(context.Background(), node.ID, "dual-model", 0, "loaded", "addr1", 0)).To(Succeed())
Expect(registry.IncrementInFlight(context.Background(), node.ID, "dual-model", 0)).To(Succeed())
setSchedulingConfig("dual-model", 1, 4, "")
pressure := prefixcache.NewPressure(time.Minute)
pressure.Record("dual-model", time.Now())
pressure.Record("dual-model", time.Now())
scheduler := &fakeScheduler{scheduleNode: node}
reconciler := NewReplicaReconciler(ReplicaReconcilerOptions{
Registry: registry,
Scheduler: scheduler,
DB: db,
Pressure: pressure,
})
reconciler.reconcile(context.Background())
Expect(scheduler.scheduleCalls).To(HaveLen(1),
"busy + pressure in one tick must still scale up by exactly one, not two")
})
It("does not spin when pressure is high but no capacity exists", func() {
// Single node, cap 1, already loaded → capacity 0. Pressure is high
// but there is nowhere to place a replica: must not call scheduler.
registerCappedNodeFn := func(name, address string, cap int) *BackendNode {
node := &BackendNode{
Name: name,
NodeType: NodeTypeBackend,
Address: address,
MaxReplicasPerModel: cap,
}
Expect(registry.Register(context.Background(), node, true)).To(Succeed())
return node
}
node := registerCappedNodeFn("pcap-node", "10.0.0.63:50051", 1)
Expect(registry.SetNodeModel(context.Background(), node.ID, "pcap-model", 0, "loaded", "addr1", 0)).To(Succeed())
// MaxReplicas high enough that replicas<max, so only capacity guards it.
setSchedulingConfig("pcap-model", 1, 4, "")
pressure := prefixcache.NewPressure(time.Minute)
pressure.Record("pcap-model", time.Now())
scheduler := &fakeScheduler{scheduleNode: node}
reconciler := NewReplicaReconciler(ReplicaReconcilerOptions{
Registry: registry,
Scheduler: scheduler,
DB: db,
Pressure: pressure,
})
reconciler.reconcile(context.Background())
Expect(scheduler.scheduleCalls).To(BeEmpty(),
"no capacity means no scale-up: must not spin the scheduler")
})
})
Describe("Capacity gating + circuit breaker (PR4)", func() {
// Helper: register a node with an explicit per-model replica cap.
// Tests in this Describe block want to exercise both "fits" and
// "doesn't fit" capacity scenarios precisely.
registerCappedNode := func(name, address string, cap int) *BackendNode {
node := &BackendNode{
Name: name,
NodeType: NodeTypeBackend,
Address: address,
MaxReplicasPerModel: cap,
}
Expect(registry.Register(context.Background(), node, true)).To(Succeed())
return node
}
It("caps scale-up at cluster capacity instead of looping forever", func() {
// 1 node × 1 slot = capacity 1, but MinReplicas=2.
// Pre-PR4 this looped: every 30s "scaling up to meet minimum"
// because the registry never grew to 2. Post-PR4 the reconciler
// does the math up front and only schedules 1 (the achievable
// target), then flags unsatisfiable on the next ticks.
node := registerCappedNode("cap-1-slot", "10.0.0.40:50051", 1)
Expect(registry.SetModelScheduling(context.Background(), &ModelSchedulingConfig{
ModelName: "tight-model",
MinReplicas: 2,
})).To(Succeed())
scheduler := &fakeScheduler{scheduleNode: node}
reconciler := NewReplicaReconciler(ReplicaReconcilerOptions{
Registry: registry,
Scheduler: scheduler,
DB: db,
})
reconciler.reconcile(context.Background())
Expect(scheduler.scheduleCalls).To(HaveLen(1),
"only 1 schedule call: capacity is 1, not the requested 2 — must not loop")
})
It("flags unsatisfiable after threshold consecutive ticks at capacity 0", func() {
// 1 node × 1 slot, already loaded. Capacity=0, but MinReplicas=2.
// Each tick increments UnsatisfiableTicks; once we cross the
// threshold the cooldown timestamp is set and further ticks
// short-circuit (the scheduler is no longer called).
node := registerCappedNode("cb-node", "10.0.0.41:50051", 1)
Expect(registry.SetNodeModel(context.Background(), node.ID, "cb-model", 0, "loaded", "addr1", 0)).To(Succeed())
Expect(registry.SetModelScheduling(context.Background(), &ModelSchedulingConfig{
ModelName: "cb-model",
MinReplicas: 2,
})).To(Succeed())
scheduler := &fakeScheduler{scheduleNode: node}
reconciler := NewReplicaReconciler(ReplicaReconcilerOptions{
Registry: registry,
Scheduler: scheduler,
DB: db,
})
// Drive enough ticks to cross the threshold, plus a couple more
// to confirm the cooldown holds.
for i := 0; i < unsatisfiableTickThreshold+2; i++ {
reconciler.reconcile(context.Background())
}
cfg, err := registry.GetModelScheduling(context.Background(), "cb-model")
Expect(err).ToNot(HaveOccurred())
Expect(cfg.UnsatisfiableUntil).ToNot(BeNil(),
"reconciler must flag the config after threshold ticks of capacity exhaustion")
Expect(cfg.UnsatisfiableUntil.After(time.Now())).To(BeTrue(),
"cooldown must point to the future")
// Capacity 0 + cooldown active means the scheduler shouldn't have
// been invoked at all — capacity was 0 from the first tick.
Expect(scheduler.scheduleCalls).To(BeEmpty(),
"capacity was always 0 — no schedule attempts should have been made")
})
It("clears unsatisfiable on a successful scale-up", func() {
// Pre-flag the config (simulate a prior unsatisfiable run), then
// register enough capacity and tick — the reconciler must clear
// the flag and proceed.
node := registerCappedNode("clear-node", "10.0.0.42:50051", 4)
until := time.Now().Add(-1 * time.Second) // already-expired cooldown
Expect(registry.SetModelScheduling(context.Background(), &ModelSchedulingConfig{
ModelName: "clear-model",
MinReplicas: 1,
UnsatisfiableTicks: 5,
UnsatisfiableUntil: &until,
})).To(Succeed())
scheduler := &fakeScheduler{scheduleNode: node}
reconciler := NewReplicaReconciler(ReplicaReconcilerOptions{
Registry: registry,
Scheduler: scheduler,
DB: db,
})
reconciler.reconcile(context.Background())
Expect(scheduler.scheduleCalls).To(HaveLen(1),
"expired cooldown should not block scheduling")
cfg, err := registry.GetModelScheduling(context.Background(), "clear-model")
Expect(err).ToNot(HaveOccurred())
Expect(cfg.UnsatisfiableUntil).To(BeNil(), "successful scale-up must clear the cooldown")
Expect(cfg.UnsatisfiableTicks).To(Equal(0), "successful scale-up must reset the counter")
})
It("recovers when a new node joins (ClearAllUnsatisfiable on Register)", func() {
// One full node, then config flagged unsatisfiable. Adding a
// second node simulates the user's recovery question: capacity
// returns, cooldown clears, the next tick schedules.
node1 := registerCappedNode("rec-node-1", "10.0.0.43:50051", 1)
Expect(registry.SetNodeModel(context.Background(), node1.ID, "rec-model", 0, "loaded", "addr1", 0)).To(Succeed())
until := time.Now().Add(unsatisfiableCooldown)
Expect(registry.SetModelScheduling(context.Background(), &ModelSchedulingConfig{
ModelName: "rec-model",
MinReplicas: 2,
UnsatisfiableTicks: unsatisfiableTickThreshold,
UnsatisfiableUntil: &until,
})).To(Succeed())
// New node registers — this is the recovery event.
registerCappedNode("rec-node-2", "10.0.0.44:50051", 1)
cfg, err := registry.GetModelScheduling(context.Background(), "rec-model")
Expect(err).ToNot(HaveOccurred())
Expect(cfg.UnsatisfiableUntil).To(BeNil(),
"Register must clear unsatisfiable flags so the reconciler retries")
Expect(cfg.UnsatisfiableTicks).To(Equal(0))
})
It("recovers when node labels change (ClearAllUnsatisfiable on label ops)", func() {
node := registerCappedNode("lbl-node", "10.0.0.45:50051", 1)
until := time.Now().Add(unsatisfiableCooldown)
Expect(registry.SetModelScheduling(context.Background(), &ModelSchedulingConfig{
ModelName: "lbl-model",
MinReplicas: 2,
UnsatisfiableTicks: unsatisfiableTickThreshold,
UnsatisfiableUntil: &until,
})).To(Succeed())
// Adding a label could change which models the node matches via
// a NodeSelector, so capacity for some config may have just
// changed. ClearAllUnsatisfiable lets the reconciler re-check.
Expect(registry.SetNodeLabel(context.Background(), node.ID, "tier", "fast")).To(Succeed())
cfg, err := registry.GetModelScheduling(context.Background(), "lbl-model")
Expect(err).ToNot(HaveOccurred())
Expect(cfg.UnsatisfiableUntil).To(BeNil())
})
})
Describe("ClusterCapacityForModel", func() {
It("sums (max_replicas_per_model - replicas[node, model]) over candidates", func() {
// Three nodes with caps 4, 2, 1. Loaded counts: 1, 0, 1 → free
// slots: 3, 2, 0 → total capacity 5.
a := &BackendNode{Name: "cap-a", NodeType: NodeTypeBackend, Address: "10.0.0.50:50051", MaxReplicasPerModel: 4}
b := &BackendNode{Name: "cap-b", NodeType: NodeTypeBackend, Address: "10.0.0.51:50051", MaxReplicasPerModel: 2}
c := &BackendNode{Name: "cap-c", NodeType: NodeTypeBackend, Address: "10.0.0.52:50051", MaxReplicasPerModel: 1}
Expect(registry.Register(context.Background(), a, true)).To(Succeed())
Expect(registry.Register(context.Background(), b, true)).To(Succeed())
Expect(registry.Register(context.Background(), c, true)).To(Succeed())
Expect(registry.SetNodeModel(context.Background(), a.ID, "cap-model", 0, "loaded", "x", 0)).To(Succeed())
Expect(registry.SetNodeModel(context.Background(), c.ID, "cap-model", 0, "loaded", "y", 0)).To(Succeed())
cap, err := registry.ClusterCapacityForModel(context.Background(), "cap-model", nil)
Expect(err).ToNot(HaveOccurred())
Expect(cap).To(Equal(5))
// Restricting to {b, c}: b free=2, c free=0 → capacity 2.
cap, err = registry.ClusterCapacityForModel(context.Background(), "cap-model", []string{b.ID, c.ID})
Expect(err).ToNot(HaveOccurred())
Expect(cap).To(Equal(2))
})
})
Context("frontend-restart scenario (Bug-1)", func() {
It("recovers replicas after every NodeModel row has been removed", func() {
ctx := context.Background()
// One healthy node. UpsertModelLoadInfo records per-model metadata
// independently of any NodeModel row, mirroring what
// scheduleAndLoad does on a successful dispatch.
node := registerNode("restart-node", "10.0.20.1:50051")
setSchedulingConfig("restart-model", 2, 4, "")
Expect(registry.UpsertModelLoadInfo(ctx, "restart-model", "llama-cpp", []byte("opts-from-pre-restart"))).To(Succeed())
// Simulate the bug: between frontend instances the NodeModel rows
// are wiped (MarkOffline path, stale-heartbeat reaping). The
// per-model load info row stays because it's not tied to any
// (node, replica) slot.
Expect(registry.RemoveAllNodeModelReplicas(ctx, node.ID, "restart-model")).To(Succeed())
// Pre-fix: GetModelLoadInfo returned ErrRecordNotFound here and
// the reconciler logged "no load info" every 30s until a manual
// inference request arrived.
bt, blob, err := registry.GetModelLoadInfo(ctx, "restart-model")
Expect(err).ToNot(HaveOccurred())
Expect(bt).To(Equal("llama-cpp"))
Expect(blob).To(Equal([]byte("opts-from-pre-restart")))
// And the reconciler tick should now call into the scheduler
// for the missing replicas instead of bailing out.
scheduler := &fakeScheduler{scheduleNode: node}
reconciler := NewReplicaReconciler(ReplicaReconcilerOptions{
Registry: registry,
Scheduler: scheduler,
DB: db,
})
reconciler.reconcile(ctx)
Expect(scheduler.scheduleCalls).ToNot(BeEmpty(),
"reconciler must call the scheduler after a frontend restart that wiped NodeModel rows")
Expect(scheduler.scheduleCalls[0].modelName).To(Equal("restart-model"))
})
})
})
// fakeProber lets tests control whether a model's gRPC address "responds".
type fakeProber struct {
alive map[string]bool
calls int
}
func (f *fakeProber) IsAlive(_ context.Context, address string) bool {
f.calls++
if f.alive == nil {
return false
}
return f.alive[address]
}
var _ = Describe("ReplicaReconciler — state reconciliation", func() {
var (
db *gorm.DB
registry *NodeRegistry
)
BeforeEach(func() {
if runtime.GOOS == "darwin" {
Skip("testcontainers requires Docker, not available on macOS CI")
}
db = testutil.SetupTestDB()
var err error
registry, err = NewNodeRegistry(db)
Expect(err).ToNot(HaveOccurred())
})
Describe("probeLoadedModels", func() {
It("removes loaded models whose gRPC address is unreachable", func() {
node := &BackendNode{Name: "n1", NodeType: NodeTypeBackend, Address: "10.0.0.1:50051"}
Expect(registry.Register(context.Background(), node, true)).To(Succeed())
// Two loaded models — one stale (will probe), one fresh (skipped).
stale := &NodeModel{
ID: "stale-1",
NodeID: node.ID,
ModelName: "stale-model",
Address: "10.0.0.1:12345",
State: "loaded",
UpdatedAt: time.Now().Add(-5 * time.Minute),
}
fresh := &NodeModel{
ID: "fresh-1",
NodeID: node.ID,
ModelName: "fresh-model",
Address: "10.0.0.1:54321",
State: "loaded",
UpdatedAt: time.Now(), // within probeStaleAfter
}
Expect(db.Create(stale).Error).To(Succeed())
Expect(db.Create(fresh).Error).To(Succeed())
prober := &fakeProber{alive: map[string]bool{"10.0.0.1:12345": false}}
rc := NewReplicaReconciler(ReplicaReconcilerOptions{
Registry: registry,
DB: db,
Prober: prober,
ProbeStaleAfter: 2 * time.Minute,
})
rc.probeLoadedModels(context.Background())
// Stale was unreachable — row removed.
var after []NodeModel
Expect(db.Find(&after).Error).To(Succeed())
Expect(after).To(HaveLen(1))
Expect(after[0].ModelName).To(Equal("fresh-model"))
// Prober was only called once (the fresh row was filtered out).
Expect(prober.calls).To(Equal(1))
})
It("keeps reachable models and bumps their updated_at", func() {
node := &BackendNode{Name: "n1", NodeType: NodeTypeBackend, Address: "10.0.0.1:50051"}
Expect(registry.Register(context.Background(), node, true)).To(Succeed())
stale := &NodeModel{
ID: "stale-2",
NodeID: node.ID,
ModelName: "alive-model",
Address: "10.0.0.1:12345",
State: "loaded",
UpdatedAt: time.Now().Add(-5 * time.Minute),
}
Expect(db.Create(stale).Error).To(Succeed())
prober := &fakeProber{alive: map[string]bool{"10.0.0.1:12345": true}}
rc := NewReplicaReconciler(ReplicaReconcilerOptions{
Registry: registry,
DB: db,
Prober: prober,
ProbeStaleAfter: 2 * time.Minute,
})
rc.probeLoadedModels(context.Background())
var after NodeModel
Expect(db.First(&after, "id = ?", "stale-2").Error).To(Succeed())
Expect(after.UpdatedAt).To(BeTemporally("~", time.Now(), time.Second))
})
})
Describe("UpsertPendingBackendOp + RecordPendingBackendOpFailure", func() {
It("upserts on the composite key rather than duplicating rows", func() {
node := &BackendNode{Name: "n1", NodeType: NodeTypeBackend, Address: "10.0.0.1:50051"}
Expect(registry.Register(context.Background(), node, true)).To(Succeed())
Expect(registry.UpsertPendingBackendOp(context.Background(), node.ID, "foo", OpBackendDelete, nil)).To(Succeed())
// Second call for the same (node, backend, op) should not create a
// new row — that's how re-issuing a delete works.
Expect(registry.UpsertPendingBackendOp(context.Background(), node.ID, "foo", OpBackendDelete, nil)).To(Succeed())
var rows []PendingBackendOp
Expect(db.Find(&rows).Error).To(Succeed())
Expect(rows).To(HaveLen(1))
})
It("increments attempts and moves next_retry_at out on failure", func() {
node := &BackendNode{Name: "n1", NodeType: NodeTypeBackend, Address: "10.0.0.1:50051"}
Expect(registry.Register(context.Background(), node, true)).To(Succeed())
Expect(registry.UpsertPendingBackendOp(context.Background(), node.ID, "foo", OpBackendDelete, nil)).To(Succeed())
var row PendingBackendOp
Expect(db.First(&row).Error).To(Succeed())
before := row.NextRetryAt
Expect(registry.RecordPendingBackendOpFailure(context.Background(), row.ID, "boom")).To(Succeed())
Expect(db.First(&row, row.ID).Error).To(Succeed())
Expect(row.Attempts).To(Equal(1))
Expect(row.LastError).To(Equal("boom"))
Expect(row.NextRetryAt).To(BeTemporally(">", before))
})
})
Describe("NewNodeRegistry malformed-row pruning", func() {
It("drops queue rows for agent nodes and non-existent nodes on startup", func() {
agent := &BackendNode{Name: "agent-1", NodeType: NodeTypeAgent, Address: "x"}
Expect(registry.Register(context.Background(), agent, true)).To(Succeed())
backend := &BackendNode{Name: "backend-1", NodeType: NodeTypeBackend, Address: "y"}
Expect(registry.Register(context.Background(), backend, true)).To(Succeed())
// Three rows: one for a valid backend node (should survive),
// one for an agent node (pruned), one for an empty backend name
// on the valid node (pruned).
Expect(registry.UpsertPendingBackendOp(context.Background(), backend.ID, "foo", OpBackendInstall, nil)).To(Succeed())
Expect(registry.UpsertPendingBackendOp(context.Background(), agent.ID, "foo", OpBackendInstall, nil)).To(Succeed())
Expect(registry.UpsertPendingBackendOp(context.Background(), backend.ID, "", OpBackendInstall, nil)).To(Succeed())
// Re-instantiating the registry runs the cleanup migration.
_, err := NewNodeRegistry(db)
Expect(err).ToNot(HaveOccurred())
var rows []PendingBackendOp
Expect(db.Find(&rows).Error).To(Succeed())
Expect(rows).To(HaveLen(1))
Expect(rows[0].NodeID).To(Equal(backend.ID))
Expect(rows[0].Backend).To(Equal("foo"))
})
})
})