mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-31 12:07:45 -04:00
* refactor(distributed): extract PickBestReplica from FindAndLockNodeWithModel. Lifts the replica-selection policy (in_flight ASC, last_used ASC, available_vram DESC) out of the SQL ORDER BY into a pure Go function in the new replicapicker.go. The SQL clause keeps its FOR UPDATE atomicity and remains the production path used by SmartRouter; PickBestReplica is the canonical implementation that the future per-frontend rotating replica cache (TODO referenced from pkg/model) will call against an in-memory snapshot without paying a DB round-trip per inference. A new registry_test mirror spec seeds a multi-tier scenario and asserts both layers pick the same replica, so any future tweak to either side fails the test until the other side is updated. No behavior change. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-7 [Claude Code] * fix(distributed): route per inference request and cache probeHealth. Two related fixes that together restore load balancing across loaded replicas of the same model. 1. ModelLoader.Load and LoadModel bypass the local *Model cache when modelRouter is set. The cached *Model wraps an InFlightTrackingClient bound to a single (nodeID, replicaIndex) — reusing it pinned every subsequent request to whichever node won the very first pick, so FindAndLockNodeWithModel's round-robin never got a chance to run even after the reconciler scaled the model out to a second node. In distributed mode, SmartRouter.Route now runs per request, and PickBestReplica picks the least-loaded replica each time. SmartRouter has its own coalescing (advisory DB lock for first-time loads + singleflight on backend.install RPC) so concurrent first requests for a not-yet-loaded model still produce a single worker-side install. 2. SmartRouter.probeHealth memoizes successful gRPC HealthCheck results in a new probeCache (probe_cache.go) with a 30s TTL.
With per-request routing, every inference call hits probeHealth, and llama.cpp-style backends serialize HealthCheck behind active Predict — so a burst of incoming requests stalled on the probe to a node already mid-stream, tripping the 2s timeout and falling through to the install path. singleflight collapses N concurrent first-time probes for the same (node, addr) into one round-trip, failed probes invalidate the entry so the staleness-recovery path still triggers, and the TTL matches pkg/model/model.go's healthCheckTTL so the single-process and distributed paths share a staleness budget. The background HealthMonitor still reaps actually-dead backends within ~45s. The bypass introduces one short FindAndLockNodeWithModel transaction per inference. A TODO in pkg/model/loader.go documents the future per-modelID rotating-replica cache that would reuse PickBestReplica against an in-memory snapshot and skip the DB round-trip for hot paths. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-7 [Claude Code] --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
82 lines
3.5 KiB
Go
82 lines
3.5 KiB
Go
package nodes
|
|
|
|
import (
|
|
"time"
|
|
|
|
. "github.com/onsi/ginkgo/v2"
|
|
. "github.com/onsi/gomega"
|
|
)
|
|
|
|
var _ = Describe("PickBestReplica", func() {
|
|
// Use a single reference time so every test that wants identical
|
|
// last_used can share it without relying on time.Now() interleavings.
|
|
ref := time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC)
|
|
|
|
It("returns nil for an empty candidate list", func() {
|
|
Expect(PickBestReplica(nil)).To(BeNil())
|
|
Expect(PickBestReplica([]ReplicaCandidate{})).To(BeNil())
|
|
})
|
|
|
|
It("returns the only candidate when there is just one", func() {
|
|
only := ReplicaCandidate{NodeID: "only", InFlight: 99, LastUsed: ref, AvailableVRAM: 1}
|
|
pick := PickBestReplica([]ReplicaCandidate{only})
|
|
Expect(pick).ToNot(BeNil())
|
|
Expect(pick.NodeID).To(Equal("only"))
|
|
})
|
|
|
|
It("prefers the replica with the lowest in_flight", func() {
|
|
// Without the in-flight tier, the larger-VRAM node would win.
|
|
cs := []ReplicaCandidate{
|
|
{NodeID: "busy-big", InFlight: 3, LastUsed: ref, AvailableVRAM: 24_000_000_000},
|
|
{NodeID: "idle-small", InFlight: 0, LastUsed: ref, AvailableVRAM: 8_000_000_000},
|
|
{NodeID: "mid", InFlight: 1, LastUsed: ref, AvailableVRAM: 16_000_000_000},
|
|
}
|
|
Expect(PickBestReplica(cs).NodeID).To(Equal("idle-small"))
|
|
})
|
|
|
|
It("uses oldest last_used as the tiebreaker when in_flight ties", func() {
|
|
// All three tied on in_flight=0. Without last_used, available_vram
|
|
// would pin every pick to the fattest node — the exact bug
|
|
// fix(distributed): round-robin replicas of the same model addressed.
|
|
cs := []ReplicaCandidate{
|
|
{NodeID: "fat-recent", InFlight: 0, LastUsed: ref.Add(2 * time.Second), AvailableVRAM: 24_000_000_000},
|
|
{NodeID: "small-oldest", InFlight: 0, LastUsed: ref, AvailableVRAM: 8_000_000_000},
|
|
{NodeID: "mid-middle", InFlight: 0, LastUsed: ref.Add(1 * time.Second), AvailableVRAM: 16_000_000_000},
|
|
}
|
|
Expect(PickBestReplica(cs).NodeID).To(Equal("small-oldest"))
|
|
})
|
|
|
|
It("uses largest available_vram as the final tiebreaker", func() {
|
|
// in_flight tied AND last_used tied — pick the largest GPU.
|
|
cs := []ReplicaCandidate{
|
|
{NodeID: "small", InFlight: 0, LastUsed: ref, AvailableVRAM: 8_000_000_000},
|
|
{NodeID: "fat", InFlight: 0, LastUsed: ref, AvailableVRAM: 24_000_000_000},
|
|
{NodeID: "mid", InFlight: 0, LastUsed: ref, AvailableVRAM: 16_000_000_000},
|
|
}
|
|
Expect(PickBestReplica(cs).NodeID).To(Equal("fat"))
|
|
})
|
|
|
|
It("respects tier precedence: in_flight beats last_used beats available_vram", func() {
|
|
// "fat-busy-oldest" wins on neither of the first two tiers; the
|
|
// "small-idle-recent" replica is busy=0 and should beat it despite
|
|
// being newer and smaller.
|
|
cs := []ReplicaCandidate{
|
|
{NodeID: "fat-busy-oldest", InFlight: 5, LastUsed: ref, AvailableVRAM: 80_000_000_000},
|
|
{NodeID: "small-idle-recent", InFlight: 0, LastUsed: ref.Add(time.Hour), AvailableVRAM: 4_000_000_000},
|
|
}
|
|
Expect(PickBestReplica(cs).NodeID).To(Equal("small-idle-recent"))
|
|
})
|
|
|
|
It("is stable: returns the first candidate when every field ties", func() {
|
|
// betterReplica returns false on a full tie, so the leading element
|
|
// remains best. Callers shouldn't depend on this for correctness,
|
|
// but pinning the behavior here catches accidental reorderings.
|
|
cs := []ReplicaCandidate{
|
|
{NodeID: "first", InFlight: 0, LastUsed: ref, AvailableVRAM: 8_000_000_000},
|
|
{NodeID: "second", InFlight: 0, LastUsed: ref, AvailableVRAM: 8_000_000_000},
|
|
{NodeID: "third", InFlight: 0, LastUsed: ref, AvailableVRAM: 8_000_000_000},
|
|
}
|
|
Expect(PickBestReplica(cs).NodeID).To(Equal("first"))
|
|
})
|
|
})
|