fix(distributed): round-robin replicas of the same model (#9695)

FindAndLockNodeWithModel previously ordered candidate replicas by
in_flight ASC, available_vram DESC. The primary sort key is correct, but the
tiebreaker meant that whenever in_flight tied — the common case at low
to moderate concurrency where requests don't overlap — the node with
the largest available VRAM won every pick. With autoscaling placing
replicas of the same model on multiple nodes, the fattest GPU node
ended up taking nearly all the load while the others sat idle.
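
Concretely, with the three replicas the new test sets up (24/16/8 GB
free, all in_flight = 0), the old ordering degenerates to:

  pick 1: in_flight ties -> available_vram DESC -> 24 GB node
  pick 2: in_flight ties -> available_vram DESC -> 24 GB node
  pick 3: in_flight ties -> available_vram DESC -> 24 GB node

The 16 GB and 8 GB replicas can never win a tie, however many requests
arrive, as long as the in-flight counters return to equal values
between picks.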

Insert last_used ASC between the two existing tiers. last_used is
already refreshed inside the same transaction that increments in_flight
(and by TouchNodeModel on cache hits in the router), so the
"oldest-used" replica naturally rotates through the candidate set —
strict round-robin without a schema change. available_vram DESC is
demoted to a final tiebreaker for cold starts where last_used is
identical across replicas.
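
To see the rotation, run the same tied calls under the new ordering
(an illustrative trace; t1 < t2 < t3 < t4 are successive timestamps):

  pick 1: last_used identical (cold start) -> available_vram DESC -> 24 GB node (last_used := t1)
  pick 2: 16 GB and 8 GB still tie on last_used -> available_vram DESC -> 16 GB node (t2)
  pick 3: 8 GB node is now the oldest-used -> 8 GB node (t3)
  pick 4: 24 GB node (t1) is the oldest again -> 24 GB node (t4)

From pick 4 on the sequence repeats, so equally-loaded replicas are
visited in a fixed cycle with no extra bookkeeping.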

Placement queries (FindNodeWithVRAM, FindLeastLoadedNode, and the
*FromSet variants) have the same fattest-GPU bias in their tiebreakers
but are costlier to fix consistently. Deferred to a follow-up so the
routing fix can land first; routing was the dominant cause of the
user-observed symptom anyway.

Test: registry_test.go adds a focused spec that loads three replicas
on three nodes with 24/16/8 GB VRAM and asserts each is picked at
least twice across 9 in_flight-tied calls.


Assisted-by: claude-code:claude-opus-4-7 [Read] [Edit] [Bash] [Grep]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
Author:    LocalAI [bot]
Committer: GitHub
Date:      2026-05-06 19:40:54 +02:00
Parent:    4e154b59e5
Commit:    22ff86d64f

2 changed files with 49 additions and 3 deletions


@@ -663,8 +663,16 @@ func (r *NodeRegistry) FindAndLockNodeWithModel(ctx context.Context, modelName s
 	var node BackendNode
 	err := r.db.WithContext(ctx).Transaction(func(tx *gorm.DB) error {
-		// Order by in_flight ASC (least busy replica), then by available_vram DESC
-		// (prefer nodes with more free VRAM to spread load across the cluster).
+		// Order by in_flight ASC (least busy replica), then by last_used ASC
+		// (round-robin between equally-loaded replicas — oldest used wins, and
+		// every successful pick refreshes last_used below, so the "oldest" naturally
+		// rotates through the candidate set). available_vram DESC is the final
+		// tiebreaker for cold starts where last_used is identical.
+		//
+		// Without the last_used tier, a tie on in_flight (the common case at low
+		// to moderate concurrency where requests don't overlap) collapses to
+		// "biggest GPU wins every time" and one node ends up taking nearly all
+		// the load while replicas on other nodes sit idle.
 		q := tx.Clauses(clause.Locking{Strength: "UPDATE"}).
 			Joins("JOIN backend_nodes ON backend_nodes.id = node_models.node_id").
 			Where("node_models.model_name = ? AND node_models.state = ?", modelName, "loaded")
@@ -672,7 +680,7 @@ func (r *NodeRegistry) FindAndLockNodeWithModel(ctx context.Context, modelName s
 			q = q.Where("node_models.node_id IN ?", candidateNodeIDs)
 		}
 		if err := q.
-			Order("node_models.in_flight ASC, backend_nodes.available_vram DESC").
+			Order("node_models.in_flight ASC, node_models.last_used ASC, backend_nodes.available_vram DESC").
 			First(&nm).Error; err != nil {
 			return err
 		}
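
For reference, a rough sketch of the query this chain now produces
(simplified: GORM adds quoting, may append a primary-key ORDER BY via
First, and the selected columns are abbreviated here):

  SELECT node_models.*
  FROM node_models
  JOIN backend_nodes ON backend_nodes.id = node_models.node_id
  WHERE node_models.model_name = ? AND node_models.state = 'loaded'
    -- plus "AND node_models.node_id IN (?)" when candidateNodeIDs is set
  ORDER BY node_models.in_flight ASC,
           node_models.last_used ASC,
           backend_nodes.available_vram DESC
  LIMIT 1
  FOR UPDATE;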

registry_test.go

@@ -304,6 +304,44 @@ var _ = Describe("NodeRegistry", func() {
 		Expect(foundNM.NodeID).To(Equal(included.ID))
 	})
+	It("round-robins between replicas when in_flight ties (last_used tiebreaker)", func() {
+		// Three replicas of the same model on three nodes, all with in_flight=0.
+		// Without the last_used tiebreaker, the node with the largest available_vram
+		// would win every pick and one node would take ~all the load. With it,
+		// each successful pick refreshes last_used so the next pick rotates to
+		// the oldest-used replica.
+		fat := makeNode("rr-fat", "10.0.0.50:50051", 24_000_000_000)
+		mid := makeNode("rr-mid", "10.0.0.51:50051", 16_000_000_000)
+		small := makeNode("rr-small", "10.0.0.52:50051", 8_000_000_000)
+		Expect(registry.Register(context.Background(), fat, true)).To(Succeed())
+		Expect(registry.Register(context.Background(), mid, true)).To(Succeed())
+		Expect(registry.Register(context.Background(), small, true)).To(Succeed())
+		Expect(registry.SetNodeModel(context.Background(), fat.ID, "rr-model", 0, "loaded", "", 0)).To(Succeed())
+		Expect(registry.SetNodeModel(context.Background(), mid.ID, "rr-model", 0, "loaded", "", 0)).To(Succeed())
+		Expect(registry.SetNodeModel(context.Background(), small.ID, "rr-model", 0, "loaded", "", 0)).To(Succeed())
+		// Decrement back to 0 after each pick so the next call sees a tie.
+		// (FindAndLockNodeWithModel atomically increments to lock the row.)
+		picks := make([]string, 0, 9)
+		for i := 0; i < 9; i++ {
+			n, nm, err := registry.FindAndLockNodeWithModel(context.Background(), "rr-model", nil)
+			Expect(err).ToNot(HaveOccurred())
+			picks = append(picks, n.Name)
+			Expect(registry.DecrementInFlight(context.Background(), n.ID, "rr-model", nm.ReplicaIndex)).To(Succeed())
+		}
+		// Each replica should have been picked at least twice across 9 ties —
+		// proves we're rotating, not pinning to the largest-VRAM node.
+		counts := map[string]int{}
+		for _, p := range picks {
+			counts[p]++
+		}
+		Expect(counts["rr-fat"]).To(BeNumerically(">=", 2), "fat node was picked %d times across 9 ties: %v", counts["rr-fat"], picks)
+		Expect(counts["rr-mid"]).To(BeNumerically(">=", 2), "mid node was picked %d times across 9 ties: %v", counts["rr-mid"], picks)
+		Expect(counts["rr-small"]).To(BeNumerically(">=", 2), "small node was picked %d times across 9 ties: %v", counts["rr-small"], picks)
+	})
 	It("returns not-found when the model is loaded only on excluded nodes", func() {
 		loadedExcluded := makeNode("excl-only-node", "10.0.0.45:50051", 8_000_000_000)
 		emptyIncluded := makeNode("empty-included-node", "10.0.0.46:50051", 8_000_000_000)