fix(distributed): round-robin replicas of the same model (#9695)

FindAndLockNodeWithModel previously ordered candidate replicas by
in_flight ASC, available_vram DESC. The primary sort key is correct, but the
tiebreaker meant that whenever in_flight tied — the common case at low
to moderate concurrency where requests don't overlap — the node with
the largest available VRAM won every pick. With autoscaling placing
replicas of the same model on multiple nodes, the fattest GPU node
ended up taking nearly all the load while the others sat idle.
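
Concretely, with the three replicas the new test sets up (24/16/8 GB
free, all in_flight = 0), the old ordering degenerates to:

  pick 1: in_flight ties -> available_vram DESC -> 24 GB node
  pick 2: in_flight ties -> available_vram DESC -> 24 GB node
  pick 3: in_flight ties -> available_vram DESC -> 24 GB node

The 16 GB and 8 GB replicas can never win a tie, however many requests
arrive, as long as the in-flight counters return to equal values
between picks.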

Insert last_used ASC between the two existing tiers. last_used is
already refreshed inside the same transaction that increments in_flight
(and by TouchNodeModel on cache hits in the router), so the
"oldest-used" replica naturally rotates through the candidate set —
strict round-robin without a schema change. available_vram DESC is
demoted to a final tiebreaker for cold starts where last_used is
identical across replicas.
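
To see the rotation, run the same tied calls under the new ordering
(an illustrative trace; t1 < t2 < t3 < t4 are successive timestamps):

  pick 1: last_used identical (cold start) -> available_vram DESC -> 24 GB node (last_used := t1)
  pick 2: 16 GB and 8 GB still tie on last_used -> available_vram DESC -> 16 GB node (t2)
  pick 3: 8 GB node is now the oldest-used -> 8 GB node (t3)
  pick 4: 24 GB node (t1) is the oldest again -> 24 GB node (t4)

From pick 4 on the sequence repeats, so equally-loaded replicas are
visited in a fixed cycle with no extra bookkeeping.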

Placement queries (FindNodeWithVRAM, FindLeastLoadedNode, and the
*FromSet variants) have the same fattest-GPU bias in their tiebreakers
but are costlier to fix consistently. Deferred to a follow-up so the
routing fix can land first; routing was the dominant cause of the
user-observed symptom anyway.

Test: registry_test.go adds a focused spec that loads three replicas
on three nodes with 24/16/8 GB VRAM and asserts each is picked at
least twice across 9 in_flight-tied calls.


Assisted-by: claude-code:claude-opus-4-7 [Read] [Edit] [Bash] [Grep]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
Author:    LocalAI [bot]
Committer: GitHub
Date:      2026-05-06 19:40:54 +02:00
Parent:    4e154b59e5
Commit:    22ff86d64f

2 changed files with 49 additions and 3 deletions


@@ -663,8 +663,16 @@ func (r *NodeRegistry) FindAndLockNodeWithModel(ctx context.Context, modelName s
 	var node BackendNode
 	err := r.db.WithContext(ctx).Transaction(func(tx *gorm.DB) error {
-		// Order by in_flight ASC (least busy replica), then by available_vram DESC
-		// (prefer nodes with more free VRAM to spread load across the cluster).
+		// Order by in_flight ASC (least busy replica), then by last_used ASC
+		// (round-robin between equally-loaded replicas — oldest used wins, and
+		// every successful pick refreshes last_used below, so the "oldest" naturally
+		// rotates through the candidate set). available_vram DESC is the final
+		// tiebreaker for cold starts where last_used is identical.
+		//
+		// Without the last_used tier, a tie on in_flight (the common case at low
+		// to moderate concurrency where requests don't overlap) collapses to
+		// "biggest GPU wins every time" and one node ends up taking nearly all
+		// the load while replicas on other nodes sit idle.
 		q := tx.Clauses(clause.Locking{Strength: "UPDATE"}).
 			Joins("JOIN backend_nodes ON backend_nodes.id = node_models.node_id").
 			Where("node_models.model_name = ? AND node_models.state = ?", modelName, "loaded")
@@ -672,7 +680,7 @@ func (r *NodeRegistry) FindAndLockNodeWithModel(ctx context.Context, modelName s
 			q = q.Where("node_models.node_id IN ?", candidateNodeIDs)
 		}
 		if err := q.
-			Order("node_models.in_flight ASC, backend_nodes.available_vram DESC").
+			Order("node_models.in_flight ASC, node_models.last_used ASC, backend_nodes.available_vram DESC").
 			First(&nm).Error; err != nil {
 			return err
 		}
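
For reference, a rough sketch of the query this chain now produces
(simplified: GORM adds quoting, may append a primary-key ORDER BY via
First, and the selected columns are abbreviated here):

  SELECT node_models.*
  FROM node_models
  JOIN backend_nodes ON backend_nodes.id = node_models.node_id
  WHERE node_models.model_name = ? AND node_models.state = 'loaded'
    -- plus "AND node_models.node_id IN (?)" when candidateNodeIDs is set
  ORDER BY node_models.in_flight ASC,
           node_models.last_used ASC,
           backend_nodes.available_vram DESC
  LIMIT 1
  FOR UPDATE;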

registry_test.go

@@ -304,6 +304,44 @@ var _ = Describe("NodeRegistry", func() {
 		Expect(foundNM.NodeID).To(Equal(included.ID))
 	})
+	It("round-robins between replicas when in_flight ties (last_used tiebreaker)", func() {
+		// Three replicas of the same model on three nodes, all with in_flight=0.
+		// Without the last_used tiebreaker, the node with the largest available_vram
+		// would win every pick and one node would take ~all the load. With it,
+		// each successful pick refreshes last_used so the next pick rotates to
+		// the oldest-used replica.
+		fat := makeNode("rr-fat", "10.0.0.50:50051", 24_000_000_000)
+		mid := makeNode("rr-mid", "10.0.0.51:50051", 16_000_000_000)
+		small := makeNode("rr-small", "10.0.0.52:50051", 8_000_000_000)
+		Expect(registry.Register(context.Background(), fat, true)).To(Succeed())
+		Expect(registry.Register(context.Background(), mid, true)).To(Succeed())
+		Expect(registry.Register(context.Background(), small, true)).To(Succeed())
+		Expect(registry.SetNodeModel(context.Background(), fat.ID, "rr-model", 0, "loaded", "", 0)).To(Succeed())
+		Expect(registry.SetNodeModel(context.Background(), mid.ID, "rr-model", 0, "loaded", "", 0)).To(Succeed())
+		Expect(registry.SetNodeModel(context.Background(), small.ID, "rr-model", 0, "loaded", "", 0)).To(Succeed())
+		// Decrement back to 0 after each pick so the next call sees a tie.
+		// (FindAndLockNodeWithModel atomically increments to lock the row.)
+		picks := make([]string, 0, 9)
+		for i := 0; i < 9; i++ {
+			n, nm, err := registry.FindAndLockNodeWithModel(context.Background(), "rr-model", nil)
+			Expect(err).ToNot(HaveOccurred())
+			picks = append(picks, n.Name)
+			Expect(registry.DecrementInFlight(context.Background(), n.ID, "rr-model", nm.ReplicaIndex)).To(Succeed())
+		}
+		// Each replica should have been picked at least twice across 9 ties —
+		// proves we're rotating, not pinning to the largest-VRAM node.
+		counts := map[string]int{}
+		for _, p := range picks {
+			counts[p]++
+		}
+		Expect(counts["rr-fat"]).To(BeNumerically(">=", 2), "fat node was picked %d times across 9 ties: %v", counts["rr-fat"], picks)
+		Expect(counts["rr-mid"]).To(BeNumerically(">=", 2), "mid node was picked %d times across 9 ties: %v", counts["rr-mid"], picks)
+		Expect(counts["rr-small"]).To(BeNumerically(">=", 2), "small node was picked %d times across 9 ties: %v", counts["rr-small"], picks)
+	})
 	It("returns not-found when the model is loaded only on excluded nodes", func() {
 		loadedExcluded := makeNode("excl-only-node", "10.0.0.45:50051", 8_000_000_000)
 		emptyIncluded := makeNode("empty-included-node", "10.0.0.46:50051", 8_000_000_000)