mirror of
https://github.com/mudler/LocalAI.git
synced 2026-04-29 03:24:49 -04:00
fix(distributed): preserve UI-added node labels across worker re-register
The register endpoint called SetNodeLabels(req.Labels) — replace-all semantics — so every worker re-register wiped every label not in the worker's body. The bug existed since labels were introduced in PR #9186 (Mar 31), but only triggered for workers that supplied labels via --node-labels. PR #9583 (the multi-replica refactor) added an auto-mirrored `node.replica-slots` label to every worker's registration body, which made `len(req.Labels) > 0` always true — turning a latent edge-case bug into a universal one. Operators reported "labels assigned to node do not persist": labels survived until the next worker restart, then disappeared. Fix: iterate req.Labels and call SetNodeLabel (upsert) for each instead of SetNodeLabels (delete-then-recreate). Worker-managed labels still refresh on re-register; UI-added labels survive. Trade-off: an operator who removes a label from --node-labels won't have it auto-removed from the DB on next register — they can clean it via the UI. Acceptable, since the alternative (current behavior) silently destroys operator state. Regression test added first (TDD): RegisterNodeEndpoint registers a node, the test simulates a UI add via SetNodeLabel, then re-registers with a different worker label set; assertion that the UI-added label survives. Test fails against the broken code, passes against the fix. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: claude-code:opus-4-7 [Edit] [Bash]
This commit is contained in:
@@ -163,10 +163,16 @@ func RegisterNodeEndpoint(registry *nodes.NodeRegistry, expectedToken string, au
|
||||
return c.JSON(http.StatusInternalServerError, nodeError(http.StatusInternalServerError, "failed to register node"))
|
||||
}
|
||||
|
||||
// Store static labels from worker and apply auto-labels
|
||||
if len(req.Labels) > 0 {
|
||||
if err := registry.SetNodeLabels(ctx, node.ID, req.Labels); err != nil {
|
||||
xlog.Warn("Failed to set node labels", "node", node.ID, "error", err)
|
||||
// Merge worker-supplied labels into the node's existing label set,
|
||||
// then apply auto-labels. Using SetNodeLabels here would have
|
||||
// delete-then-recreate semantics — every worker re-register would
|
||||
// wipe any UI-added label (since PR #9583 the worker always sends
|
||||
// the auto-mirror `node.replica-slots`, which made the latent bug
|
||||
// from PR #9186 trigger universally instead of only for workers
|
||||
// with `--node-labels` set).
|
||||
for k, v := range req.Labels {
|
||||
if err := registry.SetNodeLabel(ctx, node.ID, k, v); err != nil {
|
||||
xlog.Warn("Failed to set node label", "node", node.ID, "key", k, "error", err)
|
||||
}
|
||||
}
|
||||
registry.ApplyAutoLabels(ctx, node.ID, node)
|
||||
|
||||
@@ -180,6 +180,54 @@ var _ = Describe("Node HTTP handlers", func() {
|
||||
Expect(json.Unmarshal(rec.Body.Bytes(), &resp)).To(Succeed())
|
||||
Expect(resp["status"]).To(Equal(nodes.StatusPending))
|
||||
})
|
||||
|
||||
// Regression: a worker re-register used to wipe every UI-added label
|
||||
// because the endpoint called SetNodeLabels (replace-all) with only
|
||||
// what the worker sent. Operators reported "labels assigned to node
|
||||
// do not persist" — the labels survived until the next worker
|
||||
// restart, then disappeared.
|
||||
It("preserves UI-added labels across worker re-register", func() {
|
||||
ctx := context.Background()
|
||||
e := echo.New()
|
||||
|
||||
// 1. Worker first-registers with one label.
|
||||
body1 := `{"name":"worker-merge","address":"10.0.0.50:50051","labels":{"tier":"a","gpu":"a100"}}`
|
||||
req1 := httptest.NewRequest(http.MethodPost, "/", strings.NewReader(body1))
|
||||
req1.Header.Set(echo.HeaderContentType, echo.MIMEApplicationJSON)
|
||||
rec1 := httptest.NewRecorder()
|
||||
handler := RegisterNodeEndpoint(registry, "", true, nil, "")
|
||||
Expect(handler(e.NewContext(req1, rec1))).To(Succeed())
|
||||
Expect(rec1.Code).To(Equal(http.StatusCreated))
|
||||
|
||||
node, err := registry.GetByName(ctx, "worker-merge")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(node).ToNot(BeNil())
|
||||
|
||||
// 2. Operator adds a label via the UI.
|
||||
Expect(registry.SetNodeLabel(ctx, node.ID, "owner", "ettore")).To(Succeed())
|
||||
|
||||
// 3. Worker restarts and re-registers, sending its own labels
|
||||
// (different from the UI-added one).
|
||||
body2 := `{"name":"worker-merge","address":"10.0.0.50:50051","labels":{"tier":"b","gpu":"a100"}}`
|
||||
req2 := httptest.NewRequest(http.MethodPost, "/", strings.NewReader(body2))
|
||||
req2.Header.Set(echo.HeaderContentType, echo.MIMEApplicationJSON)
|
||||
rec2 := httptest.NewRecorder()
|
||||
Expect(handler(e.NewContext(req2, rec2))).To(Succeed())
|
||||
Expect(rec2.Code).To(Equal(http.StatusCreated))
|
||||
|
||||
// 4. Assert the UI-added label survived AND the worker labels updated.
|
||||
labels, err := registry.GetNodeLabels(ctx, node.ID)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
byKey := map[string]string{}
|
||||
for _, l := range labels {
|
||||
byKey[l.Key] = l.Value
|
||||
}
|
||||
Expect(byKey).To(HaveKeyWithValue("owner", "ettore"),
|
||||
"UI-added label must survive a worker re-register")
|
||||
Expect(byKey).To(HaveKeyWithValue("tier", "b"),
|
||||
"worker label updates must apply on re-register")
|
||||
Expect(byKey).To(HaveKeyWithValue("gpu", "a100"))
|
||||
})
|
||||
})
|
||||
|
||||
Describe("ListNodesEndpoint", func() {
|
||||
|
||||
Reference in New Issue
Block a user