fix(distributed): preserve UI-added node labels across worker re-register

The register endpoint called SetNodeLabels(req.Labels) — replace-all
semantics — so every worker re-register wiped every label not in the
worker's body. The bug existed since labels were introduced in
PR #9186 (Mar 31), but only triggered for workers that supplied
labels via --node-labels.

PR #9583 (the multi-replica refactor) added an auto-mirrored
`node.replica-slots` label to every worker's registration body, which
made `len(req.Labels) > 0` always true — turning a latent edge-case
bug into a universal one. Operators reported "labels assigned to
node do not persist": labels survived until the next worker restart,
then disappeared.

Fix: iterate req.Labels and call SetNodeLabel (upsert) for each
instead of SetNodeLabels (delete-then-recreate). Worker-managed
labels still refresh on re-register; UI-added labels survive.

Trade-off: an operator who removes a label from --node-labels won't
have it auto-removed from the DB on next register — they can clean it
via the UI. Acceptable, since the alternative (current behavior)
silently destroys operator state.

Regression test added first (TDD): RegisterNodeEndpoint registers a
node, the test simulates a UI add via SetNodeLabel, then re-registers
with a different worker label set; assertion that the UI-added label
survives. Test fails against the broken code, passes against the fix.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: claude-code:opus-4-7 [Edit] [Bash]
This commit is contained in:
Ettore Di Giacinto
2026-04-27 21:24:50 +00:00
parent 3280b9a287
commit ea1df8945b
2 changed files with 58 additions and 4 deletions

View File

@@ -163,10 +163,16 @@ func RegisterNodeEndpoint(registry *nodes.NodeRegistry, expectedToken string, au
return c.JSON(http.StatusInternalServerError, nodeError(http.StatusInternalServerError, "failed to register node"))
}
// Store static labels from worker and apply auto-labels
if len(req.Labels) > 0 {
if err := registry.SetNodeLabels(ctx, node.ID, req.Labels); err != nil {
xlog.Warn("Failed to set node labels", "node", node.ID, "error", err)
// Merge worker-supplied labels into the node's existing label set,
// then apply auto-labels. Using SetNodeLabels here would have
// delete-then-recreate semantics — every worker re-register would
// wipe any UI-added label (since PR #9583 the worker always sends
// the auto-mirror `node.replica-slots`, which made the latent bug
// from PR #9186 trigger universally instead of only for workers
// with `--node-labels` set).
for k, v := range req.Labels {
if err := registry.SetNodeLabel(ctx, node.ID, k, v); err != nil {
xlog.Warn("Failed to set node label", "node", node.ID, "key", k, "error", err)
}
}
registry.ApplyAutoLabels(ctx, node.ID, node)

View File

@@ -180,6 +180,54 @@ var _ = Describe("Node HTTP handlers", func() {
Expect(json.Unmarshal(rec.Body.Bytes(), &resp)).To(Succeed())
Expect(resp["status"]).To(Equal(nodes.StatusPending))
})
// Regression: a worker re-register used to wipe every UI-added label
// because the endpoint called SetNodeLabels (replace-all) with only
// what the worker sent. Operators reported "labels assigned to node
// do not persist" — the labels survived until the next worker
// restart, then disappeared.
It("preserves UI-added labels across worker re-register", func() {
ctx := context.Background()
e := echo.New()
// 1. Worker first-registers with one label.
body1 := `{"name":"worker-merge","address":"10.0.0.50:50051","labels":{"tier":"a","gpu":"a100"}}`
req1 := httptest.NewRequest(http.MethodPost, "/", strings.NewReader(body1))
req1.Header.Set(echo.HeaderContentType, echo.MIMEApplicationJSON)
rec1 := httptest.NewRecorder()
handler := RegisterNodeEndpoint(registry, "", true, nil, "")
Expect(handler(e.NewContext(req1, rec1))).To(Succeed())
Expect(rec1.Code).To(Equal(http.StatusCreated))
node, err := registry.GetByName(ctx, "worker-merge")
Expect(err).ToNot(HaveOccurred())
Expect(node).ToNot(BeNil())
// 2. Operator adds a label via the UI.
Expect(registry.SetNodeLabel(ctx, node.ID, "owner", "ettore")).To(Succeed())
// 3. Worker restarts and re-registers, sending its own labels
// (different from the UI-added one).
body2 := `{"name":"worker-merge","address":"10.0.0.50:50051","labels":{"tier":"b","gpu":"a100"}}`
req2 := httptest.NewRequest(http.MethodPost, "/", strings.NewReader(body2))
req2.Header.Set(echo.HeaderContentType, echo.MIMEApplicationJSON)
rec2 := httptest.NewRecorder()
Expect(handler(e.NewContext(req2, rec2))).To(Succeed())
Expect(rec2.Code).To(Equal(http.StatusCreated))
// 4. Assert the UI-added label survived AND the worker labels updated.
labels, err := registry.GetNodeLabels(ctx, node.ID)
Expect(err).ToNot(HaveOccurred())
byKey := map[string]string{}
for _, l := range labels {
byKey[l.Key] = l.Value
}
Expect(byKey).To(HaveKeyWithValue("owner", "ettore"),
"UI-added label must survive a worker re-register")
Expect(byKey).To(HaveKeyWithValue("tier", "b"),
"worker label updates must apply on re-register")
Expect(byKey).To(HaveKeyWithValue("gpu", "a100"))
})
})
Describe("ListNodesEndpoint", func() {