From ea1df8945b0514c4342749a53fcf4965f85dd5ed Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Mon, 27 Apr 2026 21:24:50 +0000 Subject: [PATCH] fix(distributed): preserve UI-added node labels across worker re-register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The register endpoint called SetNodeLabels(req.Labels) — replace-all semantics — so every worker re-register wiped every label not in the worker's body. The bug existed since labels were introduced in PR #9186 (Mar 31), but only triggered for workers that supplied labels via --node-labels. PR #9583 (the multi-replica refactor) added an auto-mirrored `node.replica-slots` label to every worker's registration body, which made `len(req.Labels) > 0` always true — turning a latent edge-case bug into a universal one. Operators reported "labels assigned to node do not persist": labels survived until the next worker restart, then disappeared. Fix: iterate req.Labels and call SetNodeLabel (upsert) for each instead of SetNodeLabels (delete-then-recreate). Worker-managed labels still refresh on re-register; UI-added labels survive. Trade-off: an operator who removes a label from --node-labels won't have it auto-removed from the DB on next register — they can clean it via the UI. Acceptable, since the alternative (current behavior) silently destroys operator state. Regression test added first (TDD): RegisterNodeEndpoint registers a node, the test simulates a UI add via SetNodeLabel, then re-registers with a different worker label set; assertion that the UI-added label survives. Test fails against the broken code, passes against the fix. Signed-off-by: Ettore Di Giacinto Assisted-by: claude-code:opus-4-7 [Edit] [Bash] --- core/http/endpoints/localai/nodes.go | 14 +++++-- core/http/endpoints/localai/nodes_test.go | 48 +++++++++++++++++++++++ 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/core/http/endpoints/localai/nodes.go b/core/http/endpoints/localai/nodes.go index 947e440ce..17cccef9a 100644 --- a/core/http/endpoints/localai/nodes.go +++ b/core/http/endpoints/localai/nodes.go @@ -163,10 +163,16 @@ func RegisterNodeEndpoint(registry *nodes.NodeRegistry, expectedToken string, au return c.JSON(http.StatusInternalServerError, nodeError(http.StatusInternalServerError, "failed to register node")) } - // Store static labels from worker and apply auto-labels - if len(req.Labels) > 0 { - if err := registry.SetNodeLabels(ctx, node.ID, req.Labels); err != nil { - xlog.Warn("Failed to set node labels", "node", node.ID, "error", err) + // Merge worker-supplied labels into the node's existing label set, + // then apply auto-labels. Using SetNodeLabels here would have + // delete-then-recreate semantics — every worker re-register would + // wipe any UI-added label (since PR #9583 the worker always sends + // the auto-mirror `node.replica-slots`, which made the latent bug + // from PR #9186 trigger universally instead of only for workers + // with `--node-labels` set). + for k, v := range req.Labels { + if err := registry.SetNodeLabel(ctx, node.ID, k, v); err != nil { + xlog.Warn("Failed to set node label", "node", node.ID, "key", k, "error", err) } } registry.ApplyAutoLabels(ctx, node.ID, node) diff --git a/core/http/endpoints/localai/nodes_test.go b/core/http/endpoints/localai/nodes_test.go index 526056cda..f9eb8071b 100644 --- a/core/http/endpoints/localai/nodes_test.go +++ b/core/http/endpoints/localai/nodes_test.go @@ -180,6 +180,54 @@ var _ = Describe("Node HTTP handlers", func() { Expect(json.Unmarshal(rec.Body.Bytes(), &resp)).To(Succeed()) Expect(resp["status"]).To(Equal(nodes.StatusPending)) }) + + // Regression: a worker re-register used to wipe every UI-added label + // because the endpoint called SetNodeLabels (replace-all) with only + // what the worker sent. Operators reported "labels assigned to node + // do not persist" — the labels survived until the next worker + // restart, then disappeared. + It("preserves UI-added labels across worker re-register", func() { + ctx := context.Background() + e := echo.New() + + // 1. Worker first-registers with one label. + body1 := `{"name":"worker-merge","address":"10.0.0.50:50051","labels":{"tier":"a","gpu":"a100"}}` + req1 := httptest.NewRequest(http.MethodPost, "/", strings.NewReader(body1)) + req1.Header.Set(echo.HeaderContentType, echo.MIMEApplicationJSON) + rec1 := httptest.NewRecorder() + handler := RegisterNodeEndpoint(registry, "", true, nil, "") + Expect(handler(e.NewContext(req1, rec1))).To(Succeed()) + Expect(rec1.Code).To(Equal(http.StatusCreated)) + + node, err := registry.GetByName(ctx, "worker-merge") + Expect(err).ToNot(HaveOccurred()) + Expect(node).ToNot(BeNil()) + + // 2. Operator adds a label via the UI. + Expect(registry.SetNodeLabel(ctx, node.ID, "owner", "ettore")).To(Succeed()) + + // 3. Worker restarts and re-registers, sending its own labels + // (different from the UI-added one). + body2 := `{"name":"worker-merge","address":"10.0.0.50:50051","labels":{"tier":"b","gpu":"a100"}}` + req2 := httptest.NewRequest(http.MethodPost, "/", strings.NewReader(body2)) + req2.Header.Set(echo.HeaderContentType, echo.MIMEApplicationJSON) + rec2 := httptest.NewRecorder() + Expect(handler(e.NewContext(req2, rec2))).To(Succeed()) + Expect(rec2.Code).To(Equal(http.StatusCreated)) + + // 4. Assert the UI-added label survived AND the worker labels updated. + labels, err := registry.GetNodeLabels(ctx, node.ID) + Expect(err).ToNot(HaveOccurred()) + byKey := map[string]string{} + for _, l := range labels { + byKey[l.Key] = l.Value + } + Expect(byKey).To(HaveKeyWithValue("owner", "ettore"), + "UI-added label must survive a worker re-register") + Expect(byKey).To(HaveKeyWithValue("tier", "b"), + "worker label updates must apply on re-register") + Expect(byKey).To(HaveKeyWithValue("gpu", "a100")) + }) }) Describe("ListNodesEndpoint", func() {