From 3280b9a2870aa5aebc0aa46fe6e566446c646878 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Mon, 27 Apr 2026 20:55:24 +0000
Subject: [PATCH] fix(distributed): per-replica backend logs (store aggregation + UI)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The multi-replica refactor (PR #9583) changed the worker's process key
from `modelID` to `modelID#replicaIndex`, but the BackendLogStore kept
the bare-modelID lookup. Result: every distributed deployment lost
backend logs in the Nodes UI — single-replica too, since even the
default capacity of 1 produces a `#0` suffix.

Two changes wired together:

* pkg/model: BackendLogStore.GetLines/Subscribe now treat a modelID
  without `#` as a model prefix and merge across all `modelID#N`
  replica buffers (timestamp-sorted for GetLines; fan-in for
  Subscribe). Calls with a full `modelID#N` key resolve exactly.
  ListModels strips replica suffixes and deduplicates so the listing
  surfaces one entry per loaded model.

* react-ui: per-replica log streams as the default. Loaded Models
  table disambiguates each row with a `rep N` pill (only when the node
  hosts >1 replica of a model). Each row's "View logs" link routes to
  the per-replica process key so operators see only that replica's
  output. The logs page renders the replica context as a chip in the
  title and surfaces a segmented control — `Replica 0 / 1 / … / All
  merged` — when the model has multiple replicas; the merged segment
  uses the bare-modelID URL (delegating to the store's prefix
  aggregation) for the side-by-side comparison case. Single-replica
  deployments see no extra UI.

Tests added first (TDD): the regression set in
backend_log_store_test.go reproduces the bug at the exact failure
point — GetLines/ListModels/Subscribe assertions all fail against the
broken code, all pass against the fix. TestSubscribe_PerReplicaFilter
pins the exact-key path so a future change can't silently break it.

Signed-off-by: Ettore Di Giacinto Assisted-by: claude-code:opus-4-7 [Edit] [Skill:critique] [Skill:audit] [Skill:polish] [Skill:distill] --- .../react-ui/src/pages/NodeBackendLogs.jsx | 102 +++++++++- core/http/react-ui/src/pages/Nodes.jsx | 45 ++++- pkg/model/backend_log_store.go | 189 +++++++++++++++--- pkg/model/backend_log_store_test.go | 140 +++++++++++++ 4 files changed, 443 insertions(+), 33 deletions(-) create mode 100644 pkg/model/backend_log_store_test.go diff --git a/core/http/react-ui/src/pages/NodeBackendLogs.jsx b/core/http/react-ui/src/pages/NodeBackendLogs.jsx index 4110713df..58e798233 100644 --- a/core/http/react-ui/src/pages/NodeBackendLogs.jsx +++ b/core/http/react-ui/src/pages/NodeBackendLogs.jsx @@ -1,5 +1,5 @@ import { useState, useEffect, useCallback, useRef, useMemo } from 'react' -import { useParams, useOutletContext, Link } from 'react-router-dom' +import { useParams, useOutletContext, Link, useNavigate } from 'react-router-dom' import { nodesApi } from '../utils/api' import { formatTimestamp } from '../utils/format' import { apiUrl } from '../utils/basePath' @@ -19,6 +19,16 @@ export default function NodeBackendLogs() { const { nodeId, modelId: rawModelId } = useParams() const modelId = decodeURIComponent(rawModelId || '') const { addToast } = useOutletContext() + const navigate = useNavigate() + + // The route param can be a bare model name ("qwen3-0.6b") OR a per-replica + // process key ("qwen3-0.6b#0"). The worker's BackendLogStore treats them + // differently — bare = aggregate across replicas, suffixed = exact replica. + // Surface that distinction so operators know what they're looking at. + const replicaSepIdx = modelId.indexOf('#') + const baseModelName = replicaSepIdx >= 0 ? modelId.slice(0, replicaSepIdx) : modelId + const replicaIndex = replicaSepIdx >= 0 ? 
parseInt(modelId.slice(replicaSepIdx + 1), 10) : null + const isMerged = replicaIndex === null const [lines, setLines] = useState([]) const [loading, setLoading] = useState(true) @@ -27,6 +37,10 @@ export default function NodeBackendLogs() { const [showDetails, setShowDetails] = useState(true) const [wsConnected, setWsConnected] = useState(false) const [nodeName, setNodeName] = useState('') + // Replicas of this base model on this node — drives whether the + // merged-vs-replica toggle is rendered. Single-replica deployments + // never see the toggle (no decision to make). + const [replicas, setReplicas] = useState([]) const logContainerRef = useRef(null) const wsRef = useRef(null) const reconnectTimerRef = useRef(null) @@ -43,6 +57,22 @@ export default function NodeBackendLogs() { } }, [nodeId]) + // Fetch the replica list for this base model on this node so we know + // whether to render the merged-vs-replica toggle. Cheap query; runs once + // per (nodeId, baseModelName) change. + useEffect(() => { + if (!nodeId || !baseModelName) return + nodesApi.getModels(nodeId) + .then(arr => { + const reps = (Array.isArray(arr) ? arr : []) + .filter(m => m.model_name === baseModelName) + .map(m => m.replica_index ?? 0) + .sort((a, b) => a - b) + setReplicas(reps) + }) + .catch(() => setReplicas([])) + }, [nodeId, baseModelName]) + // Auto-scroll to bottom when new lines arrive useEffect(() => { if (autoScroll && logContainerRef.current) { @@ -139,13 +169,54 @@ export default function NodeBackendLogs() { ) } + // Show the merged/per-replica toggle only when this model has > 1 replica + // on this node. Single-replica deployments don't see a control they can't + // meaningfully use. + const showReplicaToggle = replicas.length > 1 + return (

- {modelId} + {baseModelName} + {!isMerged && ( + + replica {replicaIndex} + + )} + {isMerged && replicas.length > 1 && ( + + merged · {replicas.length} replicas + + )}

Backend logs from node {nodeName || nodeId} @@ -154,6 +225,33 @@ export default function NodeBackendLogs() {

+ {showReplicaToggle && ( +
+ {replicas.map(idx => ( + + ))} +