From 2da1a4d2306b2b75a16666352aee89f19dd90dab Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 26 Apr 2026 22:05:18 +0000 Subject: [PATCH] feat(distributed): per-node backend installation from the gallery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In distributed mode the Backends gallery used to fan every install out to every worker — fine for auto-resolving (meta) backends like llama-cpp where each node picks its own variant, but wrong for hardware-specific builds like cpu-llama-cpp that would silently land on every GPU node. Adds a node-targeted install path through the existing POST /api/nodes/:id/backends/install plumbing, with two entry points: - Backends gallery row gets a split-button in distributed mode. Auto-resolving keeps "Install on all nodes" as the primary; chevron menu opens the picker. Hardware-specific routes the primary directly to the picker — no fan-out path on the row. - Nodes-page drawer gets a "+ Add backend" button that navigates to /app/backends?target={node_id}; the gallery scopes itself to that node (banner, single per-row install button, Reinstall/Remove for already-installed). One gallery, two scopes — no second UI to maintain. The picker (new NodeInstallPicker) shows a 3-state suitability column (Compatible / Override / Installed), an auto-expanding variant override disclosure that fires when selected nodes have no working GPU, parallel per-node installs with inline status and Retry-failed-nodes, and a mismatch confirm that names the consequence on the button itself. A 409 fan-out guard on /api/backends/apply protects CLI/Terraform/script users from the same footgun: hardware-specific installs in distributed mode now return code "concrete_backend_requires_target" with a human-readable error and a meta_alternative pointer.
The gallery list payload now surfaces capabilities, metaBackendFor and per-row nodes (NodeBackendRef) so the picker and the new Nodes column have everything they need without re-walking the gallery client-side. GODEBUG=netdns=go is set on the compose services because the cgo DNS resolver follows the container's nsswitch.conf to host systemd-resolved (127.0.0.53), unreachable from inside the container; the pure-Go resolver reads /etc/resolv.conf directly and uses Docker's embedded DNS. Signed-off-by: Ettore Di Giacinto Assisted-by: Claude Code:claude-opus-4-7[1m] [Edit] [Bash] [Read] [Write] --- core/http/endpoints/localai/backend.go | 74 +- core/http/endpoints/localai/nodes.go | 20 +- .../src/components/NodeInstallPicker.jsx | 668 ++++++++++++++++++ .../react-ui/src/hooks/useDistributedMode.js | 40 ++ core/http/react-ui/src/pages/Backends.jsx | 282 +++++++- core/http/react-ui/src/pages/Nodes.jsx | 28 +- core/http/react-ui/src/utils/api.js | 12 +- core/http/routes/localai.go | 2 +- core/http/routes/ui_api.go | 65 +- core/services/galleryop/managers.go | 5 + core/services/galleryop/managers_local.go | 2 + core/services/nodes/managers_distributed.go | 5 + .../nodes/managers_distributed_test.go | 1 + docker-compose.distributed.yaml | 9 + 14 files changed, 1172 insertions(+), 41 deletions(-) create mode 100644 core/http/react-ui/src/components/NodeInstallPicker.jsx create mode 100644 core/http/react-ui/src/hooks/useDistributedMode.js diff --git a/core/http/endpoints/localai/backend.go b/core/http/endpoints/localai/backend.go index bf6e4b8c2..3a2d660f7 100644 --- a/core/http/endpoints/localai/backend.go +++ b/core/http/endpoints/localai/backend.go @@ -98,7 +98,7 @@ func (mgs *BackendEndpointService) GetAllStatusEndpoint() echo.HandlerFunc { // @Param request body GalleryBackend true "query params" // @Success 200 {object} schema.BackendResponse "Response" // @Router /backends/apply [post] -func (mgs *BackendEndpointService) ApplyBackendEndpoint() echo.HandlerFunc { +func 
(mgs *BackendEndpointService) ApplyBackendEndpoint(systemState *system.SystemState) echo.HandlerFunc { return func(c echo.Context) error { input := new(GalleryBackend) // Get input data from the request body @@ -106,6 +106,18 @@ func (mgs *BackendEndpointService) ApplyBackendEndpoint() echo.HandlerFunc { return err } + // In distributed mode, refuse to fan out a hardware-specific build to + // every node — a CPU build landing on a GPU cluster is almost always + // wrong, and the silent footgun is exactly what this guard exists for. + // Auto-resolving (meta) backends are fine because each node picks its + // own variant. Tooling can recover by hitting + // POST /api/nodes/{id}/backends/install per target node. + if mgs.backendApplier.BackendManager().IsDistributed() && input.ID != "" { + if guard := concreteFanOutGuard(c, mgs.galleries, systemState, input.ID); guard != nil { + return guard + } + } + uuid, err := uuid.NewUUID() if err != nil { return err @@ -120,6 +132,66 @@ func (mgs *BackendEndpointService) ApplyBackendEndpoint() echo.HandlerFunc { } } +// concreteFanOutGuard returns a 409 response if the requested backend is a +// hardware-specific build (not auto-resolving / meta) and we are in +// distributed mode. It looks up the backend in the configured galleries; if +// the lookup itself fails (gallery unreachable, name not found), the guard +// stays out of the way and lets the install enqueue normally — a missing +// name will surface from the worker as a clearer error than the guard could +// produce here. The response body deliberately speaks human, with `code` and +// `meta_alternative` as the programmatic contract for tooling. 
+func concreteFanOutGuard(c echo.Context, galleries []config.Gallery, systemState *system.SystemState, backendID string) error { + // Use the unfiltered listing because in distributed mode the frontend's + // hardware is irrelevant — the install targets workers, not us — and the + // filtered list would hide variants that don't match the frontend host + // (e.g. a CUDA build on a CPU-only frontend), preventing the guard from + // firing for exactly the cases it's meant to protect against. + available, err := gallery.AvailableBackendsUnfiltered(galleries, systemState) + if err != nil { + return nil + } + requested := available.FindByName(backendID) + if requested == nil || requested.IsMeta() { + return nil + } + + // Try to find an auto-resolving (meta) backend that has this concrete + // variant in its CapabilitiesMap, so we can suggest it as a one-shot + // alternative. Optional — empty string is fine if no parent exists. + metaAlternative := "" + for _, b := range available { + if !b.IsMeta() { + continue + } + for _, concrete := range b.CapabilitiesMap { + if concrete == backendID { + metaAlternative = b.Name + break + } + } + if metaAlternative != "" { + break + } + } + + msg := fmt.Sprintf( + "Backend %q is a hardware-specific build and won't run correctly on every node in this cluster. In distributed mode, install it on specific nodes:\n\n POST /api/nodes/{node_id}/backends/install\n {\"backend\": %q}", + backendID, backendID, + ) + if metaAlternative != "" { + msg += fmt.Sprintf( + "\n\nTo install across all nodes, use the auto-resolving backend %q — each node picks its own variant based on its hardware.", + metaAlternative, + ) + } + + return c.JSON(409, map[string]any{ + "error": msg, + "code": "concrete_backend_requires_target", + "meta_alternative": metaAlternative, + }) +} + // DeleteBackendEndpoint lets delete backends from a LocalAI instance // @Summary delete backends from LocalAI. 
// @Tags backends diff --git a/core/http/endpoints/localai/nodes.go b/core/http/endpoints/localai/nodes.go index 8eb18bfbe..2225980e4 100644 --- a/core/http/endpoints/localai/nodes.go +++ b/core/http/endpoints/localai/nodes.go @@ -363,6 +363,9 @@ func ResumeNodeEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc { } // InstallBackendOnNodeEndpoint triggers backend installation on a worker node via NATS. +// Backend can be either a gallery ID (resolved against BackendGalleries) or a +// direct URI install (URI + Name + optional Alias) — same shape as the +// standalone /api/backends/install-external path, just scoped to one node. func InstallBackendOnNodeEndpoint(unloader nodes.NodeCommandSender) echo.HandlerFunc { return func(c echo.Context) error { if unloader == nil { @@ -372,17 +375,24 @@ func InstallBackendOnNodeEndpoint(unloader nodes.NodeCommandSender) echo.Handler var req struct { Backend string `json:"backend"` BackendGalleries string `json:"backend_galleries,omitempty"` + URI string `json:"uri,omitempty"` + Name string `json:"name,omitempty"` + Alias string `json:"alias,omitempty"` } - if err := c.Bind(&req); err != nil || req.Backend == "" { - return c.JSON(http.StatusBadRequest, nodeError(http.StatusBadRequest, "backend name required")) + if err := c.Bind(&req); err != nil { + return c.JSON(http.StatusBadRequest, nodeError(http.StatusBadRequest, "invalid request body")) } - reply, err := unloader.InstallBackend(nodeID, req.Backend, "", req.BackendGalleries, "", "", "") + // Either a gallery backend name or a direct URI must be supplied. 
+ if req.Backend == "" && req.URI == "" { + return c.JSON(http.StatusBadRequest, nodeError(http.StatusBadRequest, "backend name or uri required")) + } + reply, err := unloader.InstallBackend(nodeID, req.Backend, "", req.BackendGalleries, req.URI, req.Name, req.Alias) if err != nil { - xlog.Error("Failed to install backend on node", "node", nodeID, "backend", req.Backend, "error", err) + xlog.Error("Failed to install backend on node", "node", nodeID, "backend", req.Backend, "uri", req.URI, "error", err) return c.JSON(http.StatusInternalServerError, nodeError(http.StatusInternalServerError, "failed to install backend on node")) } if !reply.Success { - xlog.Error("Backend install failed on node", "node", nodeID, "backend", req.Backend, "error", reply.Error) + xlog.Error("Backend install failed on node", "node", nodeID, "backend", req.Backend, "uri", req.URI, "error", reply.Error) return c.JSON(http.StatusInternalServerError, nodeError(http.StatusInternalServerError, "backend installation failed")) } return c.JSON(http.StatusOK, map[string]string{"message": "backend installed"}) diff --git a/core/http/react-ui/src/components/NodeInstallPicker.jsx b/core/http/react-ui/src/components/NodeInstallPicker.jsx new file mode 100644 index 000000000..50fccc2b2 --- /dev/null +++ b/core/http/react-ui/src/components/NodeInstallPicker.jsx @@ -0,0 +1,668 @@ +import { useState, useMemo, useEffect, useRef } from 'react' +import Modal from './Modal' +import SearchableSelect from './SearchableSelect' +import { nodesApi } from '../utils/api' + +// NodeInstallPicker is the single multi-node install surface used both from +// the Backends gallery split-button and from the "Install on more nodes" `+` +// affordance in the Nodes column. Submit fires N parallel per-node install +// calls; rows transition inline so the user sees per-node success/failure +// without leaving the modal. 
+// +// Props: +// open — controls visibility +// onClose — close handler (header X / Cancel / Esc / backdrop) +// onComplete — fired after at least one node install succeeded; +// gallery uses this to refetch and update the Nodes +// column without a manual reload +// backend — { name, isMeta, capabilities, metaBackendFor } +// nodes — BackendNode[] from /api/nodes +// installedNodeIds — Set/array of node IDs that already have this backend +// initialSelection — optional pre-selected node IDs (e.g. "missing nodes" +// when opened from the Nodes column `+` affordance) + +const STATUS_LABELS = { healthy: 'Healthy', draining: 'Draining', unhealthy: 'Unhealthy', offline: 'Offline' } + +function formatVRAM(bytes) { + if (!bytes || bytes === 0) return null + const gb = bytes / (1024 * 1024 * 1024) + return gb >= 1 ? `${gb.toFixed(1)} GB` : `${(bytes / (1024 * 1024)).toFixed(0)} MB` +} + +function gpuVendorLabel(vendor) { + const labels = { nvidia: 'NVIDIA', amd: 'AMD', intel: 'Intel', vulkan: 'Vulkan' } + return labels[vendor] || null +} + +// hardwareTargetOf parses the capability key that points to a concrete +// variant in the parent meta's CapabilitiesMap. e.g. cpu-llama-cpp comes +// from {"cpu": "cpu-llama-cpp"} → "cpu". Falls back to "" when the parent +// is unknown (the gallery list payload still gives us metaBackendFor). +function hardwareTargetOf(backend, allBackends) { + if (!backend || !backend.name || backend.isMeta) return '' + const parentName = backend.metaBackendFor + if (!parentName) return '' + const parent = (allBackends || []).find(b => b.name === parentName || b.id === parentName) + if (!parent || !parent.capabilities) return '' + for (const [cap, concreteName] of Object.entries(parent.capabilities)) { + if (concreteName === backend.name) return cap + } + return '' +} + +// humanTargetLabel turns a capability key into a user-facing phrase used in +// the picker header note: "CPU build", "CUDA 12 build", etc. 
Keep it +// concrete and product-recognisable, not the raw token from the gallery. +function humanTargetLabel(target) { + if (!target) return 'hardware-specific build' + const t = target.toLowerCase() + if (t.startsWith('cpu') || t === 'default') return 'CPU build' + if (t.includes('cuda-13') || t.includes('cuda13')) return 'CUDA 13 build' + if (t.includes('cuda-12') || t.includes('cuda12')) return 'CUDA 12 build' + if (t.includes('cuda')) return 'NVIDIA CUDA build' + if (t.includes('l4t')) return 'NVIDIA Jetson (L4T) build' + if (t.includes('nvidia')) return 'NVIDIA build' + if (t.includes('rocm') || t.includes('amd')) return 'AMD ROCm build' + if (t.includes('metal')) return 'Apple Metal build' + if (t.includes('sycl') || t.includes('intel')) return 'Intel SYCL build' + if (t.includes('vulkan')) return 'Vulkan build' + if (t.includes('darwin-x86')) return 'macOS x86 build' + return 'hardware-specific build' +} + +// suitabilityFor returns the picker's per-row suitability state for the +// requested backend. Already-installed wins over compatible/override so +// the user sees a single signal per row. +function suitabilityFor({ node, backend, hardwareTarget, alreadyInstalled }) { + if (alreadyInstalled) return 'installed' + // backend can be null on the first render before pickerBackend is set — + // this function is invoked from useMemo, which runs regardless of the + // outer open guard. Treat missing data as "compatible" so the placeholder + // render doesn't blow up; the picker won't actually paint anything until + // the early-return below the hooks fires. + if (!backend || backend.isMeta || !hardwareTarget) return 'compatible' + const vendor = (node.gpu_vendor || '').toLowerCase() + const t = hardwareTarget.toLowerCase() + if (t.startsWith('cpu') || t === 'default') { + // CPU builds always run; they're never marked Override (running CPU on a + // GPU node is the headline use case the user is choosing intentionally). 
+ return 'compatible' + } + if (t.includes('nvidia') || t.includes('cuda') || t.includes('l4t')) { + return vendor === 'nvidia' ? 'compatible' : 'override' + } + if (t.includes('amd') || t.includes('rocm') || t.includes('hip')) { + return vendor === 'amd' ? 'compatible' : 'override' + } + if (t.includes('intel') || t.includes('sycl')) { + return vendor === 'intel' ? 'compatible' : 'override' + } + if (t.includes('metal') || t.includes('darwin')) { + // No vendor reporting for Metal; trust the user. + return 'compatible' + } + return 'compatible' +} + +export default function NodeInstallPicker({ + open, onClose, onComplete, + backend, + nodes = [], + allBackends = [], + installedNodeIds = [], + initialSelection, + addToast, +}) { + const [search, setSearch] = useState('') + const [showHealthy, setShowHealthy] = useState(true) + const [showDraining, setShowDraining] = useState(false) + const [selected, setSelected] = useState(() => new Set()) + const [overrideVariant, setOverrideVariant] = useState('') // chosen concrete name + const [overrideExpanded, setOverrideExpanded] = useState(false) + const [submitting, setSubmitting] = useState(false) + const [showMismatchConfirm, setShowMismatchConfirm] = useState(false) + // Per-node submission state: { [nodeId]: { status: 'pending'|'installing'|'done'|'error', error? , version? } } + const [perNode, setPerNode] = useState({}) + const headerInputRef = useRef(null) + + // Backend-derived metadata used throughout the picker. 
+ const hardwareTarget = useMemo(() => hardwareTargetOf(backend, allBackends), [backend, allBackends]) + const targetLabel = humanTargetLabel(hardwareTarget) + const concreteVariants = useMemo(() => { + if (!backend?.isMeta || !backend.capabilities) return [] + return Object.entries(backend.capabilities).map(([cap, concrete]) => ({ + value: concrete, + label: `${concrete} · ${cap}`, + })) + }, [backend]) + + // Pending nodes are surgically removed from the list — they can't accept + // installs until approved. Surface the count instead of dead-disabled rows. + const pendingCount = nodes.filter(n => n.status === 'pending').length + const backendNodes = nodes.filter(n => + (!n.node_type || n.node_type === 'backend') && n.status !== 'pending' + ) + + const installedSet = useMemo(() => { + const s = new Set() + if (Array.isArray(installedNodeIds)) installedNodeIds.forEach(id => s.add(id)) + else if (installedNodeIds && typeof installedNodeIds.has === 'function') { + installedNodeIds.forEach(id => s.add(id)) + } + return s + }, [installedNodeIds]) + + const filteredNodes = useMemo(() => { + let list = backendNodes + if (!showHealthy) list = list.filter(n => n.status !== 'healthy') + if (!showDraining) list = list.filter(n => n.status !== 'draining') + if (search.trim()) { + const q = search.toLowerCase() + list = list.filter(n => + (n.name || '').toLowerCase().includes(q) || + Object.entries(n.labels || {}).some(([k, v]) => `${k}=${v}`.toLowerCase().includes(q)) + ) + } + return list + }, [backendNodes, showHealthy, showDraining, search]) + + // Pre-seed selection on open. Reset all transient state so reopening + // doesn't surface ghost progress from the prior submit. 
+ useEffect(() => { + if (!open) return + const initial = new Set() + if (Array.isArray(initialSelection)) initialSelection.forEach(id => initial.add(id)) + setSelected(initial) + setSearch('') + setOverrideVariant('') + setOverrideExpanded(false) + setPerNode({}) + setSubmitting(false) + setShowMismatchConfirm(false) + }, [open, initialSelection]) + + // Auto-expand the variant override disclosure when at least one selected + // node lacks a working GPU. This is the headline use case the feature + // exists for; surfacing it instead of hiding behind a click. + useEffect(() => { + if (!backend?.isMeta) return + const someGPUMissing = Array.from(selected).some(id => { + const n = backendNodes.find(x => x.id === id) + return n && (!n.gpu_vendor || n.gpu_vendor === '' || n.gpu_vendor === 'unknown') + }) + if (someGPUMissing && !overrideExpanded) setOverrideExpanded(true) + }, [selected, backend, backendNodes]) // eslint-disable-line react-hooks/exhaustive-deps + + // The effective backend that gets installed on each node. For + // hardware-specific backends this is just backend.name. For meta backends + // with no override, the worker picks per-node — we pass backend.name and + // the worker resolves. With an override set, the picker installs that + // exact concrete variant on every selected node. + const effectiveBackendName = overrideVariant || backend?.name + + const counts = useMemo(() => { + let already = 0, overrides = 0 + selected.forEach(id => { + const n = backendNodes.find(x => x.id === id) + if (!n) return + if (installedSet.has(id)) { already++; return } + const eff = overrideVariant + ? { name: overrideVariant, isMeta: false, metaBackendFor: backend?.name } + : backend + const target = overrideVariant ? 
hardwareTargetOf(eff, allBackends) : hardwareTarget + const s = suitabilityFor({ node: n, backend: eff, hardwareTarget: target, alreadyInstalled: false }) + if (s === 'override') overrides++ + }) + return { already, overrides, selected: selected.size } + }, [selected, backendNodes, installedSet, overrideVariant, backend, hardwareTarget, allBackends]) + + const toggle = (nodeId) => { + setSelected(prev => { + const next = new Set(prev) + next.has(nodeId) ? next.delete(nodeId) : next.add(nodeId) + return next + }) + } + + const selectAllHealthy = () => { + setSelected(new Set(filteredNodes.filter(n => n.status === 'healthy').map(n => n.id))) + } + const selectCompatible = () => { + const eff = overrideVariant + ? { name: overrideVariant, isMeta: false, metaBackendFor: backend?.name } + : backend + const target = overrideVariant ? hardwareTargetOf(eff, allBackends) : hardwareTarget + setSelected(new Set( + filteredNodes + .filter(n => suitabilityFor({ node: n, backend: eff, hardwareTarget: target, alreadyInstalled: false }) === 'compatible') + .map(n => n.id) + )) + } + const clearSelection = () => setSelected(new Set()) + + const submit = async () => { + if (selected.size === 0 || submitting) return + if (counts.overrides > 0 && !showMismatchConfirm) { + setShowMismatchConfirm(true) + return + } + setShowMismatchConfirm(false) + setSubmitting(true) + const ids = Array.from(selected) + setPerNode(prev => { + const next = { ...prev } + ids.forEach(id => { next[id] = { status: 'installing' } }) + return next + }) + + const results = await Promise.allSettled(ids.map(id => + nodesApi.installBackend(id, effectiveBackendName) + .then(r => ({ id, ok: true, message: r?.message })) + .catch(err => ({ id, ok: false, error: err?.message || 'install failed' })) + )) + + let successCount = 0, failCount = 0 + setPerNode(prev => { + const next = { ...prev } + for (const r of results) { + if (r.status !== 'fulfilled') continue + const v = r.value + if (v.ok) { + next[v.id] = { 
status: 'done' } + successCount++ + } else { + next[v.id] = { status: 'error', error: v.error } + failCount++ + } + } + return next + }) + setSubmitting(false) + + if (successCount > 0 && onComplete) onComplete() + + if (failCount === 0) { + addToast?.(`Installed on ${successCount} node${successCount === 1 ? '' : 's'}`, 'success') + setTimeout(() => onClose?.(), 800) + } else if (successCount === 0) { + addToast?.(`Install failed on all ${failCount} node${failCount === 1 ? '' : 's'}`, 'error') + } else { + addToast?.(`Installed on ${successCount}, failed on ${failCount}`, 'warning') + } + } + + const retryFailed = async () => { + const failedIds = Object.entries(perNode) + .filter(([, v]) => v.status === 'error') + .map(([id]) => id) + if (failedIds.length === 0) return + setSelected(new Set(failedIds)) + // Replace state for failed rows so they show "installing" again, not stale errors. + setPerNode(prev => { + const next = { ...prev } + failedIds.forEach(id => { next[id] = { status: 'installing' } }) + return next + }) + setSubmitting(true) + const results = await Promise.allSettled(failedIds.map(id => + nodesApi.installBackend(id, effectiveBackendName) + .then(r => ({ id, ok: true, message: r?.message })) + .catch(err => ({ id, ok: false, error: err?.message || 'install failed' })) + )) + let successCount = 0, failCount = 0 + setPerNode(prev => { + const next = { ...prev } + for (const r of results) { + if (r.status !== 'fulfilled') continue + const v = r.value + if (v.ok) { next[v.id] = { status: 'done' }; successCount++ } + else { next[v.id] = { status: 'error', error: v.error }; failCount++ } + } + return next + }) + setSubmitting(false) + if (successCount > 0 && onComplete) onComplete() + if (failCount === 0) { + addToast?.(`Installed on ${successCount} node${successCount === 1 ? 
'' : 's'}`, 'success') + setTimeout(() => onClose?.(), 800) + } + } + + const doneCount = Object.values(perNode).filter(v => v.status === 'done').length + const errorCount = Object.values(perNode).filter(v => v.status === 'error').length + const totalAttempted = Object.keys(perNode).length + + if (!open || !backend) return null + + const noNodes = backendNodes.length === 0 + + return ( + +
+

+ + Install {backend.name} + {backend.isMeta ? ( + Auto-resolving + ) : ( + Hardware-specific + )} +

+ +
+ +
+ {!backend.isMeta && ( +
+ + + {targetLabel}. Install only on nodes where you want this build to run. + {hardwareTarget && ` Targets: ${humanTargetLabel(hardwareTarget).replace(' build', '')}.`} + +
+ )} + + {noNodes ? ( +
+
+

No backend nodes available

+

+ Approve pending workers or register new ones. + {pendingCount > 0 && ` (${pendingCount} awaiting approval.)`} +

+ + Manage nodes + +
+ ) : ( + <> + {/* Filter row */} +
+
+ + setSearch(e.target.value)} + /> +
+ + + {selected.size > 0 && ( + + )} +
+ + {/* Variant override (auto-resolving only) */} + {backend.isMeta && concreteVariants.length > 0 && ( +
+ + {overrideExpanded && ( +
+

+ By default each node picks its own variant. Override to install one specific variant on every selected node — useful when GPU detection fails on a node and you want the CPU build there instead. +

+ +
+ )} +
+ )} + + {/* Node table */} +
+ + + + + + + + + + + + {filteredNodes.map(node => { + const installed = installedSet.has(node.id) + const eff = overrideVariant + ? { name: overrideVariant, isMeta: false, metaBackendFor: backend.name } + : backend + const target = overrideVariant ? hardwareTargetOf(eff, allBackends) : hardwareTarget + const suit = suitabilityFor({ node, backend: eff, hardwareTarget: target, alreadyInstalled: installed }) + const isSel = selected.has(node.id) + const rowState = perNode[node.id] + const vendor = gpuVendorLabel(node.gpu_vendor) + const totalVRAM = formatVRAM(node.total_vram) + const totalRAM = formatVRAM(node.total_ram) + return ( + + + + + + + + ) + })} + {filteredNodes.length === 0 && ( + + + + )} + +
+ 0 && filteredNodes.every(n => selected.has(n.id))} + onChange={(e) => { + setSelected(prev => { + const next = new Set(prev) + if (e.target.checked) filteredNodes.forEach(n => next.add(n.id)) + else filteredNodes.forEach(n => next.delete(n.id)) + return next + }) + }} + /> + NodeStatusHardwareSuitability
+ toggle(node.id)} + /> + +
+ {node.name} + {node.labels && Object.keys(node.labels).length > 0 && ( +
+ {Object.entries(node.labels).slice(0, 3).map(([k, v]) => ( + {k}={v} + ))} + {Object.keys(node.labels).length > 3 && ( + + +{Object.keys(node.labels).length - 3} + + )} +
+ )} +
+
+ + {STATUS_LABELS[node.status] || node.status} + + + {totalVRAM ? ( + <>{vendor && {vendor}}{totalVRAM} + ) : totalRAM ? ( + CPU · {totalRAM} + ) : } + + {rowState?.status === 'installing' ? ( + + Installing + + ) : rowState?.status === 'done' ? ( + + Installed + + ) : rowState?.status === 'error' ? ( + + ) : suit === 'installed' ? ( + + Installed + + ) : suit === 'override' ? ( + + Override + + ) : ( + + Compatible + + )} +
+ No nodes match the current filters. +
+
+ + {pendingCount > 0 && ( +

+ +{pendingCount} awaiting approval — approve from Nodes. +

+ )} + + {/* Mismatch confirm */} + {showMismatchConfirm && ( +
+

+ Installing {targetLabel.toLowerCase()} on {counts.overrides} node{counts.overrides === 1 ? '' : 's'} that don't match. Those nodes will run inference on the chosen build, not their native GPU. Continue? +

+
+ + +
+
+ )} + + )} +
+ + {!noNodes && ( +
+
+ {totalAttempted > 0 ? ( + <> + {doneCount} of {totalAttempted} done + {errorCount > 0 && ( + <> · {errorCount} failed + )} + + ) : ( + <> + {counts.selected} {counts.selected === 1 ? 'node' : 'nodes'} selected + {counts.already > 0 && <> · {counts.already} already installed} + {counts.overrides > 0 && <> · {counts.overrides} override{counts.overrides === 1 ? '' : 's'}} + + )} +
+ {errorCount > 0 && !submitting && ( + + )} + + +
+ )} +
+ ) +} diff --git a/core/http/react-ui/src/hooks/useDistributedMode.js b/core/http/react-ui/src/hooks/useDistributedMode.js new file mode 100644 index 000000000..49ad6eace --- /dev/null +++ b/core/http/react-ui/src/hooks/useDistributedMode.js @@ -0,0 +1,40 @@ +import { useState, useEffect, useCallback } from 'react' +import { nodesApi } from '../utils/api' + +// useDistributedMode probes /api/nodes to decide whether the running LocalAI +// is in distributed mode. The endpoint returns 503 when distributed mode is +// disabled — we treat any failure as standalone, mirroring the detection +// pattern in pages/Nodes.jsx so UI behaviour matches the Nodes page. +// +// Returns: +// enabled — true when the cluster API answered OK at least once +// nodes — the most recent /api/nodes response (array; possibly empty) +// loading — true until the first probe completes +// refetch — manual trigger; the picker calls this after install/delete +// +// Components that need a live nodes list (e.g. install picker) re-call +// refetch after operations complete. The hook does not poll on its own — +// the Nodes page handles its own 5s polling and the Backends gallery only +// needs a one-shot read on mount. +export function useDistributedMode() { + const [enabled, setEnabled] = useState(false) + const [nodes, setNodes] = useState([]) + const [loading, setLoading] = useState(true) + + const probe = useCallback(async () => { + try { + const data = await nodesApi.list() + setNodes(Array.isArray(data) ? 
data : []) + setEnabled(true) + } catch { + setEnabled(false) + setNodes([]) + } finally { + setLoading(false) + } + }, []) + + useEffect(() => { probe() }, [probe]) + + return { enabled, nodes, loading, refetch: probe } +} diff --git a/core/http/react-ui/src/pages/Backends.jsx b/core/http/react-ui/src/pages/Backends.jsx index ea34ffdbb..a8ef1ac98 100644 --- a/core/http/react-ui/src/pages/Backends.jsx +++ b/core/http/react-ui/src/pages/Backends.jsx @@ -1,18 +1,24 @@ -import { useState, useEffect, useCallback } from 'react' -import { useNavigate, useOutletContext } from 'react-router-dom' -import { backendsApi } from '../utils/api' +import { useState, useEffect, useCallback, useRef } from 'react' +import { useNavigate, useOutletContext, useSearchParams } from 'react-router-dom' +import { backendsApi, nodesApi } from '../utils/api' import { useDebouncedCallback } from '../hooks/useDebounce' import React from 'react' import { useOperations } from '../hooks/useOperations' +import { useDistributedMode } from '../hooks/useDistributedMode' import LoadingSpinner from '../components/LoadingSpinner' import { renderMarkdown } from '../utils/markdown' import ConfirmDialog from '../components/ConfirmDialog' import Toggle from '../components/Toggle' +import NodeDistributionChip from '../components/NodeDistributionChip' +import NodeInstallPicker from '../components/NodeInstallPicker' +import Popover from '../components/Popover' export default function Backends() { const { addToast } = useOutletContext() const navigate = useNavigate() + const [searchParams, setSearchParams] = useSearchParams() const { operations } = useOperations() + const { enabled: distributedEnabled, nodes: clusterNodes, refetch: refetchNodes } = useDistributedMode() const [loading, setLoading] = useState(true) const [search, setSearch] = useState('') const [filter, setFilter] = useState('') @@ -32,6 +38,31 @@ export default function Backends() { const [showAllBackends, setShowAllBackends] = useState(false) 
const [showDevelopment, setShowDevelopment] = useState(false) const [preferDevLoaded, setPreferDevLoaded] = useState(false) + const [pickerBackend, setPickerBackend] = useState(null) + const [pickerInitialSelection, setPickerInitialSelection] = useState([]) + const [splitMenuFor, setSplitMenuFor] = useState(null) + // Anchor ref for the currently-open split-button chevron. Only one row's + // menu can be open at a time, so a single ref is enough — re-attached + // whenever splitMenuFor changes to a different row index. + const splitMenuAnchorRef = useRef(null) + + // Target-node mode: set when navigated from /app/nodes via "+ Add backend". + // The gallery page header banners the scope; rows collapse their split-button + // to a single Install-on-this-node action; manual install posts to the + // per-node endpoint. + const targetNodeId = searchParams.get('target') || '' + const targetNode = targetNodeId + ? clusterNodes.find(n => n.id === targetNodeId) || null + : null + + const clearTarget = useCallback(() => { + const next = new URLSearchParams(searchParams) + next.delete('target') + setSearchParams(next, { replace: true }) + }, [searchParams, setSearchParams]) + + // The Popover component handles outside-click + Escape + focus return, + // so we don't reimplement it here. const fetchBackends = useCallback(async () => { try { @@ -127,10 +158,54 @@ export default function Backends() { try { await backendsApi.install(id) } catch (err) { + // Distributed-mode 409 guard: surface the human message and steer the + // user to the picker rather than failing silently. The error body has + // a `code` field of "concrete_backend_requires_target". 
+ const isConcreteGuard = err?.payload?.code === 'concrete_backend_requires_target' + || (err?.message || '').includes('hardware-specific build') + if (isConcreteGuard && distributedEnabled) { + const b = allBackends.find(x => x.id === id || x.name === id) + if (b) { + openPicker(b) + return + } + } addToast(`Install failed: ${err.message}`, 'error') } } + // Install a single gallery backend on a specific node, used in target-node + // mode (the URL has ?target= set from the Nodes page entry point). + const handleInstallOnTarget = async (id) => { + if (!targetNode) return + try { + await nodesApi.installBackend(targetNode.id, id) + addToast(`Installing ${id} on ${targetNode.name}…`, 'info') + // Per-node install is request-reply, not part of the global jobs feed — + // refetch to reflect the new Nodes column state. + setTimeout(() => { fetchBackends(); refetchNodes() }, 600) + } catch (err) { + addToast(`Install failed on ${targetNode.name}: ${err.message}`, 'error') + } + } + + const openPicker = (b, initialSelection = []) => { + setPickerBackend(b) + setPickerInitialSelection(initialSelection) + setSplitMenuFor(null) + } + + // Returns the IDs of nodes that don't yet have this backend installed. + // Used by the Nodes column "+" affordance to pre-select missing nodes. + const missingNodesFor = (b) => { + const installed = new Set((b?.nodes || []).map(n => n.node_id ?? 
n.NodeID)) + return clusterNodes + .filter(n => (!n.node_type || n.node_type === 'backend') + && n.status === 'healthy' + && !installed.has(n.id)) + .map(n => n.id) + } + const handleDelete = async (id) => { setConfirmDialog({ title: 'Delete Backend', @@ -179,10 +254,26 @@ export default function Backends() { e.preventDefault() if (!manualUri.trim()) { addToast('Please enter a URI', 'warning'); return } try { - const body = { uri: manualUri.trim() } - if (manualName.trim()) body.name = manualName.trim() - if (manualAlias.trim()) body.alias = manualAlias.trim() - await backendsApi.installExternal(body) + if (targetNode) { + // Target-node mode: route the manual install to the per-node endpoint + // so the backend lands only on this worker, not the whole cluster. + await nodesApi.installBackend( + targetNode.id, + manualName.trim() || '', + { + uri: manualUri.trim(), + name: manualName.trim() || undefined, + alias: manualAlias.trim() || undefined, + }, + ) + addToast(`Installing on ${targetNode.name}…`, 'info') + setTimeout(() => { fetchBackends(); refetchNodes() }, 600) + } else { + const body = { uri: manualUri.trim() } + if (manualName.trim()) body.name = manualName.trim() + if (manualAlias.trim()) body.alias = manualAlias.trim() + await backendsApi.installExternal(body) + } setManualUri('') setManualName('') setManualAlias('') @@ -225,6 +316,31 @@ export default function Backends() { return (
+ {/* Target-node banner: when this gallery is scoped to one node via + ?target= (entered from /app/nodes), show the scope clearly and + give a fast way to clear it. Visually a primary-tinted strip so the + user knows they're in a special mode without it feeling alarming. */} + {targetNode && ( +
+ + + Installing only on {targetNode.name} + + + +
+ )} + {/* Header */}
@@ -377,6 +493,7 @@ export default function Backends() { Repository License Status + {distributedEnabled && !targetNode && Nodes} Actions @@ -446,7 +563,10 @@ export default function Backends() { ) : '-'} - {/* Status */} + {/* Status — in distributed mode the Nodes column is the + installed signal, so we drop the global "Installed" + badge here and only keep operation-progress / update + signals to avoid stacking 6 badges in one cell. */} {isProcessing ? (
@@ -464,9 +584,16 @@ export default function Backends() {
) : b.installed ? (
- - Installed - + {!distributedEnabled && ( + + Installed + + )} + {b.version && ( + + v{b.version} + + )} {upgrades[b.name] && ( @@ -481,10 +608,67 @@ export default function Backends() { )} + {/* Nodes column (distributed mode only, hidden in target + mode since it's redundant with the banner). The chip + is read-only inspection; the adjacent + button is the + write affordance — keeping them visually separate so + users don't accidentally trigger the picker by clicking + to read distribution. */} + {distributedEnabled && !targetNode && ( + +
+ + {(() => { + const missing = missingNodesFor(b) + if (missing.length === 0 || isProcessing) return null + return ( + + ) + })()} +
+ + )} + {/* Actions */}
e.stopPropagation()}> - {b.installed ? ( + {targetNode ? ( + // Target-node mode: collapse to a single per-node + // action. The split-button is overkill when scope is + // already pinned by the URL. + (b.nodes || []).some(n => (n.node_id ?? n.NodeID) === targetNode.id) ? ( + <> + + + + ) : ( + + ) + ) : b.installed ? ( <> {upgrades[b.name] ? ( + ) : distributedEnabled ? ( + // Split-button. Auto-resolving (meta) keeps fan-out + // as the primary; hardware-specific routes the + // primary directly to the picker — fan-out for a + // CPU build is the silent footgun this guard exists + // to prevent. Both share a chevron menu for the + // alternate path. + b.isMeta ? ( +
+ + +
+ ) : ( + + ) ) : ( +
+ + + { setPickerBackend(null); setPickerInitialSelection([]) }} + onComplete={() => { fetchBackends(); refetchNodes() }} + backend={pickerBackend} + nodes={clusterNodes} + allBackends={allBackends} + installedNodeIds={(pickerBackend?.nodes || []).map(n => n.node_id ?? n.NodeID)} + initialSelection={pickerInitialSelection} + addToast={addToast} + />
) } diff --git a/core/http/react-ui/src/pages/Nodes.jsx b/core/http/react-ui/src/pages/Nodes.jsx index 0547ed789..8a86d28f2 100644 --- a/core/http/react-ui/src/pages/Nodes.jsx +++ b/core/http/react-ui/src/pages/Nodes.jsx @@ -845,10 +845,30 @@ export default function Nodes() { )} -

- - Installed Backends -

+
+

+ + Installed Backends +

+ +
{!backends ? ( ) : backends.length === 0 ? ( diff --git a/core/http/react-ui/src/utils/api.js b/core/http/react-ui/src/utils/api.js index b711b0593..dd3dca0c3 100644 --- a/core/http/react-ui/src/utils/api.js +++ b/core/http/react-ui/src/utils/api.js @@ -463,7 +463,17 @@ export const nodesApi = { approve: (id) => postJSON(API_CONFIG.endpoints.nodeApprove(id), {}), getModels: (id) => fetchJSON(API_CONFIG.endpoints.nodeModels(id)), getBackends: (id) => fetchJSON(API_CONFIG.endpoints.nodeBackends(id)), - installBackend: (id, backend) => postJSON(API_CONFIG.endpoints.nodeBackendsInstall(id), { backend }), + // installBackend installs a gallery backend on a single node. opts can + // override the gallery path and supply a direct URI (OCI image / URL / file + // path) plus an optional name+alias, mirroring the standalone /backends/ + // install-external surface but scoped to one node. + installBackend: (id, backend, opts = {}) => postJSON(API_CONFIG.endpoints.nodeBackendsInstall(id), { + backend, + ...(opts.uri ? { uri: opts.uri } : {}), + ...(opts.name ? { name: opts.name } : {}), + ...(opts.alias ? { alias: opts.alias } : {}), + ...(opts.backend_galleries ? 
{ backend_galleries: opts.backend_galleries } : {}), + }), deleteBackend: (id, backend) => postJSON(API_CONFIG.endpoints.nodeBackendsDelete(id), { backend }), getBackendLogs: (id) => fetchJSON(API_CONFIG.endpoints.nodeBackendLogs(id)), getBackendLogLines: (id, modelId) => fetchJSON(API_CONFIG.endpoints.nodeBackendLogsModel(id, modelId)), diff --git a/core/http/routes/localai.go b/core/http/routes/localai.go index 59b7c0f93..8ffc65a71 100644 --- a/core/http/routes/localai.go +++ b/core/http/routes/localai.go @@ -61,7 +61,7 @@ func RegisterLocalAIRoutes(router *echo.Echo, appConfig.SystemState, galleryService, app.UpgradeChecker()) - router.POST("/backends/apply", backendGalleryEndpointService.ApplyBackendEndpoint(), adminMiddleware) + router.POST("/backends/apply", backendGalleryEndpointService.ApplyBackendEndpoint(appConfig.SystemState), adminMiddleware) router.POST("/backends/delete/:name", backendGalleryEndpointService.DeleteBackendEndpoint(), adminMiddleware) router.GET("/backends", backendGalleryEndpointService.ListBackendsEndpoint(), adminMiddleware) router.GET("/backends/available", backendGalleryEndpointService.ListAvailableBackendsEndpoint(appConfig.SystemState), adminMiddleware) diff --git a/core/http/routes/ui_api.go b/core/http/routes/ui_api.go index 4c76a5906..e04fc4c2d 100644 --- a/core/http/routes/ui_api.go +++ b/core/http/routes/ui_api.go @@ -40,6 +40,25 @@ const ( ) +// metaParentOf returns the name of the auto-resolving (meta) backend that +// declares `name` as one of its hardware-specific variants in its +// CapabilitiesMap, or "" if there is no such parent. The install picker uses +// this to render hints like "CPU build of llama-cpp" without re-walking the +// whole gallery on the client side. 
+func metaParentOf(name string, backends gallery.GalleryElements[*gallery.GalleryBackend]) string { + for _, b := range backends { + if !b.IsMeta() { + continue + } + for _, concreteName := range b.CapabilitiesMap { + if concreteName == name { + return b.Name + } + } + } + return "" +} + // getDirectorySize calculates the total size of files in a directory func getDirectorySize(path string) (int64, error) { var totalSize int64 entries, err := os.ReadDir(path) @@ -998,23 +1017,37 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model } } + // Per-node distribution + parent meta lookup for the install picker. + // `nodes` populates the Nodes column on the gallery; `metaBackendFor` + // lets the picker name the parent (e.g. "CPU build of llama-cpp") + // without re-walking the whole gallery on the client. + var perNode []gallery.NodeBackendRef + if installedBackends != nil { + if sb, ok := installedBackends.Get(b.Name); ok { + perNode = sb.Nodes + } + } + backendsJSON = append(backendsJSON, map[string]any{ - "id": backendID, - "name": b.Name, - "description": b.Description, - "icon": b.Icon, - "license": b.License, - "urls": b.URLs, - "tags": b.Tags, - "gallery": b.Gallery.Name, - "installed": b.Installed, - "version": b.Version, - "processing": currentlyProcessing, - "jobID": jobID, - "isDeletion": isDeletionOp, - "isMeta": b.IsMeta(), - "isAlias": aliasedByMeta[b.Name], - "isDevelopment": b.IsDevelopment(devSuffix), + "id": backendID, + "name": b.Name, + "description": b.Description, + "icon": b.Icon, + "license": b.License, + "urls": b.URLs, + "tags": b.Tags, + "gallery": b.Gallery.Name, + "installed": b.Installed, + "version": b.Version, + "processing": currentlyProcessing, + "jobID": jobID, + "isDeletion": isDeletionOp, + "isMeta": b.IsMeta(), + "isAlias": aliasedByMeta[b.Name], + "isDevelopment": b.IsDevelopment(devSuffix), + "capabilities": b.CapabilitiesMap, + "metaBackendFor": metaParentOf(b.Name, backends), + "nodes": perNode, }) } 
diff --git a/core/services/galleryop/managers.go 
b/core/services/galleryop/managers.go index 5adbef8c7..d1ad9ef0c 100644 --- a/core/services/galleryop/managers.go +++ b/core/services/galleryop/managers.go @@ -22,4 +22,9 @@ type BackendManager interface { ListBackends() (gallery.SystemBackends, error) UpgradeBackend(ctx context.Context, name string, progressCb ProgressCallback) error CheckUpgrades(ctx context.Context) (map[string]gallery.UpgradeInfo, error) + // IsDistributed reports whether installs fan out across worker nodes. + // The HTTP layer uses this to refuse hardware-specific (non-meta) installs + // on /api/backends/apply in distributed mode — a CPU build silently + // landing on every GPU node is the footgun this guards against. + IsDistributed() bool } diff --git a/core/services/galleryop/managers_local.go b/core/services/galleryop/managers_local.go index 2d86f2dc6..a73bb1ff9 100644 --- a/core/services/galleryop/managers_local.go +++ b/core/services/galleryop/managers_local.go @@ -108,3 +108,5 @@ func (b *LocalBackendManager) InstallBackend(ctx context.Context, op *Management return gallery.InstallBackendFromGallery(ctx, b.backendGalleries, b.systemState, b.modelLoader, op.GalleryElementName, progressCb, true) } + +func (b *LocalBackendManager) IsDistributed() bool { return false } diff --git a/core/services/nodes/managers_distributed.go b/core/services/nodes/managers_distributed.go index 54aa3ffc7..e18a8d136 100644 --- a/core/services/nodes/managers_distributed.go +++ b/core/services/nodes/managers_distributed.go @@ -364,6 +364,11 @@ func (d *DistributedBackendManager) UpgradeBackend(ctx context.Context, name str return result.Err() } +// IsDistributed reports that installs from this manager fan out across the +// cluster. The HTTP layer reads this to gate hardware-specific installs on +// /api/backends/apply (which would otherwise silently land on every node). 
+func (d *DistributedBackendManager) IsDistributed() bool { return true } + // CheckUpgrades checks for available backend upgrades across the cluster. // // The previous implementation delegated to d.local, which called diff --git a/core/services/nodes/managers_distributed_test.go b/core/services/nodes/managers_distributed_test.go index 968a932b2..db390aa6c 100644 --- a/core/services/nodes/managers_distributed_test.go +++ b/core/services/nodes/managers_distributed_test.go @@ -108,6 +108,7 @@ func (stubLocalBackendManager) UpgradeBackend(_ context.Context, _ string, _ gal func (stubLocalBackendManager) CheckUpgrades(_ context.Context) (map[string]gallery.UpgradeInfo, error) { return nil, nil } +func (stubLocalBackendManager) IsDistributed() bool { return false } var _ = Describe("DistributedBackendManager", func() { var ( diff --git a/docker-compose.distributed.yaml b/docker-compose.distributed.yaml index b8a081f1a..9b66d0d5f 100644 --- a/docker-compose.distributed.yaml +++ b/docker-compose.distributed.yaml @@ -60,6 +60,13 @@ services: # Auth (required for distributed mode — must use PostgreSQL) LOCALAI_AUTH: "true" LOCALAI_AUTH_DATABASE_URL: "postgresql://localai:localai@postgres:5432/localai?sslmode=disable" + # Force pure-Go DNS resolver. The default cgo resolver follows the + # container's nsswitch.conf and ends up forwarding to host + # systemd-resolved (127.0.0.53), which isn't reachable from inside + # the container — failing every postgres/nats hostname lookup at + # boot. The pure-Go path reads /etc/resolv.conf directly and uses + # Docker's embedded DNS at 127.0.0.11. 
+ GODEBUG: "netdns=go" # Paths MODELS_PATH: /models volumes: @@ -99,6 +106,7 @@ services: LOCALAI_REGISTRATION_TOKEN: "changeme" # Must match frontend token LOCALAI_HEARTBEAT_INTERVAL: "10s" LOCALAI_NATS_URL: "nats://nats:4222" + GODEBUG: "netdns=go" # See note in localai service MODELS_PATH: /models volumes: - worker_1_models:/models @@ -184,6 +192,7 @@ services: LOCALAI_REGISTER_TO: "http://localai:8080" LOCALAI_NODE_NAME: "agent-worker-1" LOCALAI_REGISTRATION_TOKEN: "changeme" # Must match frontend token + GODEBUG: "netdns=go" # See note in localai service volumes: - /var/run/docker.sock:/var/run/docker.sock depends_on: