From 551ebdb57a098dc6e25aa52d37d432b50050e3f6 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Fri, 24 Apr 2026 22:02:23 +0200
Subject: [PATCH] fix(distributed): correct VRAM/RAM reporting on NVIDIA unified-memory hosts (#9545)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Workers on NVIDIA unified-memory hardware (DGX Spark / GB10, Jetson AGX Thor,
Jetson Orin/Xavier/Nano) were reporting `available_vram=0` back to the
frontend, so the Nodes UI showed the node as fully used even when most of the
unified memory was actually free.

Three causes addressed:

* `isTegraDevice` only matched `/sys/devices/soc0/family == "Tegra"`.
  DGX Spark (SBSA) reports JEDEC codes there instead — `jep106:0426` for the
  NVIDIA manufacturer — so the Tegra/unified-memory fallback never ran.
  Renamed to `isNVIDIAIntegratedGPU` and extended to also match
  `jep106:0426[:*]` via `/sys/devices/soc0/soc_id`.

* The unified-iGPU code defaulted the device name to `"NVIDIA Jetson"` when
  `/proc/device-tree/model` was missing. That's what happens for Thor inside
  a docker container, and always on DGX Spark. New `nvidiaIntegratedGPUName`
  resolves via dt-model → `/sys/devices/soc0/machine` → `soc_id` lookup
  (`jep106:0426:8901` → `"NVIDIA GB10"`) so the Nodes UI labels the box
  correctly.

* Worker heartbeat sent `available_vram=0` (or total-as-available) when VRAM
  usage was momentarily unknown — e.g. when `nvidia-smi` intermittently
  failed with `waitid: no child processes` under containers without `--init`.
  Each such heartbeat overwrote the DB and made the UI flip to "fully used".
  `heartbeatBody` now omits `available_vram` in that case so the DB keeps its
  last good value.

Also updates the commented GPU blocks in both compose files with
`NVIDIA_DRIVER_CAPABILITIES=compute,utility`, `capabilities: [gpu, utility]`,
and `init: true`, and documents the requirement in the distributed-mode and
nvidia-l4t pages. Without `utility`, NVML/`nvidia-smi` are absent inside the
container, which is what put the DGX Spark worker into the buggy fallback in
the first place.

Detection verified on live hardware (dgx.casa / GB10 and 192.168.68.23 /
Thor) by running a cross-compiled probe of the new helpers on both host and
inside the worker container.

Assisted-by: Claude:opus-4.7 [Claude Code]
---
 core/cli/worker.go                        |  20 ++---
 docker-compose.distributed.yaml           |  12 ++-
 docker-compose.yaml                       |  15 +++-
 docs/content/features/distributed-mode.md |  23 +++++
 docs/content/reference/nvidia-l4t.md      |  22 +++++
 pkg/xsysinfo/gpu.go                       | 102 +++++++++++++++-------
 6 files changed, 149 insertions(+), 45 deletions(-)

diff --git a/core/cli/worker.go b/core/cli/worker.go
index ecc53bb72..63591c0d7 100644
--- a/core/cli/worker.go
+++ b/core/cli/worker.go
@@ -924,21 +924,21 @@ func (cmd *WorkerCMD) registrationBody() map[string]any {
 }
 
 // heartbeatBody returns the current VRAM/RAM stats for heartbeat payloads.
+//
+// When aggregate VRAM usage is unknown (no GPU, or temporary detection
+// failure), we deliberately OMIT available_vram so the frontend keeps its
+// last good value — overwriting with 0 makes the UI show the node as "fully
+// used", while reporting total-as-available lies to the scheduler about
+// free capacity.
 func (cmd *WorkerCMD) heartbeatBody() map[string]any {
-	var availVRAM uint64
+	body := map[string]any{}
 	aggregate := xsysinfo.GetGPUAggregateInfo()
 	if aggregate.TotalVRAM > 0 {
-		availVRAM = aggregate.FreeVRAM
-	} else {
-		// Fallback: report total as available (no usage tracking possible)
-		availVRAM, _ = xsysinfo.TotalAvailableVRAM()
+		body["available_vram"] = aggregate.FreeVRAM
 	}
 
-	body := map[string]any{
-		"available_vram": availVRAM,
-	}
-
-	// If no GPU, report system RAM usage instead
+	// CPU-only workers (or workers that lost GPU visibility momentarily):
+	// report system RAM so the scheduler still has capacity info.
 	if aggregate.TotalVRAM == 0 {
 		if ramInfo, err := xsysinfo.GetSystemRAMInfo(); err == nil {
 			body["available_ram"] = ramInfo.Available
diff --git a/docker-compose.distributed.yaml b/docker-compose.distributed.yaml
index 04223c4c6..b8a081f1a 100644
--- a/docker-compose.distributed.yaml
+++ b/docker-compose.distributed.yaml
@@ -110,15 +110,23 @@ services:
 
     # --- GPU Support (NVIDIA) ---
    # Uncomment the following and change the image to a CUDA variant
-    # (e.g., localai/localai:latest-gpu-nvidia-cuda-12) to enable GPU:
+    # (e.g., localai/localai:latest-gpu-nvidia-cuda-12) to enable GPU.
     #
+    # NVIDIA_DRIVER_CAPABILITIES must include `utility` so nvidia-smi / NVML
+    # are available inside the container; without it the worker cannot report
+    # free VRAM and the Nodes page will show 0 free / total used.
+    # `init: true` avoids zombie-reap races that make nvidia-smi flaky.
+    #
+    # init: true
+    # environment:
+    #   NVIDIA_DRIVER_CAPABILITIES: "compute,utility"
     # deploy:
     #   resources:
     #     reservations:
     #       devices:
     #         - driver: nvidia.com/gpu
     #           count: all
-    #           capabilities: [gpu]
+    #           capabilities: [gpu, utility]
 
     # --- Shared Volume Mode (optional) ---
     # If all services run on the same Docker host, you can skip gRPC file transfer
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 7897509d6..a432a699a 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -40,23 +40,32 @@ services:
       # - https://gist.githubusercontent.com/mudler/ad601a0488b497b69ec549150d9edd18/raw/a8a8869ef1bb7e3830bf5c0bae29a0cce991ff8d/phi-2.yaml
       - phi-2
     # For NVIDIA GPU support with CDI (recommended for NVIDIA Container Toolkit 1.14+):
-    # Uncomment the following deploy section and use driver: nvidia.com/gpu
+    # Uncomment the following deploy section and use driver: nvidia.com/gpu.
+    # Include `utility` in capabilities so nvidia-smi / NVML are available —
+    # without it, free-VRAM reporting on discrete GPUs is unavailable and the
+    # Nodes UI will misreport memory usage.
+    # environment:
+    #   NVIDIA_DRIVER_CAPABILITIES: "compute,utility"
+    # init: true # avoids zombie-reap races that can make nvidia-smi flaky
     # deploy:
     #   resources:
     #     reservations:
     #       devices:
     #         - driver: nvidia.com/gpu
     #           count: all
-    #           capabilities: [gpu]
+    #           capabilities: [gpu, utility]
     #
     # For legacy NVIDIA driver (for older NVIDIA Container Toolkit):
+    # environment:
+    #   NVIDIA_DRIVER_CAPABILITIES: "compute,utility"
+    # init: true
     # deploy:
     #   resources:
     #     reservations:
     #       devices:
     #         - driver: nvidia
     #           count: 1
-    #           capabilities: [gpu]
+    #           capabilities: [gpu, utility]
 
     ## Uncomment for PostgreSQL-backed knowledge base (see Agents docs)
     # postgres:
diff --git a/docs/content/features/distributed-mode.md b/docs/content/features/distributed-mode.md
index 79dad2605..773aa5284 100644
--- a/docs/content/features/distributed-mode.md
+++ b/docs/content/features/distributed-mode.md
@@ -160,6 +160,29 @@ For advanced networking scenarios (NAT, load balancers, separate gRPC/HTTP ports
 | `LOCALAI_ADVERTISE_ADDR` | Public gRPC address (if different from `LOCALAI_ADDR`) | Derived from `LOCALAI_ADDR` |
 | `LOCALAI_ADVERTISE_HTTP_ADDR` | Public HTTP address (if different from gRPC host) | Derived from advertise host + HTTP port |
 
+### NVIDIA GPU support
+
+When running workers in a container, two runtime settings affect how VRAM
+usage is reported back to the frontend:
+
+- **`NVIDIA_DRIVER_CAPABILITIES` must include `utility`.** Without it, the
+  NVML library (and therefore `nvidia-smi`) is not available inside the
+  container. CUDA compute still works, but the worker cannot query free VRAM
+  and the Nodes page will show the node as fully used. Set
+  `NVIDIA_DRIVER_CAPABILITIES=compute,utility` (or, with the NVIDIA CDI
+  runtime, list `capabilities: [gpu, utility]` on the device reservation).
+
+- **Run the container with `init: true` (or `docker run --init`).** The
+  worker process becomes PID 1 in the container and cannot reap zombies on
+  its own. Without an init, `nvidia-smi` calls can fail intermittently with
+  `waitid: no child processes`, which briefly clears free-VRAM metrics.
+
+**Unified memory devices (Jetson, DGX Spark / GB10, Thor):** these SoCs
+share one physical RAM between CPU and GPU. LocalAI detects them via
+`/sys/devices/soc0/family` and `/sys/devices/soc0/soc_id` (no `nvidia-smi`
+required) and reports system-RAM figures as VRAM. Free VRAM therefore tracks
+`MemAvailable` in `/proc/meminfo`.
+
 ### Node Labels
 
 Workers can declare labels at startup for scheduling constraints:
diff --git a/docs/content/reference/nvidia-l4t.md b/docs/content/reference/nvidia-l4t.md
index 9cc81c09b..1edbe3aeb 100644
--- a/docs/content/reference/nvidia-l4t.md
+++ b/docs/content/reference/nvidia-l4t.md
@@ -78,3 +78,25 @@ docker run -e DEBUG=true -p 8080:8080 -v /data/models:/models -ti --restart=alwa
 ```
 
 Note: `/data/models` is the directory containing the models. You can replace it with the directory containing your models.
+
+## GPU reporting in distributed mode
+
+If you run a worker on a Jetson, DGX Spark (GB10), or Thor and the Nodes
+page in the frontend shows the node as fully used, check two things:
+
+1. `NVIDIA_DRIVER_CAPABILITIES` must include `utility` so `nvidia-smi` /
+   NVML work inside the container. With `--gpus all` alone (or
+   `--runtime nvidia` without extra flags) only `compute` is wired in on
+   some driver versions. Add `-e NVIDIA_DRIVER_CAPABILITIES=compute,utility`
+   to your `docker run`, or `capabilities: [gpu, utility]` in compose /
+   Kubernetes device reservations.
+2. Pass `--init` to `docker run` (or `init: true` in compose) so the
+   container has a proper PID 1 reaper — otherwise short-lived child
+   processes like `nvidia-smi` can intermittently fail with
+   `waitid: no child processes`.
+
+On unified-memory devices LocalAI auto-detects the SoC via
+`/sys/devices/soc0/{family,soc_id}` and reports system RAM as VRAM, so
+`nvidia-smi` is not strictly required for VRAM metrics. See
+[Distributed Mode → NVIDIA GPU support]({{% relref "/features/distributed-mode#nvidia-gpu-support" %}})
+for full context.
diff --git a/pkg/xsysinfo/gpu.go b/pkg/xsysinfo/gpu.go
index fe9f07aa4..e5db4404e 100644
--- a/pkg/xsysinfo/gpu.go
+++ b/pkg/xsysinfo/gpu.go
@@ -186,9 +186,10 @@ func DetectGPUVendor() (string, error) {
 		return VendorIntel, nil
 	}
 
-	// Check for NVIDIA Tegra/Jetson (no nvidia-smi on these devices)
-	if isTegraDevice() {
-		xlog.Debug("GPU vendor detected via Tegra SoC", "vendor", VendorNVIDIA)
+	// Check for NVIDIA integrated GPU (Tegra / DGX Spark / Thor) —
+	// nvidia-smi may be absent or unreliable on these unified-memory SoCs.
+	if isNVIDIAIntegratedGPU() {
+		xlog.Debug("GPU vendor detected via NVIDIA SoC", "vendor", VendorNVIDIA)
 		return VendorNVIDIA, nil
 	}
 
@@ -254,10 +255,12 @@ func GetGPUMemoryUsage() []GPUMemoryInfo {
 		gpus = append(gpus, intelGPUs...)
 	}
 
-	// Try NVIDIA Tegra/Jetson (unified memory iGPU, no nvidia-smi)
+	// Try NVIDIA integrated GPUs (Tegra Jetson, DGX Spark, Thor — unified memory).
+	// These either lack nvidia-smi or have it behave unreliably, so we detect
+	// them via SoC sysfs and report system RAM figures.
 	if len(gpus) == 0 {
-		tegraGPUs := getTegraGPUMemory()
-		gpus = append(gpus, tegraGPUs...)
+		integratedGPUs := getNVIDIAIntegratedGPUMemory()
+		gpus = append(gpus, integratedGPUs...)
 	}
 
 	// Try Vulkan as fallback for device detection (limited real-time data)
@@ -365,12 +368,13 @@ func getNVIDIAGPUMemory() []GPUMemoryInfo {
 				usagePercent = float64(usedBytes) / float64(totalBytes) * 100
 			}
 		} else if isNA {
-			// Check if this is a Tegra/Jetson device — if so, it uses unified memory
-			if isTegraDevice() {
-				xlog.Debug("nvidia-smi returned N/A on Tegra device, using system RAM", "device", name)
+			// Check if this is an NVIDIA integrated / unified-memory SoC — if so,
+			// fall back to system RAM (covers Jetson, DGX Spark/GB10, Thor).
+			if isNVIDIAIntegratedGPU() {
+				xlog.Debug("nvidia-smi returned N/A on NVIDIA integrated GPU, using system RAM", "device", name)
 				sysInfo, err := GetSystemRAMInfo()
 				if err != nil {
-					xlog.Debug("failed to get system RAM for Tegra device", "error", err, "device", name)
+					xlog.Debug("failed to get system RAM for NVIDIA integrated GPU", "error", err, "device", name)
 					gpus = append(gpus, GPUMemoryInfo{
 						Index: idx,
 						Name:  name,
@@ -651,35 +655,73 @@ func getIntelGPUTop() []GPUMemoryInfo {
 	return nil
 }
 
-// isTegraDevice checks if the system is an NVIDIA Tegra/Jetson device.
-// This works both on the host and inside Docker containers.
-func isTegraDevice() bool {
-	data, err := os.ReadFile("/sys/devices/soc0/family")
-	if err == nil && strings.TrimSpace(string(data)) == "Tegra" {
-		return true
+// isNVIDIAIntegratedGPU reports whether the host is an NVIDIA SoC with an
+// integrated GPU that shares system RAM (unified memory). Covers the Jetson
+// Tegra family (Orin, Xavier, Nano, AGX Thor) and SBSA-style NVIDIA SoCs such
+// as the DGX Spark (GB10). nvidia-smi may be absent or unreliable on these
+// hosts (notably when running under docker without NVML capability), so we
+// detect via sysfs. Works both on the host and inside containers that mount
+// /sys normally.
+func isNVIDIAIntegratedGPU() bool {
+	if data, err := os.ReadFile("/sys/devices/soc0/family"); err == nil {
+		if strings.TrimSpace(string(data)) == "Tegra" {
+			return true
+		}
+	}
+	if data, err := os.ReadFile("/sys/devices/soc0/soc_id"); err == nil {
+		// JEDEC manufacturer 0x0426 = NVIDIA ("jep106:0426[:]").
+		if strings.HasPrefix(strings.TrimSpace(string(data)), "jep106:0426") {
+			return true
+		}
 	}
 	return false
 }
 
-// getTegraGPUMemory detects NVIDIA Tegra/Jetson iGPU.
-// Jetson devices (Orin, Xavier, etc.) have an integrated GPU that shares
-// system RAM (unified memory). They don't have nvidia-smi, so the normal
-// NVIDIA detection path fails. We detect via /sys/devices/soc0/family.
-func getTegraGPUMemory() []GPUMemoryInfo {
-	if !isTegraDevice() {
+// nvidiaIntegratedGPUName derives a human-readable device name for an NVIDIA
+// unified-memory SoC without relying on nvidia-smi. Priority: device-tree
+// model (populated on Jetson) → soc0/machine (some Jetson devkits) → soc_id
+// lookup (SBSA SoCs expose JEDEC IDs) → generic fallbacks.
+func nvidiaIntegratedGPUName() string {
+	if data, err := os.ReadFile("/proc/device-tree/model"); err == nil {
+		if s := strings.TrimRight(string(data), "\x00 \n"); s != "" {
+			return s
+		}
+	}
+	if data, err := os.ReadFile("/sys/devices/soc0/machine"); err == nil {
+		if s := strings.TrimSpace(string(data)); s != "" {
+			return s
+		}
+	}
+	if data, err := os.ReadFile("/sys/devices/soc0/soc_id"); err == nil {
+		s := strings.TrimSpace(string(data))
+		switch {
+		case strings.HasPrefix(s, "jep106:0426:8901"):
+			return "NVIDIA GB10"
+		case strings.HasPrefix(s, "jep106:0426"):
+			return "NVIDIA iGPU"
+		}
+	}
+	if data, err := os.ReadFile("/sys/devices/soc0/family"); err == nil {
+		if strings.TrimSpace(string(data)) == "Tegra" {
+			return "NVIDIA Jetson"
+		}
+	}
+	return "NVIDIA iGPU"
+}
+
+// getNVIDIAIntegratedGPUMemory detects NVIDIA unified-memory integrated GPUs
+// (Jetson, DGX Spark/GB10, Thor) and reports system RAM figures as VRAM.
+// Used as a fallback when nvidia-smi is missing or failing.
+func getNVIDIAIntegratedGPUMemory() []GPUMemoryInfo {
+	if !isNVIDIAIntegratedGPU() {
 		return nil
 	}
 
-	// Get device name from device tree
-	name := "NVIDIA Jetson"
-	if data, err := os.ReadFile("/proc/device-tree/model"); err == nil {
-		name = strings.TrimRight(string(data), "\x00 \n")
-	}
+	name := nvidiaIntegratedGPUName()
 
-	// Unified memory - use system RAM
 	ramInfo, err := GetSystemRAMInfo()
 	if err != nil {
-		xlog.Debug("Tegra detected but failed to get system RAM", "error", err)
+		xlog.Debug("NVIDIA integrated GPU detected but failed to get system RAM", "error", err, "device", name)
 		return []GPUMemoryInfo{{
 			Index: 0,
 			Name:  name,
@@ -692,7 +734,7 @@ func getTegraGPUMemory() []GPUMemoryInfo {
 		usagePercent = float64(ramInfo.Used) / float64(ramInfo.Total) * 100
 	}
 
-	xlog.Debug("Tegra iGPU detected (unified memory)", "device", name, "total_ram", ramInfo.Total)
+	xlog.Debug("NVIDIA integrated GPU detected (unified memory)", "device", name, "total_ram", ramInfo.Total)
 	return []GPUMemoryInfo{{
 		Index: 0,
 		Name:  name,
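For reference, a minimal worker invocation that combines the runtime settings documented by this patch might look like the sketch below. The image tag, volume path, and the trailing worker command are illustrative placeholders, not part of the patch; substitute the values from your own distributed-mode setup.

```bash
# Sketch only: image tag, volume path, and the worker command are placeholders;
# substitute your own distributed-mode settings.
docker run -d --init \
  --gpus all \
  -e NVIDIA_DRIVER_CAPABILITIES=compute,utility \
  -v /data/models:/models \
  localai/localai:latest-gpu-nvidia-cuda-12 \
  <worker command from your distributed-mode setup>
```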