From 551ebdb57a098dc6e25aa52d37d432b50050e3f6 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Fri, 24 Apr 2026 22:02:23 +0200
Subject: [PATCH] fix(distributed): correct VRAM/RAM reporting on NVIDIA unified-memory hosts (#9545)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Workers on NVIDIA unified-memory hardware (DGX Spark / GB10, Jetson AGX Thor,
Jetson Orin/Xavier/Nano) were reporting `available_vram=0` back to the
frontend, so the Nodes UI showed the node as fully used even when most of the
unified memory was actually free.

Three causes addressed:

* `isTegraDevice` only matched `/sys/devices/soc0/family == "Tegra"`.
  DGX Spark (SBSA) reports JEDEC codes there instead — `jep106:0426` for the
  NVIDIA manufacturer — so the Tegra/unified-memory fallback never ran.
  Renamed to `isNVIDIAIntegratedGPU` and extended to also match
  `jep106:0426[:*]` via `/sys/devices/soc0/soc_id`.

* The unified-iGPU code defaulted the device name to `"NVIDIA Jetson"` when
  `/proc/device-tree/model` was missing. That's what happens for Thor inside
  a docker container, and always on DGX Spark. New `nvidiaIntegratedGPUName`
  resolves via dt-model → `/sys/devices/soc0/machine` → `soc_id` lookup
  (`jep106:0426:8901` → `"NVIDIA GB10"`) so the Nodes UI labels the box
  correctly.

* Worker heartbeat sent `available_vram=0` (or total-as-available) when VRAM
  usage was momentarily unknown — e.g. when `nvidia-smi` intermittently
  failed with `waitid: no child processes` under containers without `--init`.
  Each such heartbeat overwrote the DB and made the UI flip to "fully used".
  `heartbeatBody` now omits `available_vram` in that case so the DB keeps its
  last good value.

Also updates the commented GPU blocks in both compose files with
`NVIDIA_DRIVER_CAPABILITIES=compute,utility`, `capabilities: [gpu, utility]`,
and `init: true`, and documents the requirement in the distributed-mode and
nvidia-l4t pages. Without `utility`, NVML/`nvidia-smi` are absent inside the
container, which is what put the DGX Spark worker into the buggy fallback in
the first place.

Detection verified on live hardware (dgx.casa / GB10 and 192.168.68.23 /
Thor) by running a cross-compiled probe of the new helpers on both host and
inside the worker container.

Assisted-by: Claude:opus-4.7 [Claude Code]
---
 core/cli/worker.go                        |  20 ++---
 docker-compose.distributed.yaml           |  12 ++-
 docker-compose.yaml                       |  15 +++-
 docs/content/features/distributed-mode.md |  23 +++++
 docs/content/reference/nvidia-l4t.md      |  22 +++++
 pkg/xsysinfo/gpu.go                       | 102 +++++++++++++++-------
 6 files changed, 149 insertions(+), 45 deletions(-)

diff --git a/core/cli/worker.go b/core/cli/worker.go
index ecc53bb72..63591c0d7 100644
--- a/core/cli/worker.go
+++ b/core/cli/worker.go
@@ -924,21 +924,21 @@ func (cmd *WorkerCMD) registrationBody() map[string]any {
 }
 
 // heartbeatBody returns the current VRAM/RAM stats for heartbeat payloads.
+//
+// When aggregate VRAM usage is unknown (no GPU, or temporary detection
+// failure), we deliberately OMIT available_vram so the frontend keeps its
+// last good value — overwriting with 0 makes the UI show the node as "fully
+// used", while reporting total-as-available lies to the scheduler about
+// free capacity.
 func (cmd *WorkerCMD) heartbeatBody() map[string]any {
-	var availVRAM uint64
+	body := map[string]any{}
 	aggregate := xsysinfo.GetGPUAggregateInfo()
 	if aggregate.TotalVRAM > 0 {
-		availVRAM = aggregate.FreeVRAM
-	} else {
-		// Fallback: report total as available (no usage tracking possible)
-		availVRAM, _ = xsysinfo.TotalAvailableVRAM()
+		body["available_vram"] = aggregate.FreeVRAM
 	}
 
-	body := map[string]any{
-		"available_vram": availVRAM,
-	}
-
-	// If no GPU, report system RAM usage instead
+	// CPU-only workers (or workers that lost GPU visibility momentarily):
+	// report system RAM so the scheduler still has capacity info.
 	if aggregate.TotalVRAM == 0 {
 		if ramInfo, err := xsysinfo.GetSystemRAMInfo(); err == nil {
 			body["available_ram"] = ramInfo.Available
diff --git a/docker-compose.distributed.yaml b/docker-compose.distributed.yaml
index 04223c4c6..b8a081f1a 100644
--- a/docker-compose.distributed.yaml
+++ b/docker-compose.distributed.yaml
@@ -110,15 +110,23 @@ services:
 
     # --- GPU Support (NVIDIA) ---
    # Uncomment the following and change the image to a CUDA variant
-    # (e.g., localai/localai:latest-gpu-nvidia-cuda-12) to enable GPU:
+    # (e.g., localai/localai:latest-gpu-nvidia-cuda-12) to enable GPU.
     #
+    # NVIDIA_DRIVER_CAPABILITIES must include `utility` so nvidia-smi / NVML
+    # are available inside the container; without it the worker cannot report
+    # free VRAM and the Nodes page will show 0 free / total used.
+    # `init: true` avoids zombie-reap races that make nvidia-smi flaky.
+    #
+    # init: true
+    # environment:
+    #   NVIDIA_DRIVER_CAPABILITIES: "compute,utility"
     # deploy:
     #   resources:
     #     reservations:
     #       devices:
     #         - driver: nvidia.com/gpu
     #           count: all
-    #           capabilities: [gpu]
+    #           capabilities: [gpu, utility]
 
     # --- Shared Volume Mode (optional) ---
     # If all services run on the same Docker host, you can skip gRPC file transfer
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 7897509d6..a432a699a 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -40,23 +40,32 @@ services:
       # - https://gist.githubusercontent.com/mudler/ad601a0488b497b69ec549150d9edd18/raw/a8a8869ef1bb7e3830bf5c0bae29a0cce991ff8d/phi-2.yaml
       - phi-2
     # For NVIDIA GPU support with CDI (recommended for NVIDIA Container Toolkit 1.14+):
-    # Uncomment the following deploy section and use driver: nvidia.com/gpu
+    # Uncomment the following deploy section and use driver: nvidia.com/gpu.
+    # Include `utility` in capabilities so nvidia-smi / NVML are available —
+    # without it, free-VRAM reporting on discrete GPUs is unavailable and the
+    # Nodes UI will misreport memory usage.
+    # environment:
+    #   NVIDIA_DRIVER_CAPABILITIES: "compute,utility"
+    # init: true # avoids zombie-reap races that can make nvidia-smi flaky
     # deploy:
     #   resources:
     #     reservations:
     #       devices:
     #         - driver: nvidia.com/gpu
     #           count: all
-    #           capabilities: [gpu]
+    #           capabilities: [gpu, utility]
     #
     # For legacy NVIDIA driver (for older NVIDIA Container Toolkit):
+    # environment:
+    #   NVIDIA_DRIVER_CAPABILITIES: "compute,utility"
+    # init: true
     # deploy:
     #   resources:
     #     reservations:
     #       devices:
     #         - driver: nvidia
     #           count: 1
-    #           capabilities: [gpu]
+    #           capabilities: [gpu, utility]
 
     ## Uncomment for PostgreSQL-backed knowledge base (see Agents docs)
     # postgres:
diff --git a/docs/content/features/distributed-mode.md b/docs/content/features/distributed-mode.md
index 79dad2605..773aa5284 100644
--- a/docs/content/features/distributed-mode.md
+++ b/docs/content/features/distributed-mode.md
@@ -160,6 +160,29 @@ For advanced networking scenarios (NAT, load balancers, separate gRPC/HTTP ports
 | `LOCALAI_ADVERTISE_ADDR` | Public gRPC address (if different from `LOCALAI_ADDR`) | Derived from `LOCALAI_ADDR` |
 | `LOCALAI_ADVERTISE_HTTP_ADDR` | Public HTTP address (if different from gRPC host) | Derived from advertise host + HTTP port |
 
+### NVIDIA GPU support
+
+When running workers in a container, two runtime settings affect how VRAM
+usage is reported back to the frontend:
+
+- **`NVIDIA_DRIVER_CAPABILITIES` must include `utility`.** Without it, the
+  NVML library (and therefore `nvidia-smi`) is not available inside the
+  container. CUDA compute still works, but the worker cannot query free VRAM
+  and the Nodes page will show the node as fully used. Set
+  `NVIDIA_DRIVER_CAPABILITIES=compute,utility` (or, with the NVIDIA CDI
+  runtime, list `capabilities: [gpu, utility]` on the device reservation).
+
+- **Run the container with `init: true` (or `docker run --init`).** The
+  worker process becomes PID 1 in the container and cannot reap zombies on
+  its own. Without an init, `nvidia-smi` calls can fail intermittently with
+  `waitid: no child processes`, which briefly clears free-VRAM metrics.
+
+**Unified memory devices (Jetson, DGX Spark / GB10, Thor):** these SoCs
+share one physical RAM between CPU and GPU. LocalAI detects them via
+`/sys/devices/soc0/family` and `/sys/devices/soc0/soc_id` (no `nvidia-smi`
+required) and reports system-RAM figures as VRAM. Free VRAM therefore tracks
+`MemAvailable` in `/proc/meminfo`.
+
 ### Node Labels
 
 Workers can declare labels at startup for scheduling constraints:
diff --git a/docs/content/reference/nvidia-l4t.md b/docs/content/reference/nvidia-l4t.md
index 9cc81c09b..1edbe3aeb 100644
--- a/docs/content/reference/nvidia-l4t.md
+++ b/docs/content/reference/nvidia-l4t.md
@@ -78,3 +78,25 @@ docker run -e DEBUG=true -p 8080:8080 -v /data/models:/models -ti --restart=alwa
 ```
 
 Note: `/data/models` is the directory containing the models. You can replace it with the directory containing your models.
+
+## GPU reporting in distributed mode
+
+If you run a worker on a Jetson, DGX Spark (GB10), or Thor and the Nodes
+page in the frontend shows the node as fully used, check two things:
+
+1. `NVIDIA_DRIVER_CAPABILITIES` must include `utility` so `nvidia-smi` /
+   NVML work inside the container. With `--gpus all` alone (or
+   `--runtime nvidia` without extra flags) only `compute` is wired in on
+   some driver versions. Add `-e NVIDIA_DRIVER_CAPABILITIES=compute,utility`
+   to your `docker run`, or `capabilities: [gpu, utility]` in compose /
+   Kubernetes device reservations.
+2. Pass `--init` to `docker run` (or `init: true` in compose) so the
+   container has a proper PID 1 reaper — otherwise short-lived child
+   processes like `nvidia-smi` can intermittently fail with
+   `waitid: no child processes`.
+
+On unified-memory devices LocalAI auto-detects the SoC via
+`/sys/devices/soc0/{family,soc_id}` and reports system RAM as VRAM, so
+`nvidia-smi` is not strictly required for VRAM metrics. See
+[Distributed Mode → NVIDIA GPU support]({{% relref "/features/distributed-mode#nvidia-gpu-support" %}})
+for full context.
diff --git a/pkg/xsysinfo/gpu.go b/pkg/xsysinfo/gpu.go
index fe9f07aa4..e5db4404e 100644
--- a/pkg/xsysinfo/gpu.go
+++ b/pkg/xsysinfo/gpu.go
@@ -186,9 +186,10 @@ func DetectGPUVendor() (string, error) {
 		return VendorIntel, nil
 	}
 
-	// Check for NVIDIA Tegra/Jetson (no nvidia-smi on these devices)
-	if isTegraDevice() {
-		xlog.Debug("GPU vendor detected via Tegra SoC", "vendor", VendorNVIDIA)
+	// Check for NVIDIA integrated GPU (Tegra / DGX Spark / Thor) —
+	// nvidia-smi may be absent or unreliable on these unified-memory SoCs.
+	if isNVIDIAIntegratedGPU() {
+		xlog.Debug("GPU vendor detected via NVIDIA SoC", "vendor", VendorNVIDIA)
 		return VendorNVIDIA, nil
 	}
 
@@ -254,10 +255,12 @@ func GetGPUMemoryUsage() []GPUMemoryInfo {
 		gpus = append(gpus, intelGPUs...)
 	}
 
-	// Try NVIDIA Tegra/Jetson (unified memory iGPU, no nvidia-smi)
+	// Try NVIDIA integrated GPUs (Tegra Jetson, DGX Spark, Thor — unified memory).
+	// These either lack nvidia-smi or have it behave unreliably, so we detect
+	// them via SoC sysfs and report system RAM figures.
 	if len(gpus) == 0 {
-		tegraGPUs := getTegraGPUMemory()
-		gpus = append(gpus, tegraGPUs...)
+		integratedGPUs := getNVIDIAIntegratedGPUMemory()
+		gpus = append(gpus, integratedGPUs...)
 	}
 
 	// Try Vulkan as fallback for device detection (limited real-time data)
@@ -365,12 +368,13 @@ func getNVIDIAGPUMemory() []GPUMemoryInfo {
 				usagePercent = float64(usedBytes) / float64(totalBytes) * 100
 			}
 		} else if isNA {
-			// Check if this is a Tegra/Jetson device — if so, it uses unified memory
-			if isTegraDevice() {
-				xlog.Debug("nvidia-smi returned N/A on Tegra device, using system RAM", "device", name)
+			// Check if this is an NVIDIA integrated / unified-memory SoC — if so,
+			// fall back to system RAM (covers Jetson, DGX Spark/GB10, Thor).
+			if isNVIDIAIntegratedGPU() {
+				xlog.Debug("nvidia-smi returned N/A on NVIDIA integrated GPU, using system RAM", "device", name)
 				sysInfo, err := GetSystemRAMInfo()
 				if err != nil {
-					xlog.Debug("failed to get system RAM for Tegra device", "error", err, "device", name)
+					xlog.Debug("failed to get system RAM for NVIDIA integrated GPU", "error", err, "device", name)
 					gpus = append(gpus, GPUMemoryInfo{
 						Index: idx,
 						Name:  name,
@@ -651,35 +655,73 @@ func getIntelGPUTop() []GPUMemoryInfo {
 	return nil
 }
 
-// isTegraDevice checks if the system is an NVIDIA Tegra/Jetson device.
-// This works both on the host and inside Docker containers.
-func isTegraDevice() bool {
-	data, err := os.ReadFile("/sys/devices/soc0/family")
-	if err == nil && strings.TrimSpace(string(data)) == "Tegra" {
-		return true
+// isNVIDIAIntegratedGPU reports whether the host is an NVIDIA SoC with an
+// integrated GPU that shares system RAM (unified memory). Covers the Jetson
+// Tegra family (Orin, Xavier, Nano, AGX Thor) and SBSA-style NVIDIA SoCs such
+// as the DGX Spark (GB10). nvidia-smi may be absent or unreliable on these
+// hosts (notably when running under docker without NVML capability), so we
+// detect via sysfs. Works both on the host and inside containers that mount
+// /sys normally.
+func isNVIDIAIntegratedGPU() bool {
+	if data, err := os.ReadFile("/sys/devices/soc0/family"); err == nil {
+		if strings.TrimSpace(string(data)) == "Tegra" {
+			return true
+		}
+	}
+	if data, err := os.ReadFile("/sys/devices/soc0/soc_id"); err == nil {
+		// JEDEC manufacturer 0x0426 = NVIDIA ("jep106:0426[:]").
+		if strings.HasPrefix(strings.TrimSpace(string(data)), "jep106:0426") {
+			return true
+		}
 	}
 	return false
 }
 
-// getTegraGPUMemory detects NVIDIA Tegra/Jetson iGPU.
-// Jetson devices (Orin, Xavier, etc.) have an integrated GPU that shares
-// system RAM (unified memory). They don't have nvidia-smi, so the normal
-// NVIDIA detection path fails. We detect via /sys/devices/soc0/family.
-func getTegraGPUMemory() []GPUMemoryInfo {
-	if !isTegraDevice() {
+// nvidiaIntegratedGPUName derives a human-readable device name for an NVIDIA
+// unified-memory SoC without relying on nvidia-smi. Priority: device-tree
+// model (populated on Jetson) → soc0/machine (some Jetson devkits) → soc_id
+// lookup (SBSA SoCs expose JEDEC IDs) → generic fallbacks.
+func nvidiaIntegratedGPUName() string {
+	if data, err := os.ReadFile("/proc/device-tree/model"); err == nil {
+		if s := strings.TrimRight(string(data), "\x00 \n"); s != "" {
+			return s
+		}
+	}
+	if data, err := os.ReadFile("/sys/devices/soc0/machine"); err == nil {
+		if s := strings.TrimSpace(string(data)); s != "" {
+			return s
+		}
+	}
+	if data, err := os.ReadFile("/sys/devices/soc0/soc_id"); err == nil {
+		s := strings.TrimSpace(string(data))
+		switch {
+		case strings.HasPrefix(s, "jep106:0426:8901"):
+			return "NVIDIA GB10"
+		case strings.HasPrefix(s, "jep106:0426"):
+			return "NVIDIA iGPU"
+		}
+	}
+	if data, err := os.ReadFile("/sys/devices/soc0/family"); err == nil {
+		if strings.TrimSpace(string(data)) == "Tegra" {
+			return "NVIDIA Jetson"
+		}
+	}
+	return "NVIDIA iGPU"
+}
+
+// getNVIDIAIntegratedGPUMemory detects NVIDIA unified-memory integrated GPUs
+// (Jetson, DGX Spark/GB10, Thor) and reports system RAM figures as VRAM.
+// Used as a fallback when nvidia-smi is missing or failing.
+func getNVIDIAIntegratedGPUMemory() []GPUMemoryInfo {
+	if !isNVIDIAIntegratedGPU() {
 		return nil
 	}
 
-	// Get device name from device tree
-	name := "NVIDIA Jetson"
-	if data, err := os.ReadFile("/proc/device-tree/model"); err == nil {
-		name = strings.TrimRight(string(data), "\x00 \n")
-	}
+	name := nvidiaIntegratedGPUName()
 
-	// Unified memory - use system RAM
 	ramInfo, err := GetSystemRAMInfo()
 	if err != nil {
-		xlog.Debug("Tegra detected but failed to get system RAM", "error", err)
+		xlog.Debug("NVIDIA integrated GPU detected but failed to get system RAM", "error", err, "device", name)
 		return []GPUMemoryInfo{{
 			Index: 0,
 			Name:  name,
@@ -692,7 +734,7 @@ func getTegraGPUMemory() []GPUMemoryInfo {
 		usagePercent = float64(ramInfo.Used) / float64(ramInfo.Total) * 100
 	}
 
-	xlog.Debug("Tegra iGPU detected (unified memory)", "device", name, "total_ram", ramInfo.Total)
+	xlog.Debug("NVIDIA integrated GPU detected (unified memory)", "device", name, "total_ram", ramInfo.Total)
 	return []GPUMemoryInfo{{
 		Index: 0,
 		Name:  name,
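For reference, a minimal worker invocation that combines the runtime settings documented by this patch might look like the sketch below. The image tag, volume path, and the trailing worker command are illustrative placeholders, not part of the patch; substitute the values from your own distributed-mode setup.

```bash
# Sketch only: image tag, volume path, and the worker command are placeholders;
# substitute your own distributed-mode settings.
docker run -d --init \
  --gpus all \
  -e NVIDIA_DRIVER_CAPABILITIES=compute,utility \
  -v /data/models:/models \
  localai/localai:latest-gpu-nvidia-cuda-12 \
  <worker command from your distributed-mode setup>
```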