fix(distributed): correct VRAM/RAM reporting on NVIDIA unified-memory hosts (#9545)

Workers on NVIDIA unified-memory hardware (DGX Spark / GB10, Jetson AGX Thor,
Jetson Orin/Xavier/Nano) were reporting `available_vram=0` back to the frontend,
so the Nodes UI showed the node as fully used even when most of the unified
memory was actually free.

Three causes addressed:

* `isTegraDevice` only matched `/sys/devices/soc0/family == "Tegra"`. DGX Spark
  (SBSA) reports JEDEC codes there instead — `jep106:0426` for the NVIDIA
  manufacturer — so the Tegra/unified-memory fallback never ran. Renamed to
  `isNVIDIAIntegratedGPU` and extended to also match `jep106:0426[:*]` via
  `/sys/devices/soc0/soc_id`.

* The unified-iGPU code defaulted the device name to `"NVIDIA Jetson"` when
  `/proc/device-tree/model` was missing, which is the case for Thor inside a
  Docker container and always on DGX Spark. The new `nvidiaIntegratedGPUName`
  resolves the name via `/proc/device-tree/model` → `/sys/devices/soc0/machine`
  → `soc_id` lookup (`jep106:0426:8901` → `"NVIDIA GB10"`), so the Nodes UI
  labels the box correctly.

* Worker heartbeat sent `available_vram=0` (or total-as-available) when VRAM
  usage was momentarily unknown — e.g. when `nvidia-smi` intermittently failed
  with `waitid: no child processes` under containers without `--init`. Each
  such heartbeat overwrote the DB and made the UI flip to "fully used".
  `heartbeatBody` now omits `available_vram` in that case so the DB keeps its
  last good value.

Also updates the commented GPU blocks in both compose files with
`NVIDIA_DRIVER_CAPABILITIES=compute,utility`, `capabilities: [gpu, utility]`,
and `init: true`, and documents the requirement in the distributed-mode and
nvidia-l4t pages. Without `utility`, NVML/`nvidia-smi` are absent inside the
container, which is what put the DGX Spark worker into the buggy fallback in
the first place.

Detection verified on live hardware (dgx.casa / GB10 and 192.168.68.23 / Thor)
by running a cross-compiled probe of the new helpers both on the host and
inside the worker container.
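
A minimal sketch of such a probe (illustrative only; the new helpers are
unexported, so this reads the same sysfs and device-tree paths directly rather
than calling them):

```go
// probe.go: hypothetical standalone sketch mirroring the new detection logic.
// Cross-compile with GOOS=linux GOARCH=arm64 and run it both on the host and
// inside the worker container to compare what the worker will see.
package main

import (
	"fmt"
	"os"
	"strings"
)

func readTrim(path string) string {
	data, err := os.ReadFile(path)
	if err != nil {
		return ""
	}
	return strings.TrimRight(string(data), "\x00 \n")
}

func main() {
	family := readTrim("/sys/devices/soc0/family")
	socID := readTrim("/sys/devices/soc0/soc_id")

	// Same criteria as isNVIDIAIntegratedGPU: Tegra family, or a JEDEC
	// soc_id carrying NVIDIA's manufacturer code 0x0426.
	integrated := family == "Tegra" || strings.HasPrefix(socID, "jep106:0426")

	// Same priority order as nvidiaIntegratedGPUName: device-tree model,
	// then soc0/machine, then the soc_id lookup.
	name := readTrim("/proc/device-tree/model")
	if name == "" {
		name = readTrim("/sys/devices/soc0/machine")
	}
	if name == "" && strings.HasPrefix(socID, "jep106:0426:8901") {
		name = "NVIDIA GB10"
	}

	fmt.Printf("family=%q soc_id=%q integrated=%v name=%q\n", family, socID, integrated, name)
}
```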

Assisted-by: Claude:opus-4.7 [Claude Code]
Ettore Di Giacinto
2026-04-24 22:02:23 +02:00
committed by GitHub
parent 1d0de757c3
commit 551ebdb57a
6 changed files with 149 additions and 45 deletions

View File

@@ -924,21 +924,21 @@ func (cmd *WorkerCMD) registrationBody() map[string]any {
 }

 // heartbeatBody returns the current VRAM/RAM stats for heartbeat payloads.
+//
+// When aggregate VRAM usage is unknown (no GPU, or temporary detection
+// failure), we deliberately OMIT available_vram so the frontend keeps its
+// last good value — overwriting with 0 makes the UI show the node as "fully
+// used", while reporting total-as-available lies to the scheduler about
+// free capacity.
 func (cmd *WorkerCMD) heartbeatBody() map[string]any {
-	var availVRAM uint64
+	body := map[string]any{}
 	aggregate := xsysinfo.GetGPUAggregateInfo()
 	if aggregate.TotalVRAM > 0 {
-		availVRAM = aggregate.FreeVRAM
-	} else {
-		// Fallback: report total as available (no usage tracking possible)
-		availVRAM, _ = xsysinfo.TotalAvailableVRAM()
+		body["available_vram"] = aggregate.FreeVRAM
 	}
-	body := map[string]any{
-		"available_vram": availVRAM,
-	}
-	// If no GPU, report system RAM usage instead
+	// CPU-only workers (or workers that lost GPU visibility momentarily):
+	// report system RAM so the scheduler still has capacity info.
 	if aggregate.TotalVRAM == 0 {
 		if ramInfo, err := xsysinfo.GetSystemRAMInfo(); err == nil {
 			body["available_ram"] = ramInfo.Available

View File

@@ -110,15 +110,23 @@ services:
 # --- GPU Support (NVIDIA) ---
 # Uncomment the following and change the image to a CUDA variant
-# (e.g., localai/localai:latest-gpu-nvidia-cuda-12) to enable GPU:
+# (e.g., localai/localai:latest-gpu-nvidia-cuda-12) to enable GPU.
+#
+# NVIDIA_DRIVER_CAPABILITIES must include `utility` so nvidia-smi / NVML
+# are available inside the container; without it the worker cannot report
+# free VRAM and the Nodes page will show 0 free / total used.
+# `init: true` avoids zombie-reap races that make nvidia-smi flaky.
+#
+# init: true
+# environment:
+#   NVIDIA_DRIVER_CAPABILITIES: "compute,utility"
 # deploy:
 #   resources:
 #     reservations:
 #       devices:
 #         - driver: nvidia.com/gpu
 #           count: all
-#           capabilities: [gpu]
+#           capabilities: [gpu, utility]

 # --- Shared Volume Mode (optional) ---
 # If all services run on the same Docker host, you can skip gRPC file transfer

View File

@@ -40,23 +40,32 @@ services:
 # - https://gist.githubusercontent.com/mudler/ad601a0488b497b69ec549150d9edd18/raw/a8a8869ef1bb7e3830bf5c0bae29a0cce991ff8d/phi-2.yaml
   - phi-2
 # For NVIDIA GPU support with CDI (recommended for NVIDIA Container Toolkit 1.14+):
-# Uncomment the following deploy section and use driver: nvidia.com/gpu
+# Uncomment the following deploy section and use driver: nvidia.com/gpu.
+# Include `utility` in capabilities so nvidia-smi / NVML are available —
+# without it, free-VRAM reporting on discrete GPUs is unavailable and the
+# Nodes UI will misreport memory usage.
+# environment:
+#   NVIDIA_DRIVER_CAPABILITIES: "compute,utility"
+# init: true  # avoids zombie-reap races that can make nvidia-smi flaky
 # deploy:
 #   resources:
 #     reservations:
 #       devices:
 #         - driver: nvidia.com/gpu
 #           count: all
-#           capabilities: [gpu]
+#           capabilities: [gpu, utility]
 #
 # For legacy NVIDIA driver (for older NVIDIA Container Toolkit):
+# environment:
+#   NVIDIA_DRIVER_CAPABILITIES: "compute,utility"
+# init: true
 # deploy:
 #   resources:
 #     reservations:
 #       devices:
 #         - driver: nvidia
 #           count: 1
-#           capabilities: [gpu]
+#           capabilities: [gpu, utility]

 ## Uncomment for PostgreSQL-backed knowledge base (see Agents docs)
 # postgres:

View File

@@ -160,6 +160,29 @@ For advanced networking scenarios (NAT, load balancers, separate gRPC/HTTP ports
 | `LOCALAI_ADVERTISE_ADDR` | Public gRPC address (if different from `LOCALAI_ADDR`) | Derived from `LOCALAI_ADDR` |
 | `LOCALAI_ADVERTISE_HTTP_ADDR` | Public HTTP address (if different from gRPC host) | Derived from advertise host + HTTP port |

+### NVIDIA GPU support
+
+When running workers in a container, two runtime settings affect how VRAM
+usage is reported back to the frontend:
+
+- **`NVIDIA_DRIVER_CAPABILITIES` must include `utility`.** Without it, the
+  NVML library (and therefore `nvidia-smi`) is not available inside the
+  container. CUDA compute still works, but the worker cannot query free VRAM
+  and the Nodes page will show the node as fully used. Set
+  `NVIDIA_DRIVER_CAPABILITIES=compute,utility` (or, with the NVIDIA CDI
+  runtime, list `capabilities: [gpu, utility]` on the device reservation).
+- **Run the container with `init: true` (or `docker run --init`).** The
+  worker process becomes PID 1 in the container and cannot reap zombies on
+  its own. Without an init, `nvidia-smi` calls can fail intermittently with
+  `waitid: no child processes`, which briefly clears free-VRAM metrics.
+
+**Unified memory devices (Jetson, DGX Spark / GB10, Thor):** these SoCs
+share one physical RAM between CPU and GPU. LocalAI detects them via
+`/sys/devices/soc0/family` and `/sys/devices/soc0/soc_id` (no `nvidia-smi`
+required) and reports system-RAM figures as VRAM. Free VRAM therefore tracks
+`MemAvailable` in `/proc/meminfo`.
+
 ### Node Labels

 Workers can declare labels at startup for scheduling constraints:

View File

@@ -78,3 +78,25 @@ docker run -e DEBUG=true -p 8080:8080 -v /data/models:/models -ti --restart=alwa
 ```

 Note: `/data/models` is the directory containing the models. You can replace it with the directory containing your models.

+## GPU reporting in distributed mode
+
+If you run a worker on a Jetson, DGX Spark (GB10), or Thor and the Nodes
+page in the frontend shows the node as fully used, check two things:
+
+1. `NVIDIA_DRIVER_CAPABILITIES` must include `utility` so `nvidia-smi` /
+   NVML work inside the container. With `--gpus all` alone (or
+   `--runtime nvidia` without extra flags) only `compute` is wired in on
+   some driver versions. Add `-e NVIDIA_DRIVER_CAPABILITIES=compute,utility`
+   to your `docker run`, or `capabilities: [gpu, utility]` in compose /
+   Kubernetes device reservations.
+2. Pass `--init` to `docker run` (or `init: true` in compose) so the
+   container has a proper PID 1 reaper — otherwise short-lived child
+   processes like `nvidia-smi` can intermittently fail with
+   `waitid: no child processes`.
+
+On unified-memory devices LocalAI auto-detects the SoC via
+`/sys/devices/soc0/{family,soc_id}` and reports system RAM as VRAM, so
+`nvidia-smi` is not strictly required for VRAM metrics. See
+[Distributed Mode → NVIDIA GPU support]({{% relref "/features/distributed-mode#nvidia-gpu-support" %}})
+for full context.

View File

@@ -186,9 +186,10 @@ func DetectGPUVendor() (string, error) {
 		return VendorIntel, nil
 	}

-	// Check for NVIDIA Tegra/Jetson (no nvidia-smi on these devices)
-	if isTegraDevice() {
-		xlog.Debug("GPU vendor detected via Tegra SoC", "vendor", VendorNVIDIA)
+	// Check for NVIDIA integrated GPU (Tegra / DGX Spark / Thor) —
+	// nvidia-smi may be absent or unreliable on these unified-memory SoCs.
+	if isNVIDIAIntegratedGPU() {
+		xlog.Debug("GPU vendor detected via NVIDIA SoC", "vendor", VendorNVIDIA)
 		return VendorNVIDIA, nil
 	}
@@ -254,10 +255,12 @@ func GetGPUMemoryUsage() []GPUMemoryInfo {
 		gpus = append(gpus, intelGPUs...)
 	}

-	// Try NVIDIA Tegra/Jetson (unified memory iGPU, no nvidia-smi)
+	// Try NVIDIA integrated GPUs (Tegra Jetson, DGX Spark, Thor — unified memory).
+	// These either lack nvidia-smi or have it behave unreliably, so we detect
+	// them via SoC sysfs and report system RAM figures.
 	if len(gpus) == 0 {
-		tegraGPUs := getTegraGPUMemory()
-		gpus = append(gpus, tegraGPUs...)
+		integratedGPUs := getNVIDIAIntegratedGPUMemory()
+		gpus = append(gpus, integratedGPUs...)
 	}

 	// Try Vulkan as fallback for device detection (limited real-time data)
@@ -365,12 +368,13 @@ func getNVIDIAGPUMemory() []GPUMemoryInfo {
 				usagePercent = float64(usedBytes) / float64(totalBytes) * 100
 			}
 		} else if isNA {
-			// Check if this is a Tegra/Jetson device — if so, it uses unified memory
-			if isTegraDevice() {
-				xlog.Debug("nvidia-smi returned N/A on Tegra device, using system RAM", "device", name)
+			// Check if this is an NVIDIA integrated / unified-memory SoC — if so,
+			// fall back to system RAM (covers Jetson, DGX Spark/GB10, Thor).
+			if isNVIDIAIntegratedGPU() {
+				xlog.Debug("nvidia-smi returned N/A on NVIDIA integrated GPU, using system RAM", "device", name)
 				sysInfo, err := GetSystemRAMInfo()
 				if err != nil {
-					xlog.Debug("failed to get system RAM for Tegra device", "error", err, "device", name)
+					xlog.Debug("failed to get system RAM for NVIDIA integrated GPU", "error", err, "device", name)
 					gpus = append(gpus, GPUMemoryInfo{
 						Index: idx,
 						Name:  name,
@@ -651,35 +655,73 @@ func getIntelGPUTop() []GPUMemoryInfo {
 	return nil
 }

-// isTegraDevice checks if the system is an NVIDIA Tegra/Jetson device.
-// This works both on the host and inside Docker containers.
-func isTegraDevice() bool {
-	data, err := os.ReadFile("/sys/devices/soc0/family")
-	if err == nil && strings.TrimSpace(string(data)) == "Tegra" {
-		return true
+// isNVIDIAIntegratedGPU reports whether the host is an NVIDIA SoC with an
+// integrated GPU that shares system RAM (unified memory). Covers the Jetson
+// Tegra family (Orin, Xavier, Nano, AGX Thor) and SBSA-style NVIDIA SoCs such
+// as the DGX Spark (GB10). nvidia-smi may be absent or unreliable on these
+// hosts (notably when running under docker without NVML capability), so we
+// detect via sysfs. Works both on the host and inside containers that mount
+// /sys normally.
+func isNVIDIAIntegratedGPU() bool {
+	if data, err := os.ReadFile("/sys/devices/soc0/family"); err == nil {
+		if strings.TrimSpace(string(data)) == "Tegra" {
+			return true
+		}
+	}
+	if data, err := os.ReadFile("/sys/devices/soc0/soc_id"); err == nil {
+		// JEDEC manufacturer 0x0426 = NVIDIA ("jep106:0426[:<soc>]").
+		if strings.HasPrefix(strings.TrimSpace(string(data)), "jep106:0426") {
+			return true
+		}
 	}
 	return false
 }

-// getTegraGPUMemory detects NVIDIA Tegra/Jetson iGPU.
-// Jetson devices (Orin, Xavier, etc.) have an integrated GPU that shares
-// system RAM (unified memory). They don't have nvidia-smi, so the normal
-// NVIDIA detection path fails. We detect via /sys/devices/soc0/family.
-func getTegraGPUMemory() []GPUMemoryInfo {
-	if !isTegraDevice() {
+// nvidiaIntegratedGPUName derives a human-readable device name for an NVIDIA
+// unified-memory SoC without relying on nvidia-smi. Priority: device-tree
+// model (populated on Jetson) → soc0/machine (some Jetson devkits) → soc_id
+// lookup (SBSA SoCs expose JEDEC IDs) → generic fallbacks.
+func nvidiaIntegratedGPUName() string {
+	if data, err := os.ReadFile("/proc/device-tree/model"); err == nil {
+		if s := strings.TrimRight(string(data), "\x00 \n"); s != "" {
+			return s
+		}
+	}
+	if data, err := os.ReadFile("/sys/devices/soc0/machine"); err == nil {
+		if s := strings.TrimSpace(string(data)); s != "" {
+			return s
+		}
+	}
+	if data, err := os.ReadFile("/sys/devices/soc0/soc_id"); err == nil {
+		s := strings.TrimSpace(string(data))
+		switch {
+		case strings.HasPrefix(s, "jep106:0426:8901"):
+			return "NVIDIA GB10"
+		case strings.HasPrefix(s, "jep106:0426"):
+			return "NVIDIA iGPU"
+		}
+	}
+	if data, err := os.ReadFile("/sys/devices/soc0/family"); err == nil {
+		if strings.TrimSpace(string(data)) == "Tegra" {
+			return "NVIDIA Jetson"
+		}
+	}
+	return "NVIDIA iGPU"
+}
+
+// getNVIDIAIntegratedGPUMemory detects NVIDIA unified-memory integrated GPUs
+// (Jetson, DGX Spark/GB10, Thor) and reports system RAM figures as VRAM.
+// Used as a fallback when nvidia-smi is missing or failing.
+func getNVIDIAIntegratedGPUMemory() []GPUMemoryInfo {
+	if !isNVIDIAIntegratedGPU() {
 		return nil
 	}

-	// Get device name from device tree
-	name := "NVIDIA Jetson"
-	if data, err := os.ReadFile("/proc/device-tree/model"); err == nil {
-		name = strings.TrimRight(string(data), "\x00 \n")
-	}
+	name := nvidiaIntegratedGPUName()

 	// Unified memory - use system RAM
 	ramInfo, err := GetSystemRAMInfo()
 	if err != nil {
-		xlog.Debug("Tegra detected but failed to get system RAM", "error", err)
+		xlog.Debug("NVIDIA integrated GPU detected but failed to get system RAM", "error", err, "device", name)
 		return []GPUMemoryInfo{{
 			Index: 0,
 			Name:  name,
@@ -692,7 +734,7 @@ func getTegraGPUMemory() []GPUMemoryInfo {
 		usagePercent = float64(ramInfo.Used) / float64(ramInfo.Total) * 100
 	}

-	xlog.Debug("Tegra iGPU detected (unified memory)", "device", name, "total_ram", ramInfo.Total)
+	xlog.Debug("NVIDIA integrated GPU detected (unified memory)", "device", name, "total_ram", ramInfo.Total)

 	return []GPUMemoryInfo{{
 		Index: 0,
 		Name:  name,