Files
LocalAI/pkg/xsysinfo/gpu.go
Ettore Di Giacinto 551ebdb57a fix(distributed): correct VRAM/RAM reporting on NVIDIA unified-memory hosts (#9545)
Workers on NVIDIA unified-memory hardware (DGX Spark / GB10, Jetson AGX Thor,
Jetson Orin/Xavier/Nano) were reporting `available_vram=0` back to the frontend,
so the Nodes UI showed the node as fully used even when most of the unified
memory was actually free.

Three causes addressed:

* `isTegraDevice` only matched `/sys/devices/soc0/family == "Tegra"`. DGX Spark
  (SBSA) reports JEDEC codes there instead — `jep106:0426` for the NVIDIA
  manufacturer — so the Tegra/unified-memory fallback never ran. Renamed to
  `isNVIDIAIntegratedGPU` and extended to also match `jep106:0426[:*]` via
  `/sys/devices/soc0/soc_id`.

* The unified-iGPU code defaulted the device name to `"NVIDIA Jetson"` when
  `/proc/device-tree/model` was missing. That's what happens for Thor inside a
  docker container, and always on DGX Spark. New `nvidiaIntegratedGPUName`
  resolves via dt-model → `/sys/devices/soc0/machine` → `soc_id` lookup
  (`jep106:0426:8901` → `"NVIDIA GB10"`) so the Nodes UI labels the box
  correctly.

* Worker heartbeat sent `available_vram=0` (or total-as-available) when VRAM
  usage was momentarily unknown — e.g. when `nvidia-smi` intermittently failed
  with `waitid: no child processes` under containers without `--init`. Each
  such heartbeat overwrote the DB and made the UI flip to "fully used".
  `heartbeatBody` now omits `available_vram` in that case so the DB keeps its
  last good value.

Also updates the commented GPU blocks in both compose files with
`NVIDIA_DRIVER_CAPABILITIES=compute,utility`, `capabilities: [gpu, utility]`,
and `init: true`, and documents the requirement in the distributed-mode and
nvidia-l4t pages. Without `utility`, NVML/`nvidia-smi` are absent inside the
container, which is what put the DGX Spark worker into the buggy fallback in
the first place.

Detection verified on live hardware (dgx.casa / GB10 and 192.168.68.23 / Thor)
by running a cross-compiled probe of the new helpers on both host and inside
the worker container.

Assisted-by: Claude:opus-4.7 [Claude Code]
2026-04-24 22:02:23 +02:00

956 lines
26 KiB
Go

package xsysinfo
import (
"bytes"
"encoding/json"
"os"
"os/exec"
"strconv"
"strings"
"sync"
"github.com/jaypipes/ghw"
"github.com/jaypipes/ghw/pkg/gpu"
"github.com/mudler/xlog"
)
// GPU vendor constants — canonical lowercase identifiers used across
// detection, memory reporting, and API payloads in this package.
const (
	VendorNVIDIA  = "nvidia"
	VendorAMD     = "amd"
	VendorIntel   = "intel"
	VendorApple   = "apple"
	VendorVulkan  = "vulkan"
	VendorUnknown = "unknown"
)
// UnifiedMemoryDevices is a list of GPU device name patterns that use unified memory
// (shared with system RAM). When these devices are detected and report N/A for VRAM,
// we fall back to system RAM information.
// Patterns are matched case-insensitively as substrings by isUnifiedMemoryDevice;
// both the prefixed ("NVIDIA GB10") and bare ("GB10") forms are listed so either
// naming variant reported by nvidia-smi matches.
var UnifiedMemoryDevices = []string{
	"NVIDIA GB10",
	"GB10",
	"NVIDIA Thor",
	"Thor",
}
// GPUMemoryInfo contains real-time GPU memory usage information for a
// single device. For unified-memory devices (Jetson/GB10/Thor, Apple
// Silicon) the VRAM fields carry system RAM figures instead — see
// getNVIDIAIntegratedGPUMemory and getAppleGPUMemory.
type GPUMemoryInfo struct {
	Index        int     `json:"index"`         // Device index as reported by the detection tool
	Name         string  `json:"name"`          // Human-readable device name
	Vendor       string  `json:"vendor"`        // One of the Vendor* constants
	TotalVRAM    uint64  `json:"total_vram"`    // Total VRAM in bytes
	UsedVRAM     uint64  `json:"used_vram"`     // Used VRAM in bytes
	FreeVRAM     uint64  `json:"free_vram"`     // Free VRAM in bytes
	UsagePercent float64 `json:"usage_percent"` // Usage as percentage (0-100)
}
// GPUAggregateInfo contains aggregate GPU information across all GPUs:
// sums of the per-device figures reported by GetGPUMemoryUsage.
type GPUAggregateInfo struct {
	TotalVRAM    uint64  `json:"total_vram"`
	UsedVRAM     uint64  `json:"used_vram"`
	FreeVRAM     uint64  `json:"free_vram"`
	UsagePercent float64 `json:"usage_percent"` // UsedVRAM/TotalVRAM as 0-100 (0 when no VRAM detected)
	GPUCount     int     `json:"gpu_count"`
}
// AggregateMemoryInfo contains aggregate memory information (unified for GPU/RAM).
// GPUCount is 0 when the figures come from system RAM rather than GPUs.
type AggregateMemoryInfo struct {
	TotalMemory  uint64  `json:"total_memory"`
	UsedMemory   uint64  `json:"used_memory"`
	FreeMemory   uint64  `json:"free_memory"`
	UsagePercent float64 `json:"usage_percent"`
	GPUCount     int     `json:"gpu_count"`
}
// ResourceInfo represents unified memory resource information.
// When Type is "gpu", GPUs is populated and RAM is nil; when Type is
// "ram", the reverse holds (see GetResourceInfo).
type ResourceInfo struct {
	Type      string              `json:"type"`      // "gpu" or "ram"
	Available bool                `json:"available"` // false when neither GPU nor RAM info could be obtained
	GPUs      []GPUMemoryInfo     `json:"gpus,omitempty"`
	RAM       *SystemRAMInfo      `json:"ram,omitempty"`
	Aggregate AggregateMemoryInfo `json:"aggregate"`
}
// gpusOnce probes the host's graphics cards via ghw exactly once per
// process; all subsequent calls return the cached slice and error.
// The local result is named "info" so it does not shadow the imported
// gpu package (ghw/pkg/gpu), whose GraphicsCard type is the element type.
var gpusOnce = sync.OnceValues(func() ([]*gpu.GraphicsCard, error) {
	info, err := ghw.GPU()
	if err != nil {
		return nil, err
	}
	return info.GraphicsCards, nil
})
// GPUs returns the graphics cards detected by ghw. The underlying probe
// runs once per process (sync.OnceValues); the same result (and error)
// is returned on every call.
func GPUs() ([]*gpu.GraphicsCard, error) {
	return gpusOnce()
}
// TotalAvailableVRAM returns the total VRAM in bytes across all detected
// GPUs. It prefers ghw's topology data and falls back to the vendor-tool
// probing done by GetGPUMemoryUsage when ghw reports nothing usable.
// Returns (0, nil) when no VRAM can be detected at all.
func TotalAvailableVRAM() (uint64, error) {
	// Preferred source: ghw per-node memory figures.
	if cards, err := GPUs(); err == nil {
		var total uint64
		for _, card := range cards {
			if card == nil || card.Node == nil || card.Node.Memory == nil {
				continue
			}
			if usable := card.Node.Memory.TotalUsableBytes; usable > 0 {
				total += uint64(usable)
			}
		}
		if total > 0 {
			return total, nil
		}
	}
	// Fallback: vendor binaries (nvidia-smi & co.). This works even when
	// the files ghw depends on are missing from the base image.
	var total uint64
	for _, info := range GetGPUMemoryUsage() {
		total += info.TotalVRAM
	}
	if total > 0 {
		xlog.Debug("VRAM detected via binary tools", "total_vram", total)
		return total, nil
	}
	// No VRAM detected anywhere.
	return 0, nil
}
// HasGPU reports whether a GPU matching the given vendor substring is
// present. An empty vendor matches any detected GPU.
//
// Matching is case-insensitive: the Vendor* constants are lowercase
// (e.g. "nvidia") while ghw card descriptions use mixed case (e.g.
// "NVIDIA Corporation"), so a case-sensitive Contains would never match
// them. This mirrors the case-insensitive matching in DetectGPUVendor.
func HasGPU(vendor string) bool {
	gpus, err := GPUs()
	if err != nil {
		return false
	}
	if vendor == "" {
		return len(gpus) > 0
	}
	needle := strings.ToLower(vendor)
	for _, card := range gpus {
		if strings.Contains(strings.ToLower(card.String()), needle) {
			return true
		}
	}
	return false
}
// DetectGPUVendor detects the GPU vendor using multiple methods with
// fallbacks: first the ghw library's PCI vendor strings, then the
// presence of vendor management binaries, then NVIDIA SoC sysfs, Vulkan,
// and finally Apple Silicon. Returns one of VendorNVIDIA, VendorAMD,
// VendorIntel, VendorVulkan, VendorApple, or "" if nothing is detected.
// Priority order: NVIDIA > AMD > Intel > Vulkan.
func DetectGPUVendor() (string, error) {
	// Preferred source: ghw's PCI vendor names, matched case-insensitively
	// against each vendor in priority order.
	if cards, err := GPUs(); err == nil && len(cards) > 0 {
		for _, card := range cards {
			if card.DeviceInfo == nil || card.DeviceInfo.Vendor == nil {
				continue
			}
			upperName := strings.ToUpper(card.DeviceInfo.Vendor.Name)
			for _, vendor := range []string{VendorNVIDIA, VendorAMD, VendorIntel} {
				if strings.Contains(upperName, strings.ToUpper(vendor)) {
					xlog.Debug("GPU vendor detected via ghw", "vendor", vendor)
					return vendor, nil
				}
			}
		}
	}
	// Fallback: look for vendor management binaries, highest priority first.
	probes := []struct {
		binary string
		vendor string
	}{
		{"nvidia-smi", VendorNVIDIA},
		{"rocm-smi", VendorAMD},
		{"xpu-smi", VendorIntel},
		{"intel_gpu_top", VendorIntel},
	}
	for _, p := range probes {
		if _, err := exec.LookPath(p.binary); err == nil {
			xlog.Debug("GPU vendor detected via binary", "vendor", p.vendor, "binary", p.binary)
			return p.vendor, nil
		}
	}
	// NVIDIA integrated GPU (Tegra / DGX Spark / Thor): nvidia-smi may be
	// absent or unreliable on these unified-memory SoCs, so check sysfs.
	if isNVIDIAIntegratedGPU() {
		xlog.Debug("GPU vendor detected via NVIDIA SoC", "vendor", VendorNVIDIA)
		return VendorNVIDIA, nil
	}
	// vulkaninfo has the lowest priority because it can detect any GPU.
	if _, err := exec.LookPath("vulkaninfo"); err == nil {
		xlog.Debug("GPU vendor detected via binary", "vendor", VendorVulkan, "binary", "vulkaninfo")
		return VendorVulkan, nil
	}
	// Apple Silicon (macOS).
	if len(getAppleGPUMemory()) > 0 {
		xlog.Debug("GPU vendor detected via system_profiler", "vendor", VendorApple)
		return VendorApple, nil
	}
	// Nothing matched.
	return "", nil
}
// isUnifiedMemoryDevice reports whether gpuName matches one of the known
// unified-memory device patterns in UnifiedMemoryDevices. The comparison
// is a case-insensitive substring match.
func isUnifiedMemoryDevice(gpuName string) bool {
	upperName := strings.ToUpper(gpuName)
	matched := false
	for _, pattern := range UnifiedMemoryDevices {
		if strings.Contains(upperName, strings.ToUpper(pattern)) {
			matched = true
			break
		}
	}
	return matched
}
// GetGPUMemoryUsage returns real-time GPU memory usage for all detected
// GPUs, probing vendor-specific tools in order: NVIDIA, AMD, Intel.
// When none of those report anything it falls back, in turn, to NVIDIA
// integrated SoCs, Vulkan, and Apple Silicon. Returns an empty slice if
// no GPU monitoring mechanism is available.
func GetGPUMemoryUsage() []GPUMemoryInfo {
	gpus := getNVIDIAGPUMemory()
	// XXX: Note - I could not test this with AMD and Intel GPUs, so I'm not
	// sure if it works and it was added with the help of AI.
	for _, extra := range [][]GPUMemoryInfo{getAMDGPUMemory(), getIntelGPUMemory()} {
		// Re-index so device indices keep increasing across vendors.
		base := len(gpus)
		for i := range extra {
			extra[i].Index = base + i
		}
		gpus = append(gpus, extra...)
	}
	// NVIDIA integrated GPUs (Tegra Jetson, DGX Spark, Thor — unified
	// memory): these either lack nvidia-smi or have it behave unreliably,
	// so they are detected via SoC sysfs and report system RAM figures.
	if len(gpus) == 0 {
		gpus = append(gpus, getNVIDIAIntegratedGPUMemory()...)
	}
	// Vulkan fallback for device detection (limited real-time data).
	if len(gpus) == 0 {
		gpus = append(gpus, getVulkanGPUMemory()...)
	}
	// Apple Silicon (macOS only).
	if len(gpus) == 0 {
		gpus = append(gpus, getAppleGPUMemory()...)
	}
	return gpus
}
// GetGPUAggregateInfo sums the per-device figures from GetGPUMemoryUsage
// into a single aggregate across all GPUs.
func GetGPUAggregateInfo() GPUAggregateInfo {
	infos := GetGPUMemoryUsage()
	agg := GPUAggregateInfo{GPUCount: len(infos)}
	for _, info := range infos {
		agg.TotalVRAM += info.TotalVRAM
		agg.UsedVRAM += info.UsedVRAM
		agg.FreeVRAM += info.FreeVRAM
	}
	// Avoid a division by zero when nothing reported any VRAM.
	if agg.TotalVRAM > 0 {
		agg.UsagePercent = float64(agg.UsedVRAM) / float64(agg.TotalVRAM) * 100
	}
	return agg
}
// getNVIDIAGPUMemory queries NVIDIA GPUs using nvidia-smi.
// It parses the CSV output of --query-gpu (index, name, memory.total,
// memory.used, memory.free; MiB figures with nounits) and converts to
// bytes. Devices that report "[N/A]" for memory are handled specially:
// known unified-memory devices and NVIDIA integrated SoCs fall back to
// system RAM figures; unknown devices are listed with zero memory info.
// Returns nil when nvidia-smi is missing or fails.
func getNVIDIAGPUMemory() []GPUMemoryInfo {
	// Check if nvidia-smi is available
	if _, err := exec.LookPath("nvidia-smi"); err != nil {
		return nil
	}
	cmd := exec.Command("nvidia-smi",
		"--query-gpu=index,name,memory.total,memory.used,memory.free",
		"--format=csv,noheader,nounits")
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr
	if err := cmd.Run(); err != nil {
		// nvidia-smi can fail intermittently inside containers (e.g.
		// "waitid: no child processes" without --init); returning nil lets
		// callers fall through to the other detection paths.
		xlog.Debug("nvidia-smi failed", "error", err, "stderr", stderr.String())
		return nil
	}
	var gpus []GPUMemoryInfo
	// One CSV row per GPU.
	lines := strings.Split(strings.TrimSpace(stdout.String()), "\n")
	for _, line := range lines {
		if line == "" {
			continue
		}
		parts := strings.Split(line, ", ")
		if len(parts) < 5 {
			// Malformed row — skip rather than guess.
			continue
		}
		idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
		name := strings.TrimSpace(parts[1])
		totalStr := strings.TrimSpace(parts[2])
		usedStr := strings.TrimSpace(parts[3])
		freeStr := strings.TrimSpace(parts[4])
		var totalBytes, usedBytes, freeBytes uint64
		var usagePercent float64
		// Check if memory values are N/A (unified memory devices like GB10)
		isNA := totalStr == "[N/A]" || usedStr == "[N/A]" || freeStr == "[N/A]"
		if isNA && isUnifiedMemoryDevice(name) {
			// Unified memory device - fall back to system RAM
			sysInfo, err := GetSystemRAMInfo()
			if err != nil {
				xlog.Debug("failed to get system RAM for unified memory device", "error", err, "device", name)
				// Still add the GPU but with zero memory info
				gpus = append(gpus, GPUMemoryInfo{
					Index:        idx,
					Name:         name,
					Vendor:       VendorNVIDIA,
					TotalVRAM:    0,
					UsedVRAM:     0,
					FreeVRAM:     0,
					UsagePercent: 0,
				})
				continue
			}
			totalBytes = sysInfo.Total
			usedBytes = sysInfo.Used
			freeBytes = sysInfo.Free
			if totalBytes > 0 {
				usagePercent = float64(usedBytes) / float64(totalBytes) * 100
			}
		} else if isNA {
			// Check if this is an NVIDIA integrated / unified-memory SoC — if so,
			// fall back to system RAM (covers Jetson, DGX Spark/GB10, Thor).
			if isNVIDIAIntegratedGPU() {
				xlog.Debug("nvidia-smi returned N/A on NVIDIA integrated GPU, using system RAM", "device", name)
				sysInfo, err := GetSystemRAMInfo()
				if err != nil {
					// RAM lookup failed too — list the GPU with zero memory info.
					xlog.Debug("failed to get system RAM for NVIDIA integrated GPU", "error", err, "device", name)
					gpus = append(gpus, GPUMemoryInfo{
						Index:        idx,
						Name:         name,
						Vendor:       VendorNVIDIA,
						TotalVRAM:    0,
						UsedVRAM:     0,
						FreeVRAM:     0,
						UsagePercent: 0,
					})
					continue
				}
				totalBytes = sysInfo.Total
				usedBytes = sysInfo.Used
				freeBytes = sysInfo.Free
				if totalBytes > 0 {
					usagePercent = float64(usedBytes) / float64(totalBytes) * 100
				}
			} else {
				// Truly unknown device with N/A values - skip memory info
				xlog.Debug("nvidia-smi returned N/A for unknown device", "device", name)
				gpus = append(gpus, GPUMemoryInfo{
					Index:        idx,
					Name:         name,
					Vendor:       VendorNVIDIA,
					TotalVRAM:    0,
					UsedVRAM:     0,
					FreeVRAM:     0,
					UsagePercent: 0,
				})
				continue
			}
		} else {
			// Normal GPU with dedicated VRAM
			totalMB, _ := strconv.ParseFloat(totalStr, 64)
			usedMB, _ := strconv.ParseFloat(usedStr, 64)
			freeMB, _ := strconv.ParseFloat(freeStr, 64)
			// Convert MB to bytes
			totalBytes = uint64(totalMB * 1024 * 1024)
			usedBytes = uint64(usedMB * 1024 * 1024)
			freeBytes = uint64(freeMB * 1024 * 1024)
			if totalBytes > 0 {
				usagePercent = float64(usedBytes) / float64(totalBytes) * 100
			}
		}
		gpus = append(gpus, GPUMemoryInfo{
			Index:        idx,
			Name:         name,
			Vendor:       VendorNVIDIA,
			TotalVRAM:    totalBytes,
			UsedVRAM:     usedBytes,
			FreeVRAM:     freeBytes,
			UsagePercent: usagePercent,
		})
	}
	return gpus
}
// getAMDGPUMemory queries AMD GPUs using rocm-smi.
// It parses the CSV output of "rocm-smi --showmeminfo vram --csv" and
// returns nil when rocm-smi is missing or fails.
// NOTE(review): assumes the CSV columns are (device, total, used) —
// confirm against the installed rocm-smi version.
func getAMDGPUMemory() []GPUMemoryInfo {
	// Check if rocm-smi is available
	if _, err := exec.LookPath("rocm-smi"); err != nil {
		return nil
	}
	// Try CSV format first
	cmd := exec.Command("rocm-smi", "--showmeminfo", "vram", "--csv")
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr
	if err := cmd.Run(); err != nil {
		xlog.Debug("rocm-smi failed", "error", err, "stderr", stderr.String())
		return nil
	}
	var gpus []GPUMemoryInfo
	lines := strings.Split(strings.TrimSpace(stdout.String()), "\n")
	// Skip header line
	for i, line := range lines {
		if i == 0 || line == "" {
			continue
		}
		parts := strings.Split(line, ",")
		if len(parts) < 3 {
			continue
		}
		// Parse GPU index from first column (usually "GPU[0]" format)
		idxStr := strings.TrimSpace(parts[0])
		idx := 0
		if strings.HasPrefix(idxStr, "GPU[") {
			idxStr = strings.TrimPrefix(idxStr, "GPU[")
			idxStr = strings.TrimSuffix(idxStr, "]")
			idx, _ = strconv.Atoi(idxStr)
		}
		// Parse memory values (in bytes or MB depending on rocm-smi version)
		usedBytes, _ := strconv.ParseUint(strings.TrimSpace(parts[2]), 10, 64)
		totalBytes, _ := strconv.ParseUint(strings.TrimSpace(parts[1]), 10, 64)
		// If values seem like MB, convert to bytes
		// (heuristic: any real total expressed in bytes is >= 1e6).
		if totalBytes < 1000000 {
			usedBytes *= 1024 * 1024
			totalBytes *= 1024 * 1024
		}
		// Guard against used > total (avoids uint underflow).
		freeBytes := uint64(0)
		if totalBytes > usedBytes {
			freeBytes = totalBytes - usedBytes
		}
		usagePercent := 0.0
		if totalBytes > 0 {
			usagePercent = float64(usedBytes) / float64(totalBytes) * 100
		}
		gpus = append(gpus, GPUMemoryInfo{
			Index:        idx,
			Name:         "AMD GPU",
			Vendor:       VendorAMD,
			TotalVRAM:    totalBytes,
			UsedVRAM:     usedBytes,
			FreeVRAM:     freeBytes,
			UsagePercent: usagePercent,
		})
	}
	return gpus
}
// getIntelGPUMemory queries Intel GPUs, preferring xpu-smi (Intel's
// official GPU management tool) and falling back to intel_gpu_top when
// xpu-smi reports nothing.
func getIntelGPUMemory() []GPUMemoryInfo {
	if viaXPU := getIntelXPUSMI(); len(viaXPU) > 0 {
		return viaXPU
	}
	return getIntelGPUTop()
}
// getIntelXPUSMI queries Intel GPUs using xpu-smi.
// It first enumerates devices via "xpu-smi discovery --json", then asks
// "xpu-smi stats" per device for used memory. Returns nil when xpu-smi
// is missing, fails, or its discovery output cannot be parsed.
func getIntelXPUSMI() []GPUMemoryInfo {
	if _, err := exec.LookPath("xpu-smi"); err != nil {
		return nil
	}
	// Get device list
	cmd := exec.Command("xpu-smi", "discovery", "--json")
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr
	if err := cmd.Run(); err != nil {
		xlog.Debug("xpu-smi discovery failed", "error", err, "stderr", stderr.String())
		return nil
	}
	// Parse JSON output
	var result struct {
		DeviceList []struct {
			DeviceID                int    `json:"device_id"`
			DeviceName              string `json:"device_name"`
			VendorName              string `json:"vendor_name"`
			MemoryPhysicalSizeBytes uint64 `json:"memory_physical_size_byte"`
		} `json:"device_list"`
	}
	if err := json.Unmarshal(stdout.Bytes(), &result); err != nil {
		xlog.Debug("failed to parse xpu-smi discovery output", "error", err)
		return nil
	}
	var gpus []GPUMemoryInfo
	for _, device := range result.DeviceList {
		// Get memory usage for this device. A stats failure is non-fatal:
		// used memory simply stays 0 for that device.
		statsCmd := exec.Command("xpu-smi", "stats", "-d", strconv.Itoa(device.DeviceID), "--json")
		var statsStdout bytes.Buffer
		statsCmd.Stdout = &statsStdout
		usedBytes := uint64(0)
		if err := statsCmd.Run(); err == nil {
			var stats struct {
				DeviceID   int    `json:"device_id"`
				MemoryUsed uint64 `json:"memory_used"`
			}
			if err := json.Unmarshal(statsStdout.Bytes(), &stats); err == nil {
				usedBytes = stats.MemoryUsed
			}
		}
		totalBytes := device.MemoryPhysicalSizeBytes
		// Guard against used > total (avoids uint underflow).
		freeBytes := uint64(0)
		if totalBytes > usedBytes {
			freeBytes = totalBytes - usedBytes
		}
		usagePercent := 0.0
		if totalBytes > 0 {
			usagePercent = float64(usedBytes) / float64(totalBytes) * 100
		}
		gpus = append(gpus, GPUMemoryInfo{
			Index:        device.DeviceID,
			Name:         device.DeviceName,
			Vendor:       VendorIntel,
			TotalVRAM:    totalBytes,
			UsedVRAM:     usedBytes,
			FreeVRAM:     freeBytes,
			UsagePercent: usagePercent,
		})
	}
	return gpus
}
// getIntelGPUTop queries Intel GPUs using intel_gpu_top.
// NOTE: intel_gpu_top's JSON output does not reliably include memory
// figures, so this function currently always returns nil after verifying
// that the tool runs and emits parseable JSON. It exists as a placeholder
// fallback behind getIntelXPUSMI.
func getIntelGPUTop() []GPUMemoryInfo {
	if _, err := exec.LookPath("intel_gpu_top"); err != nil {
		return nil
	}
	// intel_gpu_top with -J outputs JSON, -s 1 for single sample
	cmd := exec.Command("intel_gpu_top", "-J", "-s", "1")
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr
	if err := cmd.Run(); err != nil {
		xlog.Debug("intel_gpu_top failed", "error", err, "stderr", stderr.String(), "stdout", stdout.String())
		return nil
	}
	// Parse JSON output - intel_gpu_top outputs NDJSON
	lines := strings.Split(strings.TrimSpace(stdout.String()), "\n")
	if len(lines) == 0 {
		return nil
	}
	// Take the last complete JSON object
	var lastJSON string
	for i := len(lines) - 1; i >= 0; i-- {
		if strings.HasPrefix(strings.TrimSpace(lines[i]), "{") {
			lastJSON = lines[i]
			break
		}
	}
	if lastJSON == "" {
		return nil
	}
	var result struct {
		Engines map[string]any `json:"engines"`
		// Memory info if available
	}
	if err := json.Unmarshal([]byte(lastJSON), &result); err != nil {
		xlog.Debug("failed to parse intel_gpu_top output", "error", err)
		return nil
	}
	// intel_gpu_top doesn't always provide memory info
	// Return empty if we can't get useful data
	return nil
}
// isNVIDIAIntegratedGPU reports whether the host is an NVIDIA SoC with an
// integrated GPU that shares system RAM (unified memory). Covers the
// Jetson Tegra family (Orin, Xavier, Nano, AGX Thor) and SBSA-style
// NVIDIA SoCs such as the DGX Spark (GB10). nvidia-smi may be absent or
// unreliable on these hosts (notably under docker without the NVML
// capability), so detection goes through sysfs, which works both on the
// host and inside containers that mount /sys normally.
func isNVIDIAIntegratedGPU() bool {
	// Jetson/Tegra hosts expose the SoC family name directly.
	if family, err := os.ReadFile("/sys/devices/soc0/family"); err == nil &&
		strings.TrimSpace(string(family)) == "Tegra" {
		return true
	}
	// SBSA-style NVIDIA SoCs expose a JEDEC code instead:
	// manufacturer 0x0426 = NVIDIA ("jep106:0426[:<soc>]").
	if socID, err := os.ReadFile("/sys/devices/soc0/soc_id"); err == nil &&
		strings.HasPrefix(strings.TrimSpace(string(socID)), "jep106:0426") {
		return true
	}
	return false
}
// nvidiaIntegratedGPUName derives a human-readable device name for an
// NVIDIA unified-memory SoC without relying on nvidia-smi. Sources are
// tried in priority order: device-tree model (populated on Jetson),
// soc0/machine (some Jetson devkits), soc_id lookup (SBSA SoCs expose
// JEDEC IDs), and finally generic fallbacks.
func nvidiaIntegratedGPUName() string {
	// 1. Device-tree model.
	if raw, err := os.ReadFile("/proc/device-tree/model"); err == nil {
		// Device-tree strings are NUL-terminated; strip trailing padding.
		if name := strings.TrimRight(string(raw), "\x00 \n"); name != "" {
			return name
		}
	}
	// 2. soc0/machine.
	if raw, err := os.ReadFile("/sys/devices/soc0/machine"); err == nil {
		if name := strings.TrimSpace(string(raw)); name != "" {
			return name
		}
	}
	// 3. JEDEC soc_id lookup (0x0426 = NVIDIA).
	if raw, err := os.ReadFile("/sys/devices/soc0/soc_id"); err == nil {
		id := strings.TrimSpace(string(raw))
		if strings.HasPrefix(id, "jep106:0426:8901") {
			return "NVIDIA GB10"
		}
		if strings.HasPrefix(id, "jep106:0426") {
			return "NVIDIA iGPU"
		}
	}
	// 4. Generic fallbacks.
	if raw, err := os.ReadFile("/sys/devices/soc0/family"); err == nil {
		if strings.TrimSpace(string(raw)) == "Tegra" {
			return "NVIDIA Jetson"
		}
	}
	return "NVIDIA iGPU"
}
// getNVIDIAIntegratedGPUMemory detects NVIDIA unified-memory integrated
// GPUs (Jetson, DGX Spark/GB10, Thor) and reports system RAM figures as
// VRAM. Used as a fallback when nvidia-smi is missing or failing.
// Returns nil on non-NVIDIA-SoC hosts; returns the device with zero
// memory figures when the RAM lookup itself fails.
func getNVIDIAIntegratedGPUMemory() []GPUMemoryInfo {
	if !isNVIDIAIntegratedGPU() {
		return nil
	}
	device := GPUMemoryInfo{
		Index:  0,
		Name:   nvidiaIntegratedGPUName(),
		Vendor: VendorNVIDIA,
	}
	ram, err := GetSystemRAMInfo()
	if err != nil {
		xlog.Debug("NVIDIA integrated GPU detected but failed to get system RAM", "error", err, "device", device.Name)
		return []GPUMemoryInfo{device}
	}
	// Unified memory: report system RAM as VRAM.
	device.TotalVRAM = ram.Total
	device.UsedVRAM = ram.Used
	device.FreeVRAM = ram.Free
	if ram.Total > 0 {
		device.UsagePercent = float64(ram.Used) / float64(ram.Total) * 100
	}
	xlog.Debug("NVIDIA integrated GPU detected (unified memory)", "device", device.Name, "total_ram", ram.Total)
	return []GPUMemoryInfo{device}
}
// GetResourceInfo returns GPU memory info when any GPU is detected,
// otherwise system RAM info. When neither is obtainable, it returns a
// "ram" entry marked unavailable with a zero aggregate.
func GetResourceInfo() ResourceInfo {
	if gpus := GetGPUMemoryUsage(); len(gpus) > 0 {
		// GPU path.
		agg := GetGPUAggregateInfo()
		return ResourceInfo{
			Type:      "gpu",
			Available: true,
			GPUs:      gpus,
			RAM:       nil,
			Aggregate: AggregateMemoryInfo{
				TotalMemory:  agg.TotalVRAM,
				UsedMemory:   agg.UsedVRAM,
				FreeMemory:   agg.FreeVRAM,
				UsagePercent: agg.UsagePercent,
				GPUCount:     agg.GPUCount,
			},
		}
	}
	// No GPU — fall back to system RAM.
	ram, err := GetSystemRAMInfo()
	if err != nil {
		xlog.Debug("failed to get system RAM info", "error", err)
		return ResourceInfo{
			Type:      "ram",
			Available: false,
			Aggregate: AggregateMemoryInfo{},
		}
	}
	return ResourceInfo{
		Type:      "ram",
		Available: true,
		GPUs:      nil,
		RAM:       ram,
		Aggregate: AggregateMemoryInfo{
			TotalMemory:  ram.Total,
			UsedMemory:   ram.Used,
			FreeMemory:   ram.Free,
			UsagePercent: ram.UsagePercent,
			GPUCount:     0,
		},
	}
}
// GetResourceAggregateInfo returns the aggregate memory figures (GPU if
// available, otherwise system RAM). Used by the memory reclaimer to
// check memory usage.
func GetResourceAggregateInfo() AggregateMemoryInfo {
	return GetResourceInfo().Aggregate
}
// getVulkanGPUMemory queries GPUs using vulkaninfo as a fallback.
// Note: Vulkan provides memory heap info but not real-time usage, so
// UsedVRAM is always reported as 0 and FreeVRAM equals TotalVRAM.
// Returns nil when vulkaninfo is missing, fails, or emits unparseable JSON.
func getVulkanGPUMemory() []GPUMemoryInfo {
	if _, err := exec.LookPath("vulkaninfo"); err != nil {
		return nil
	}
	cmd := exec.Command("vulkaninfo", "--json")
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr
	if err := cmd.Run(); err != nil {
		xlog.Debug("vulkaninfo failed", "error", err, "stderr", stderr.String())
		return nil
	}
	// Parse Vulkan JSON output
	var result struct {
		VkPhysicalDevices []struct {
			DeviceName                       string `json:"deviceName"`
			DeviceType                       string `json:"deviceType"`
			VkPhysicalDeviceMemoryProperties struct {
				MemoryHeaps []struct {
					Flags int    `json:"flags"`
					Size  uint64 `json:"size"`
				} `json:"memoryHeaps"`
			} `json:"VkPhysicalDeviceMemoryProperties"`
		} `json:"VkPhysicalDevices"`
	}
	if err := json.Unmarshal(stdout.Bytes(), &result); err != nil {
		xlog.Debug("failed to parse vulkaninfo output", "error", err)
		return nil
	}
	var gpus []GPUMemoryInfo
	for i, device := range result.VkPhysicalDevices {
		// Skip non-discrete/integrated GPUs if possible
		if device.DeviceType == "VK_PHYSICAL_DEVICE_TYPE_CPU" {
			continue
		}
		// Sum up device-local memory heaps
		var totalVRAM uint64
		for _, heap := range device.VkPhysicalDeviceMemoryProperties.MemoryHeaps {
			// Flag 1 = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT
			if heap.Flags&1 != 0 {
				totalVRAM += heap.Size
			}
		}
		// Devices with no device-local memory are not useful here.
		if totalVRAM == 0 {
			continue
		}
		gpus = append(gpus, GPUMemoryInfo{
			Index:        i,
			Name:         device.DeviceName,
			Vendor:       VendorVulkan,
			TotalVRAM:    totalVRAM,
			UsedVRAM:     0, // Vulkan doesn't provide real-time usage
			FreeVRAM:     totalVRAM,
			UsagePercent: 0,
		})
	}
	return gpus
}
// getAppleGPUMemory detects Apple Silicon GPUs using system_profiler (macOS only).
// Apple Silicon uses unified memory, so GPU memory is reported as system RAM.
// Non-Apple and non-GPU display entries are filtered out. Returns nil when
// system_profiler is missing, fails, or emits unparseable JSON.
func getAppleGPUMemory() []GPUMemoryInfo {
	if _, err := exec.LookPath("system_profiler"); err != nil {
		return nil
	}
	cmd := exec.Command("system_profiler", "SPDisplaysDataType", "-json")
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr
	if err := cmd.Run(); err != nil {
		xlog.Debug("system_profiler failed", "error", err, "stderr", stderr.String())
		return nil
	}
	var result struct {
		SPDisplaysDataType []struct {
			Name       string `json:"_name"`
			Model      string `json:"sppci_model"`
			Cores      string `json:"sppci_cores"`
			DeviceType string `json:"sppci_device_type"`
			Vendor     string `json:"spdisplays_vendor"`
		} `json:"SPDisplaysDataType"`
	}
	if err := json.Unmarshal(stdout.Bytes(), &result); err != nil {
		xlog.Debug("failed to parse system_profiler output", "error", err)
		return nil
	}
	var gpus []GPUMemoryInfo
	for i, display := range result.SPDisplaysDataType {
		// Only GPU entries from Apple itself.
		if display.DeviceType != "spdisplays_gpu" {
			continue
		}
		if !strings.Contains(strings.ToLower(display.Vendor), "apple") {
			continue
		}
		// Pick the most specific available name.
		name := display.Model
		if name == "" {
			name = display.Name
		}
		if name == "" {
			name = "Apple GPU"
		}
		// Apple Silicon uses unified memory — report system RAM
		ramInfo, err := GetSystemRAMInfo()
		if err != nil {
			// RAM lookup failed: still list the GPU, with zero memory figures.
			xlog.Debug("Apple GPU detected but failed to get system RAM", "error", err)
			gpus = append(gpus, GPUMemoryInfo{
				Index:  i,
				Name:   name,
				Vendor: VendorApple,
			})
			continue
		}
		usagePercent := 0.0
		if ramInfo.Total > 0 {
			usagePercent = float64(ramInfo.Used) / float64(ramInfo.Total) * 100
		}
		xlog.Debug("Apple Silicon GPU detected (unified memory)", "device", name, "total_ram", ramInfo.Total)
		gpus = append(gpus, GPUMemoryInfo{
			Index:        i,
			Name:         name,
			Vendor:       VendorApple,
			TotalVRAM:    ramInfo.Total,
			UsedVRAM:     ramInfo.Used,
			FreeVRAM:     ramInfo.Free,
			UsagePercent: usagePercent,
		})
	}
	return gpus
}