Files
LocalAI/pkg/xsysinfo/drmfdinfo.go
Richard Palethorpe 90ea327178 fix(intel): VRAM detection (#9944)
* fix(gpu-detect): clinfo --json fallback for Intel discrete VRAM

ghw returns 0 VRAM for any i915-driven Intel GPU because the kernel
driver doesn't expose VRAM through the sysfs paths ghw checks (no
mem_info_vram_total — that's an amdgpu interface). xpu-smi, the
canonical Intel tool, isn't in the oneAPI base image (it lives in a
separate xpumanager package). The capability gate added in 19c92c70
("default to CPU if there is less than 4GB of GPU available") then
demotes the host to CPU even on a 16 GB Arc A770.

clinfo ships with the OpenCL ICD loader and is present in the oneAPI
base image, so plug it in as the last-resort Intel VRAM source:

  xpu-smi -> intel_gpu_top -> clinfo --json

The parser drops UMA devices via HOST_UNIFIED_MEMORY=true so an iGPU
sibling can't double-count system RAM, and dedups by PCI BDF when
multiple ICDs enumerate the same physical device (POCL caps reported
GLOBAL_MEM_SIZE at 4 GiB; the largest non-capped value wins).

Subprocess is wrapped in a 2s timeout and memoised with sync.OnceValue
— GPU hardware is static for the process lifetime. The Intel branch
also short-circuits when ghw saw no Intel vendor, so NVIDIA-only hosts
don't pay the spawn cost.

Verified end-to-end on Intel Arc A770: ghw -> 0, clinfo path reports
16,225,243,136 bytes (15.11 GiB), capability gate now passes naturally
without LOCALAI_FORCE_META_BACKEND_CAPABILITY=intel.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Signed-off-by: Richard Palethorpe <io@richiejp.com>

* feat(gpu-detect): live VRAM usage from DRM fdinfo

The clinfo fallback reports total VRAM correctly but leaves UsedVRAM
at 0 because OpenCL has no portable live-memory property — the UI
ends up showing 0% utilisation even when llama-cpp is actually
holding gigabytes in device memory.

Fill that gap with the standardised Linux DRM fdinfo interface
(Documentation/gpu/drm-usage-stats.rst, kernel ≥5.19). Walking
/proc/<pid>/fdinfo for any fd that points at /dev/dri/render* yields
drm-total-<region> / drm-resident-<region> keys; aggregate per
render-node, resolve the render node to a PCI BDF via
/sys/class/drm/<name>/device, and merge the result into the matching
GPUMemoryInfo by BDF.

Region naming is driver-defined — i915 uses "local0" for device-local
VRAM, amdgpu and xe use "vram0" — so a prefix-match on local/vram
covers all three DRM drivers that LocalAI cares about. system/gtt/
stolen regions are deliberately excluded since they're host RAM
mirrors and would double-count against system RAM.

GPUMemoryInfo gains an optional BDF field (`bdf,omitempty` in JSON)
so future vendor-specific detectors can plug into the same matcher.
Empty BDF skips the merge — non-PCI devices and detection paths that
don't surface PCI location keep their existing behaviour.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: Richard Palethorpe <io@richiejp.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 09:29:00 +02:00

148 lines
3.9 KiB
Go

package xsysinfo
import (
"bufio"
"bytes"
"os"
"path/filepath"
"strconv"
"strings"
)
// drmFdInfoUsageByBDF reports per-GPU VRAM usage gathered from the DRM
// fdinfo files of processes holding /dev/dri/render* descriptors. The
// result is keyed by the PCI BDF (dddd:bb:dd.f) behind each render node
// so callers can line it up with any other GPU detection result.
//
// The underlying accounting comes from the standardised fdinfo keys
// (Documentation/gpu/drm-usage-stats.rst, kernel ≥5.19):
//
//	drm-total-<region>:    bytes the process has bound to <region>
//	drm-resident-<region>: bytes currently resident in <region>
//
// Region names are chosen by the driver: i915 reports device-local
// VRAM as "local*", amdgpu and xe as "vram*". Only regions with one of
// those two prefixes are summed; "system*" / "gtt*" / "stolen-*" are
// left out because they mirror host RAM.
//
// The per-render-node totals come from drmFdInfoUsageByRenderNode and
// each node name is translated with renderNodeBDF. Nodes whose BDF
// cannot be resolved are dropped, and several nodes resolving to the
// same BDF are added together. Returns nil when the per-node map is
// empty (no render fd found, or /proc not readable at all).
func drmFdInfoUsageByBDF() map[string]uint64 {
	perNode := drmFdInfoUsageByRenderNode()
	if len(perNode) == 0 {
		return nil
	}

	perBDF := make(map[string]uint64, len(perNode))
	for node, bytesUsed := range perNode {
		if bdf := renderNodeBDF(node); bdf != "" {
			perBDF[bdf] += bytesUsed
		}
	}
	return perBDF
}
// drmFdInfoUsageByRenderNode scans every /proc/<pid>/fd directory for
// descriptors whose symlink target starts with /dev/dri/render, reads
// the matching /proc/<pid>/fdinfo/<fd> blob and accumulates the VRAM
// bytes reported by parseDRMFdInfoVRAM per render-node basename
// (e.g. "renderD128").
//
// One DRM client (a single open of the device node) can be reachable
// through several descriptors: dup(2), fork(2) and fd passing all alias
// the same kernel-side file. The kernel's drm-usage-stats documentation
// tells userspace to use drm-client-id to avoid double accounting, so
// each (render node, client id) pair contributes once. Blobs that carry
// no drm-client-id cannot be deduplicated and are counted per fd.
//
// Returns nil when no /proc/<pid>/fd directory matches. Entries that
// cannot be read (other users' processes, PIDs that exited mid-scan)
// are skipped silently.
func drmFdInfoUsageByRenderNode() map[string]uint64 {
	// Glob only fails on a malformed pattern; this one is a constant, so
	// the error carries no information.
	procs, _ := filepath.Glob("/proc/[0-9]*/fd")
	if len(procs) == 0 {
		return nil
	}

	const renderPrefix = "/dev/dri/render"
	out := map[string]uint64{}
	// seen records "<render node>\x00<client id>" pairs already counted.
	seen := map[string]struct{}{}
	for _, fdDir := range procs {
		pidDir := filepath.Dir(fdDir)
		entries, err := os.ReadDir(fdDir)
		if err != nil {
			// /proc race: process exited or unreadable. Skip silently.
			continue
		}
		for _, entry := range entries {
			target, err := os.Readlink(filepath.Join(fdDir, entry.Name()))
			if err != nil {
				continue
			}
			if !strings.HasPrefix(target, renderPrefix) {
				continue
			}
			renderName := strings.TrimPrefix(target, "/dev/dri/")
			data, err := os.ReadFile(filepath.Join(pidDir, "fdinfo", entry.Name()))
			if err != nil {
				continue
			}
			if id := drmFdInfoClientID(data); id != "" {
				key := renderName + "\x00" + id
				if _, dup := seen[key]; dup {
					// Same DRM client seen through another fd.
					continue
				}
				seen[key] = struct{}{}
			}
			out[renderName] += parseDRMFdInfoVRAM(data)
		}
	}
	return out
}

// drmFdInfoClientID returns the trimmed value of the drm-client-id key
// in an fdinfo blob, or "" when the key is absent.
func drmFdInfoClientID(data []byte) string {
	sc := bufio.NewScanner(bytes.NewReader(data))
	for sc.Scan() {
		key, value, ok := strings.Cut(sc.Text(), ":")
		if ok && key == "drm-client-id" {
			return strings.TrimSpace(value)
		}
	}
	return ""
}
// parseDRMFdInfoVRAM scans one fdinfo blob line by line and adds up
// the values of every `drm-total-<region>` key whose region is accepted
// by isVRAMRegion. Each value is converted with parseDRMFdInfoBytes, so
// both "<number> <KiB|MiB|GiB>" and bare byte counts are understood.
// Lines without a ':' separator or with a different key are ignored.
func parseDRMFdInfoVRAM(data []byte) uint64 {
	const keyPrefix = "drm-total-"

	var sum uint64
	scanner := bufio.NewScanner(bytes.NewReader(data))
	for scanner.Scan() {
		text := scanner.Text()
		if !strings.HasPrefix(text, keyPrefix) {
			continue
		}
		name, amount, found := strings.Cut(text, ":")
		if found && isVRAMRegion(strings.TrimPrefix(name, keyPrefix)) {
			sum += parseDRMFdInfoBytes(amount)
		}
	}
	return sum
}
// isVRAMRegion reports whether a DRM memory-region name begins with
// one of the prefixes treated as device-local VRAM ("local" or "vram").
// The match is case-sensitive.
func isVRAMRegion(region string) bool {
	for _, prefix := range [...]string{"local", "vram"} {
		if strings.HasPrefix(region, prefix) {
			return true
		}
	}
	return false
}
// parseDRMFdInfoBytes converts the value part of a DRM fdinfo line into
// a byte count. The first whitespace-separated field must be an
// unsigned decimal integer; an optional second field selects the unit
// (KiB, MiB or GiB, matched case-insensitively). A missing or
// unrecognised unit leaves the number as bare bytes.
//
// Returns 0 when the value is empty, the number does not parse, or the
// scaled result would not fit in a uint64 (treated as malformed rather
// than silently wrapping around).
func parseDRMFdInfoBytes(value string) uint64 {
	fields := strings.Fields(value)
	if len(fields) == 0 {
		return 0
	}
	n, err := strconv.ParseUint(fields[0], 10, 64)
	if err != nil {
		return 0
	}
	if len(fields) < 2 {
		return n
	}

	// Units are powers of 1024, so scaling is a left shift.
	var shift uint
	switch strings.ToLower(fields[1]) {
	case "kib":
		shift = 10
	case "mib":
		shift = 20
	case "gib":
		shift = 30
	default:
		return n
	}
	// Reject values whose high bits would be shifted out.
	if n > ^uint64(0)>>shift {
		return 0
	}
	return n << shift
}
// renderNodeBDF maps a DRM render-node basename (e.g. "renderD129") to
// a PCI BDF by reading the /sys/class/drm/<name>/device symlink and
// taking the last path element of its target. The element is accepted
// only if it contains exactly two ':' and one '.', the shape of
// dddd:bb:dd.f, and is returned lower-cased. Returns "" when the
// symlink cannot be read or the element does not have that shape.
func renderNodeBDF(name string) string {
	target, err := os.Readlink("/sys/class/drm/" + name + "/device")
	if err != nil {
		return ""
	}

	candidate := filepath.Base(target)
	colons := strings.Count(candidate, ":")
	dots := strings.Count(candidate, ".")
	if colons == 2 && dots == 1 {
		return strings.ToLower(candidate)
	}
	return ""
}