mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-29 11:07:18 -04:00
fix(intel): VRAM detection (#9944)
* fix(gpu-detect): clinfo --json fallback for Intel discrete VRAM
ghw returns 0 VRAM for any i915-driven Intel GPU because the kernel
driver doesn't expose VRAM through the sysfs paths ghw checks (no
mem_info_vram_total — that's an amdgpu interface). xpu-smi, the
canonical Intel tool, isn't in the oneAPI base image (it lives in a
separate xpumanager package). The capability gate added in 19c92c70
("default to CPU if there is less than 4GB of GPU available") then
demotes the host to CPU even on a 16 GB Arc A770.
clinfo ships with the OpenCL ICD loader and is present in the oneAPI
base image, so plug it in as the last-resort Intel VRAM source:
xpu-smi -> intel_gpu_top -> clinfo --json
The parser drops UMA devices via HOST_UNIFIED_MEMORY=true so an iGPU
sibling can't double-count system RAM, and dedups by PCI BDF when
multiple ICDs enumerate the same physical device (POCL caps the reported
GLOBAL_MEM_SIZE at 4 GiB, so the largest reported value wins).
Subprocess is wrapped in a 2s timeout and memoised with sync.OnceValue
— GPU hardware is static for the process lifetime. The Intel branch
also short-circuits when ghw saw no Intel vendor, so NVIDIA-only hosts
don't pay the spawn cost.
Verified end-to-end on Intel Arc A770: ghw -> 0, clinfo path reports
16,225,243,136 bytes (15.11 GiB), capability gate now passes naturally
without LOCALAI_FORCE_META_BACKEND_CAPABILITY=intel.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Signed-off-by: Richard Palethorpe <io@richiejp.com>
* feat(gpu-detect): live VRAM usage from DRM fdinfo
The clinfo fallback reports total VRAM correctly but leaves UsedVRAM
at 0 because OpenCL has no portable live-memory property — the UI
ends up showing 0% utilisation even when llama-cpp is actually
holding gigabytes in device memory.
Fill that gap with the standardised Linux DRM fdinfo interface
(Documentation/gpu/drm-usage-stats.rst, kernel ≥5.19). Walking
/proc/<pid>/fdinfo for any fd that points at /dev/dri/render* yields
drm-total-<region> / drm-resident-<region> keys; aggregate per
render-node, resolve the render node to a PCI BDF via
/sys/class/drm/<name>/device, and merge the result into the matching
GPUMemoryInfo by BDF.
Region naming is driver-defined — i915 uses "local0" for device-local
VRAM, amdgpu and xe use "vram0" — so a prefix-match on local/vram
covers all three DRM drivers that LocalAI cares about. system/gtt/
stolen regions are deliberately excluded since they're host RAM
mirrors and would double-count against system RAM.
GPUMemoryInfo gains an optional BDF field (`bdf,omitempty` in JSON)
so future vendor-specific detectors can plug into the same matcher.
Empty BDF skips the merge — non-PCI devices and detection paths that
don't surface PCI location keep their existing behaviour.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Signed-off-by: Richard Palethorpe <io@richiejp.com>
---------
Signed-off-by: Richard Palethorpe <io@richiejp.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
committed by
GitHub
parent
6a80e23733
commit
90ea327178
221
pkg/xsysinfo/clinfo.go
Normal file
221
pkg/xsysinfo/clinfo.go
Normal file
@@ -0,0 +1,221 @@
|
||||
package xsysinfo
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/mudler/xlog"
|
||||
)
|
||||
|
||||
// Fixed values used when running and decoding `clinfo --json`.
const (
|
||||
// clDeviceTypeGPU is the string isGPU looks for in a device's
// CL_DEVICE_TYPE "type" list.
clDeviceTypeGPU = "CL_DEVICE_TYPE_GPU"
|
||||
// clinfoTimeout is the deadline runCLInfo places on the clinfo
// subprocess.
clinfoTimeout = 2 * time.Second
|
||||
)
|
||||
|
||||
// clinfoOutput is the subset of `clinfo --json` we read. clinfo emits
|
||||
// one entry under "devices" per platform, in the same order as
|
||||
// "platforms"; live devices are under "online".
|
||||
type clinfoOutput struct {
|
||||
// Devices decodes the top-level "devices" array; parseCLInfoJSON
// walks every entry and every device under its "online" key.
Devices []struct {
|
||||
Online []clinfoDevice `json:"online"`
|
||||
} `json:"devices"`
|
||||
}
|
||||
|
||||
// clinfoDevice holds the per-device properties decoded from one
// `clinfo --json` device entry. The JSON tag on each field names the
// OpenCL property it is read from; properties missing from the JSON
// leave the field at its zero value.
type clinfoDevice struct {
|
||||
Name string `json:"CL_DEVICE_NAME"`
|
||||
// Vendor is only used by clinfoVendor as a name-based fallback when
// VendorID is not one of the IDs it recognises.
Vendor string `json:"CL_DEVICE_VENDOR"`
|
||||
VendorID uint32 `json:"CL_DEVICE_VENDOR_ID"`
|
||||
Type clinfoTypeProp `json:"CL_DEVICE_TYPE"`
|
||||
// HostUnifiedMemory marks devices that parseCLInfoJSON drops.
HostUnifiedMemory bool `json:"CL_DEVICE_HOST_UNIFIED_MEMORY"`
|
||||
// GlobalMemSize is reported as TotalVRAM; a zero value causes the
// device to be skipped.
GlobalMemSize uint64 `json:"CL_DEVICE_GLOBAL_MEM_SIZE"`
|
||||
// PCIBusInfoKHR is the string form of the PCI location, preferred by
// clinfoBDF over the integer fields below.
PCIBusInfoKHR string `json:"CL_DEVICE_PCI_BUS_INFO_KHR"`
|
||||
// The three *NV fields are the per-axis integer PCI location that
// clinfoBDF falls back to when no usable KHR string is present.
PCIDomainNV int `json:"CL_DEVICE_PCI_DOMAIN_ID_NV"`
|
||||
PCIBusNV int `json:"CL_DEVICE_PCI_BUS_ID_NV"`
|
||||
PCISlotNV int `json:"CL_DEVICE_PCI_SLOT_ID_NV"`
|
||||
}
|
||||
|
||||
// clinfoTypeProp matches against the type-string array rather than
|
||||
// CL_DEVICE_TYPE.raw so a future CL_DEVICE_TYPE_CUSTOM can't sneak
|
||||
// past as a GPU.
|
||||
type clinfoTypeProp struct {
|
||||
// Raw is decoded from the "raw" key but is not consulted by isGPU.
Raw uint32 `json:"raw"`
|
||||
// Type is the list of type strings decoded from the "type" key;
// isGPU searches it for clDeviceTypeGPU.
Type []string `json:"type"`
|
||||
}
|
||||
|
||||
func (t clinfoTypeProp) isGPU() bool {
|
||||
for _, s := range t.Type {
|
||||
if s == clDeviceTypeGPU {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// clinfoOnce caches the result for the process lifetime. GPU hardware
|
||||
// doesn't change between calls and the subprocess is ~150 ms.
|
||||
// Because sync.OnceValue memoises whatever runCLInfo returns, a nil
// result from a failed or timed-out first run is cached as well.
var clinfoOnce = sync.OnceValue(runCLInfo)
|
||||
|
||||
func runCLInfo() []GPUMemoryInfo {
|
||||
if _, err := exec.LookPath("clinfo"); err != nil {
|
||||
return nil
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), clinfoTimeout)
|
||||
defer cancel()
|
||||
cmd := exec.CommandContext(ctx, "clinfo", "--json")
|
||||
var stdout, stderr bytes.Buffer
|
||||
cmd.Stdout = &stdout
|
||||
cmd.Stderr = &stderr
|
||||
if err := cmd.Run(); err != nil {
|
||||
xlog.Debug("clinfo failed", "error", err, "stderr", stderr.String())
|
||||
return nil
|
||||
}
|
||||
return parseCLInfoJSON(stdout.Bytes())
|
||||
}
|
||||
|
||||
// getCLInfoGPUMemory is a best-effort fallback for hosts where the
|
||||
// vendor's own management binary (nvidia-smi / xpu-smi / rocm-smi)
|
||||
// isn't installed but the OpenCL ICD is. Live used/free aren't exposed
|
||||
// via standard CL_ properties; we synthesise them by attributing
|
||||
// per-process VRAM allocations from the kernel DRM fdinfo interface
|
||||
// to each clinfo-reported GPU via the shared PCI BDF.
|
||||
func getCLInfoGPUMemory() []GPUMemoryInfo {
|
||||
gpus := clinfoOnce()
|
||||
if len(gpus) == 0 {
|
||||
return nil
|
||||
}
|
||||
usage := drmFdInfoUsageByBDF()
|
||||
for i := range gpus {
|
||||
gpus[i] = applyDRMUsage(gpus[i], usage[gpus[i].BDF])
|
||||
}
|
||||
return gpus
|
||||
}
|
||||
|
||||
// applyDRMUsage stamps live VRAM accounting onto a GPUMemoryInfo
|
||||
// whose TotalVRAM came from a static source (e.g. clinfo). Caller
|
||||
// already populated TotalVRAM and FreeVRAM=TotalVRAM as defaults; if
|
||||
// DRM accounting reports usage, we trust it and rederive free/percent.
|
||||
func applyDRMUsage(g GPUMemoryInfo, used uint64) GPUMemoryInfo {
|
||||
if used == 0 || g.TotalVRAM == 0 {
|
||||
return g
|
||||
}
|
||||
if used > g.TotalVRAM {
|
||||
// Process-private DRM total can momentarily exceed device
|
||||
// VRAM (over-commit via host memory mirror). Clamp so the UI
|
||||
// doesn't display absurd percentages.
|
||||
used = g.TotalVRAM
|
||||
}
|
||||
g.UsedVRAM = used
|
||||
g.FreeVRAM = g.TotalVRAM - used
|
||||
g.UsagePercent = float64(used) / float64(g.TotalVRAM) * 100
|
||||
return g
|
||||
}
|
||||
|
||||
// parseCLInfoJSON returns one GPUMemoryInfo per discrete GPU. UMA
|
||||
// devices (iGPU/APU) are dropped because their "VRAM" is system RAM
|
||||
// and would double-count against the capability gate. When the same
|
||||
// physical device is enumerated by multiple ICDs (Intel OpenCL + POCL,
|
||||
// for example), the BDF dedup keeps the largest reported size — some
|
||||
// ICDs cap at 4 GiB for legacy alloc-size compatibility.
|
||||
func parseCLInfoJSON(raw []byte) []GPUMemoryInfo {
|
||||
var out clinfoOutput
|
||||
if err := json.Unmarshal(raw, &out); err != nil {
|
||||
xlog.Debug("clinfo: failed to parse --json output", "error", err)
|
||||
return nil
|
||||
}
|
||||
|
||||
byBDF := map[string]GPUMemoryInfo{}
|
||||
var noBDF []GPUMemoryInfo
|
||||
|
||||
for _, plat := range out.Devices {
|
||||
for _, d := range plat.Online {
|
||||
if !d.Type.isGPU() || d.HostUnifiedMemory || d.GlobalMemSize == 0 {
|
||||
continue
|
||||
}
|
||||
bdf := clinfoBDF(d)
|
||||
info := GPUMemoryInfo{
|
||||
Name: strings.TrimSpace(d.Name),
|
||||
Vendor: clinfoVendor(d.VendorID, d.Vendor),
|
||||
BDF: bdf,
|
||||
TotalVRAM: d.GlobalMemSize,
|
||||
FreeVRAM: d.GlobalMemSize,
|
||||
}
|
||||
if bdf == "" {
|
||||
noBDF = append(noBDF, info)
|
||||
continue
|
||||
}
|
||||
if existing, ok := byBDF[bdf]; !ok || info.TotalVRAM > existing.TotalVRAM {
|
||||
byBDF[bdf] = info
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
all := make([]GPUMemoryInfo, 0, len(byBDF)+len(noBDF))
|
||||
for _, g := range byBDF {
|
||||
all = append(all, g)
|
||||
}
|
||||
all = append(all, noBDF...)
|
||||
for i := range all {
|
||||
all[i].Index = i
|
||||
}
|
||||
return all
|
||||
}
|
||||
|
||||
func clinfoVendor(vendorID uint32, name string) string {
|
||||
switch vendorID {
|
||||
case 0x10de:
|
||||
return VendorNVIDIA
|
||||
case 0x1002, 0x1022: // 0x1022 is the AMD CPU vendor ID, also reported by some APU OpenCL devices.
|
||||
return VendorAMD
|
||||
case 0x8086:
|
||||
return VendorIntel
|
||||
case 0x106B:
|
||||
return VendorApple
|
||||
}
|
||||
n := strings.ToLower(name)
|
||||
switch {
|
||||
case strings.Contains(n, "nvidia"):
|
||||
return VendorNVIDIA
|
||||
case strings.Contains(n, "advanced micro devices"), strings.Contains(n, "amd"):
|
||||
return VendorAMD
|
||||
case strings.Contains(n, "intel"):
|
||||
return VendorIntel
|
||||
case strings.Contains(n, "apple"):
|
||||
return VendorApple
|
||||
}
|
||||
return VendorUnknown
|
||||
}
|
||||
|
||||
// clinfoBDF returns the device's canonical `dddd:bb:dd.f` PCI address,
|
||||
// or "" when no PCI location is reported. The KHR form is `"PCI-E,
|
||||
// 0000:01:00.0"` on NVIDIA and bare `"0000:01:00.0"` on most others.
|
||||
func clinfoBDF(d clinfoDevice) string {
|
||||
if d.PCIBusInfoKHR != "" {
|
||||
s := d.PCIBusInfoKHR
|
||||
if i := strings.LastIndex(s, " "); i >= 0 {
|
||||
s = s[i+1:]
|
||||
}
|
||||
if c := strings.Count(s, ":"); c == 1 || c == 2 {
|
||||
return normalizeBDF(s)
|
||||
}
|
||||
}
|
||||
// NVIDIA pre-KHR per-axis fields. An all-zero result is
|
||||
// indistinguishable from "fields absent", but no GPU sits at
|
||||
// 0000:00:00.0 so the false negative is harmless.
|
||||
if d.PCIBusNV != 0 || d.PCISlotNV != 0 || d.PCIDomainNV != 0 {
|
||||
return fmt.Sprintf("%04x:%02x:%02x.0", d.PCIDomainNV, d.PCIBusNV, d.PCISlotNV)
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// normalizeBDF lower-cases a PCI address and, when it contains exactly
// one colon (i.e. the domain is missing), prefixes the default domain
// "0000:". Any other input is only lower-cased.
func normalizeBDF(s string) string {
	canonical := s
	if strings.Count(s, ":") == 1 {
		canonical = "0000:" + s
	}
	return strings.ToLower(canonical)
}
|
||||
191
pkg/xsysinfo/clinfo_test.go
Normal file
191
pkg/xsysinfo/clinfo_test.go
Normal file
@@ -0,0 +1,191 @@
|
||||
package xsysinfo
|
||||
|
||||
import (
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
const nvidiaRTX5070TiJSON = `{
|
||||
"devices": [
|
||||
{
|
||||
"online": [
|
||||
{
|
||||
"CL_DEVICE_NAME": "NVIDIA GeForce RTX 5070 Ti",
|
||||
"CL_DEVICE_VENDOR": "NVIDIA Corporation",
|
||||
"CL_DEVICE_VENDOR_ID": 4318,
|
||||
"CL_DEVICE_TYPE": {"raw": 4, "type": ["CL_DEVICE_TYPE_GPU"]},
|
||||
"CL_DEVICE_HOST_UNIFIED_MEMORY": false,
|
||||
"CL_DEVICE_GLOBAL_MEM_SIZE": 16609378304,
|
||||
"CL_DEVICE_PCI_BUS_INFO_KHR": "PCI-E, 0000:01:00.0",
|
||||
"CL_DEVICE_PCI_BUS_ID_NV": 1,
|
||||
"CL_DEVICE_PCI_SLOT_ID_NV": 0,
|
||||
"CL_DEVICE_PCI_DOMAIN_ID_NV": 0
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}`
|
||||
|
||||
// intelArcPlusIGPUJSON exercises the HOST_UNIFIED_MEMORY=true filter:
|
||||
// the iGPU sibling on the same Intel platform must be dropped to
|
||||
// avoid double-counting system RAM as VRAM.
|
||||
const intelArcPlusIGPUJSON = `{
|
||||
"devices": [
|
||||
{
|
||||
"online": [
|
||||
{
|
||||
"CL_DEVICE_NAME": "Intel(R) Arc(TM) A770 Graphics",
|
||||
"CL_DEVICE_VENDOR": "Intel(R) Corporation",
|
||||
"CL_DEVICE_VENDOR_ID": 32902,
|
||||
"CL_DEVICE_TYPE": {"raw": 4, "type": ["CL_DEVICE_TYPE_GPU"]},
|
||||
"CL_DEVICE_HOST_UNIFIED_MEMORY": false,
|
||||
"CL_DEVICE_GLOBAL_MEM_SIZE": 16225243136,
|
||||
"CL_DEVICE_PCI_BUS_INFO_KHR": "0000:03:00.0"
|
||||
},
|
||||
{
|
||||
"CL_DEVICE_NAME": "Intel(R) UHD Graphics 770",
|
||||
"CL_DEVICE_VENDOR": "Intel(R) Corporation",
|
||||
"CL_DEVICE_VENDOR_ID": 32902,
|
||||
"CL_DEVICE_TYPE": {"raw": 4, "type": ["CL_DEVICE_TYPE_GPU"]},
|
||||
"CL_DEVICE_HOST_UNIFIED_MEMORY": true,
|
||||
"CL_DEVICE_GLOBAL_MEM_SIZE": 25000000000,
|
||||
"CL_DEVICE_PCI_BUS_INFO_KHR": "0000:00:02.0"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}`
|
||||
|
||||
// dualICDSameDeviceJSON exercises BDF dedup when two ICDs enumerate
|
||||
// the same physical device with different reported sizes (POCL caps
|
||||
// at 4 GiB for legacy alloc-size compatibility).
|
||||
const dualICDSameDeviceJSON = `{
|
||||
"devices": [
|
||||
{
|
||||
"online": [
|
||||
{
|
||||
"CL_DEVICE_NAME": "Intel(R) Arc(TM) A770 Graphics",
|
||||
"CL_DEVICE_VENDOR_ID": 32902,
|
||||
"CL_DEVICE_TYPE": {"raw": 4, "type": ["CL_DEVICE_TYPE_GPU"]},
|
||||
"CL_DEVICE_HOST_UNIFIED_MEMORY": false,
|
||||
"CL_DEVICE_GLOBAL_MEM_SIZE": 16225243136,
|
||||
"CL_DEVICE_PCI_BUS_INFO_KHR": "0000:03:00.0"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"online": [
|
||||
{
|
||||
"CL_DEVICE_NAME": "pthread-Arc-A770",
|
||||
"CL_DEVICE_VENDOR_ID": 32902,
|
||||
"CL_DEVICE_TYPE": {"raw": 4, "type": ["CL_DEVICE_TYPE_GPU"]},
|
||||
"CL_DEVICE_HOST_UNIFIED_MEMORY": false,
|
||||
"CL_DEVICE_GLOBAL_MEM_SIZE": 4294967296,
|
||||
"CL_DEVICE_PCI_BUS_INFO_KHR": "0000:03:00.0"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}`
|
||||
|
||||
// cpuOnlyJSON: a POCL-only host. Filtered by CL_DEVICE_TYPE — without
|
||||
// this guard CPU memory would be mistakenly reported as VRAM.
|
||||
const cpuOnlyJSON = `{
|
||||
"devices": [
|
||||
{
|
||||
"online": [
|
||||
{
|
||||
"CL_DEVICE_NAME": "pthread-x86_64",
|
||||
"CL_DEVICE_VENDOR": "GenuineIntel",
|
||||
"CL_DEVICE_VENDOR_ID": 32902,
|
||||
"CL_DEVICE_TYPE": {"raw": 2, "type": ["CL_DEVICE_TYPE_CPU"]},
|
||||
"CL_DEVICE_HOST_UNIFIED_MEMORY": true,
|
||||
"CL_DEVICE_GLOBAL_MEM_SIZE": 33324494848
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}`
|
||||
|
||||
// noBDFJSON: an ICD that reports no PCI fields at all. Device is
|
||||
// still counted but doesn't participate in dedup.
|
||||
const noBDFJSON = `{
|
||||
"devices": [
|
||||
{
|
||||
"online": [
|
||||
{
|
||||
"CL_DEVICE_NAME": "Some Accelerator GPU",
|
||||
"CL_DEVICE_VENDOR_ID": 0,
|
||||
"CL_DEVICE_TYPE": {"raw": 4, "type": ["CL_DEVICE_TYPE_GPU"]},
|
||||
"CL_DEVICE_HOST_UNIFIED_MEMORY": false,
|
||||
"CL_DEVICE_GLOBAL_MEM_SIZE": 8589934592
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}`
|
||||
|
||||
var _ = Describe("parseCLInfoJSON", func() {
|
||||
DescribeTable("classifies and dedups clinfo devices",
|
||||
func(input string, wantCount int, want []GPUMemoryInfo) {
|
||||
got := parseCLInfoJSON([]byte(input))
|
||||
Expect(got).To(HaveLen(wantCount))
|
||||
for i, w := range want {
|
||||
Expect(got[i].Name).To(Equal(w.Name))
|
||||
Expect(got[i].Vendor).To(Equal(w.Vendor))
|
||||
Expect(got[i].TotalVRAM).To(Equal(w.TotalVRAM))
|
||||
}
|
||||
},
|
||||
Entry("empty object returns nothing", `{}`, 0, nil),
|
||||
Entry("malformed JSON returns nothing without panicking", `{not valid`, 0, nil),
|
||||
Entry("CPU-only platform is filtered out", cpuOnlyJSON, 0, nil),
|
||||
Entry("NVIDIA dGPU is recognised by vendor ID and BDF",
|
||||
nvidiaRTX5070TiJSON, 1, []GPUMemoryInfo{{
|
||||
Name: "NVIDIA GeForce RTX 5070 Ti",
|
||||
Vendor: VendorNVIDIA,
|
||||
TotalVRAM: 16609378304,
|
||||
}}),
|
||||
Entry("Intel Arc with iGPU sibling: iGPU dropped, Arc reported",
|
||||
intelArcPlusIGPUJSON, 1, []GPUMemoryInfo{{
|
||||
Name: "Intel(R) Arc(TM) A770 Graphics",
|
||||
Vendor: VendorIntel,
|
||||
TotalVRAM: 16225243136,
|
||||
}}),
|
||||
Entry("Dual ICD enumerating same Arc: deduped, larger size wins",
|
||||
dualICDSameDeviceJSON, 1, []GPUMemoryInfo{{
|
||||
Name: "Intel(R) Arc(TM) A770 Graphics",
|
||||
Vendor: VendorIntel,
|
||||
TotalVRAM: 16225243136, // not the POCL 4 GiB cap
|
||||
}}),
|
||||
Entry("Device without PCI info is still counted",
|
||||
noBDFJSON, 1, []GPUMemoryInfo{{
|
||||
Name: "Some Accelerator GPU",
|
||||
Vendor: VendorUnknown,
|
||||
TotalVRAM: 8589934592,
|
||||
}}),
|
||||
)
|
||||
})
|
||||
|
||||
var _ = Describe("normalizeBDF", func() {
|
||||
DescribeTable("canonicalises PCI bus addresses",
|
||||
func(in, want string) {
|
||||
Expect(normalizeBDF(in)).To(Equal(want))
|
||||
},
|
||||
Entry("already canonical", "0000:03:00.0", "0000:03:00.0"),
|
||||
Entry("missing domain", "03:00.0", "0000:03:00.0"),
|
||||
Entry("uppercase hex", "AB:CD.0", "0000:ab:cd.0"),
|
||||
)
|
||||
})
|
||||
|
||||
var _ = Describe("clinfoBDF", func() {
|
||||
It("synthesises a canonical BDF from NVIDIA pre-KHR integer fields", func() {
|
||||
// Older NVIDIA OpenCL exposes BDF via three integer fields instead
|
||||
// of the KHR string; the synthesised result must be canonical.
|
||||
d := clinfoDevice{
|
||||
PCIBusNV: 1,
|
||||
PCISlotNV: 0,
|
||||
PCIDomainNV: 0,
|
||||
}
|
||||
Expect(clinfoBDF(d)).To(Equal("0000:01:00.0"))
|
||||
})
|
||||
})
|
||||
147
pkg/xsysinfo/drmfdinfo.go
Normal file
147
pkg/xsysinfo/drmfdinfo.go
Normal file
@@ -0,0 +1,147 @@
|
||||
package xsysinfo
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// drmFdInfoUsageByBDF walks /proc/<pid>/fdinfo/<fd> for every fd that
|
||||
// points at /dev/dri/render* and aggregates per-GPU VRAM allocations.
|
||||
// Keyed by the PCI BDF (dddd:bb:dd.f) of the render node so callers
|
||||
// can match against any GPU detection result.
|
||||
//
|
||||
// The kernel exposes per-process DRM accounting via standardised
|
||||
// fdinfo keys (Documentation/gpu/drm-usage-stats.rst, kernel ≥5.19):
|
||||
//
|
||||
// drm-total-<region>: bytes the process has bound to <region>
|
||||
// drm-resident-<region>: bytes currently resident in <region>
|
||||
//
|
||||
// Region names are driver-defined: i915 uses "local*" for device-local
|
||||
// VRAM, amdgpu and xe use "vram*". We sum any region whose name
|
||||
// starts with "local" or "vram"; "system*" / "gtt*" / "stolen-*" are
|
||||
// excluded since they're host RAM mirrors.
|
||||
//
|
||||
// Returns an empty map when no process holds a DRM render fd or the
|
||||
// kernel doesn't emit the accounting keys (older kernels, exotic
|
||||
// drivers). The walker is read-only and survives unreadable proc
|
||||
// entries (other users' processes, transient PIDs).
|
||||
func drmFdInfoUsageByBDF() map[string]uint64 {
|
||||
byRender := drmFdInfoUsageByRenderNode()
|
||||
if len(byRender) == 0 {
|
||||
return nil
|
||||
}
|
||||
out := make(map[string]uint64, len(byRender))
|
||||
for name, used := range byRender {
|
||||
bdf := renderNodeBDF(name)
|
||||
if bdf == "" {
|
||||
continue
|
||||
}
|
||||
out[bdf] += used
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func drmFdInfoUsageByRenderNode() map[string]uint64 {
|
||||
procs, _ := filepath.Glob("/proc/[0-9]*/fd")
|
||||
if len(procs) == 0 {
|
||||
return nil
|
||||
}
|
||||
out := map[string]uint64{}
|
||||
for _, fdDir := range procs {
|
||||
pidDir := filepath.Dir(fdDir)
|
||||
entries, err := os.ReadDir(fdDir)
|
||||
if err != nil {
|
||||
// /proc race: process exited or unreadable. Skip silently.
|
||||
continue
|
||||
}
|
||||
for _, entry := range entries {
|
||||
target, err := os.Readlink(filepath.Join(fdDir, entry.Name()))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
const renderPrefix = "/dev/dri/render"
|
||||
if !strings.HasPrefix(target, renderPrefix) {
|
||||
continue
|
||||
}
|
||||
renderName := strings.TrimPrefix(target, "/dev/dri/")
|
||||
data, err := os.ReadFile(filepath.Join(pidDir, "fdinfo", entry.Name()))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
out[renderName] += parseDRMFdInfoVRAM(data)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// parseDRMFdInfoVRAM sums `drm-total-<region>` bytes across all VRAM
|
||||
// regions in a single fdinfo blob. Values are formatted as
|
||||
// "<number> <KiB|MiB|GiB>" or bare bytes; both are accepted.
|
||||
func parseDRMFdInfoVRAM(data []byte) uint64 {
|
||||
var total uint64
|
||||
sc := bufio.NewScanner(bytes.NewReader(data))
|
||||
for sc.Scan() {
|
||||
line := sc.Text()
|
||||
const prefix = "drm-total-"
|
||||
if !strings.HasPrefix(line, prefix) {
|
||||
continue
|
||||
}
|
||||
key, value, ok := strings.Cut(line, ":")
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
region := strings.TrimPrefix(key, prefix)
|
||||
if !isVRAMRegion(region) {
|
||||
continue
|
||||
}
|
||||
total += parseDRMFdInfoBytes(value)
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
||||
// isVRAMRegion reports whether a DRM memory-region name begins with
// "local" or "vram". Names that merely contain those words later on
// (such as "stolen-local0") do not match.
func isVRAMRegion(region string) bool {
	for _, devicePrefix := range [...]string{"local", "vram"} {
		if strings.HasPrefix(region, devicePrefix) {
			return true
		}
	}
	return false
}
|
||||
|
||||
// parseDRMFdInfoBytes converts the value part of an fdinfo line into a
// byte count. The first whitespace-separated field must be an unsigned
// decimal number; an optional second field selects a multiplier
// (case-insensitive "KiB", "MiB" or "GiB"). Any other unit, or none,
// leaves the number as-is. Empty or non-numeric input yields 0.
func parseDRMFdInfoBytes(value string) uint64 {
	parts := strings.Fields(value)
	if len(parts) == 0 {
		return 0
	}
	count, err := strconv.ParseUint(parts[0], 10, 64)
	if err != nil {
		return 0
	}

	scale := uint64(1)
	if len(parts) >= 2 {
		switch strings.ToLower(parts[1]) {
		case "kib":
			scale = 1 << 10
		case "mib":
			scale = 1 << 20
		case "gib":
			scale = 1 << 30
		}
	}
	return count * scale
}
|
||||
|
||||
// renderNodeBDF maps a DRM render-node basename (e.g. "renderD129") to
// a lower-cased PCI BDF by reading the /sys/class/drm/<name>/device
// symlink and taking the last path element of its target. It returns
// "" when the link cannot be read or when that element does not have
// the dddd:bb:dd.f shape (exactly two ':' and one '.').
func renderNodeBDF(name string) string {
	target, err := os.Readlink("/sys/class/drm/" + name + "/device")
	if err != nil {
		return ""
	}

	addr := filepath.Base(target)
	looksLikeBDF := strings.Count(addr, ":") == 2 && strings.Count(addr, ".") == 1
	if !looksLikeBDF {
		return ""
	}
	return strings.ToLower(addr)
}
|
||||
142
pkg/xsysinfo/drmfdinfo_test.go
Normal file
142
pkg/xsysinfo/drmfdinfo_test.go
Normal file
@@ -0,0 +1,142 @@
|
||||
package xsysinfo
|
||||
|
||||
import (
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
// i915FdInfo is a captured /proc/<pid>/fdinfo/<fd> from a llama-cpp
|
||||
// process holding an Intel Arc render-node fd. "local0" is i915's
|
||||
// device-local VRAM region; system0 is host-visible buffer mirror.
|
||||
const i915FdInfo = `pos: 0
|
||||
flags: 02100002
|
||||
mnt_id: 16
|
||||
ino: 1234
|
||||
drm-driver: i915
|
||||
drm-client-id: 42
|
||||
drm-pdev: 0000:03:00.0
|
||||
drm-total-system0: 312 KiB
|
||||
drm-resident-system0: 312 KiB
|
||||
drm-total-local0: 5396348 KiB
|
||||
drm-resident-local0: 5396348 KiB
|
||||
drm-total-stolen-local0: 0
|
||||
drm-resident-stolen-local0: 0
|
||||
drm-engine-render: 1234567 ns
|
||||
drm-engine-copy: 2345 ns
|
||||
drm-engine-video: 0 ns
|
||||
drm-engine-capacity-video: 2
|
||||
`
|
||||
|
||||
// amdgpuFdInfo mirrors the i915 schema with AMD's region names. amdgpu
|
||||
// uses "vram0" for device-local and "gtt0" for host-pinned memory.
|
||||
const amdgpuFdInfo = `pos: 0
|
||||
flags: 02100002
|
||||
mnt_id: 16
|
||||
drm-driver: amdgpu
|
||||
drm-pdev: 0000:0a:00.0
|
||||
drm-total-vram0: 8589934592 B
|
||||
drm-resident-vram0: 8589934592 B
|
||||
drm-total-gtt0: 1048576 B
|
||||
drm-resident-gtt0: 1048576 B
|
||||
drm-engine-gfx: 123456 ns
|
||||
`
|
||||
|
||||
// systemOnlyFdInfo: a DRM client that only allocates host buffers
|
||||
// (CPU-only fallback, GUI compositor, etc.). VRAM total must be 0.
|
||||
const systemOnlyFdInfo = `drm-driver: i915
|
||||
drm-total-system0: 16384 KiB
|
||||
drm-resident-system0: 16384 KiB
|
||||
drm-total-local0: 0
|
||||
`
|
||||
|
||||
// noDRMFdInfo: regular file fd (e.g. socket, pipe). Parser must return
|
||||
// 0 without panicking.
|
||||
const noDRMFdInfo = `pos: 0
|
||||
flags: 02100002
|
||||
mnt_id: 16
|
||||
ino: 5678
|
||||
`
|
||||
|
||||
// bareBytesFdInfo: older kernels emit byte counts without a unit
|
||||
// suffix. Must be parsed as raw bytes, not multiplied by 1024.
|
||||
const bareBytesFdInfo = `drm-driver: xe
|
||||
drm-total-vram0: 17179869184
|
||||
drm-resident-vram0: 17179869184
|
||||
`
|
||||
|
||||
var _ = Describe("parseDRMFdInfoVRAM", func() {
|
||||
DescribeTable("extracts device-local VRAM totals from fdinfo",
|
||||
func(input string, want uint64) {
|
||||
Expect(parseDRMFdInfoVRAM([]byte(input))).To(Equal(want))
|
||||
},
|
||||
Entry("empty input", "", uint64(0)),
|
||||
Entry("non-DRM fdinfo", noDRMFdInfo, uint64(0)),
|
||||
Entry("system-only client reports 0 VRAM", systemOnlyFdInfo, uint64(0)),
|
||||
Entry("i915 local0 in KiB", i915FdInfo, uint64(5396348*1024)),
|
||||
Entry("amdgpu vram0 in bytes", amdgpuFdInfo, uint64(8589934592)),
|
||||
Entry("xe vram0 bare bytes", bareBytesFdInfo, uint64(17179869184)),
|
||||
)
|
||||
})
|
||||
|
||||
var _ = Describe("parseDRMFdInfoBytes", func() {
|
||||
DescribeTable("parses sizes with and without unit suffixes",
|
||||
func(in string, want uint64) {
|
||||
Expect(parseDRMFdInfoBytes(in)).To(Equal(want))
|
||||
},
|
||||
Entry("bare bytes", "\t1024", uint64(1024)),
|
||||
Entry("KiB", "\t1024 KiB", uint64(1024*1024)),
|
||||
Entry("MiB", "\t512 MiB", uint64(512*1024*1024)),
|
||||
Entry("GiB", "\t2 GiB", uint64(2*1024*1024*1024)),
|
||||
Entry("unrecognised unit falls through to raw bytes", "\t1024 B", uint64(1024)),
|
||||
Entry("empty", "", uint64(0)),
|
||||
Entry("not a number", "\tnotanumber KiB", uint64(0)),
|
||||
)
|
||||
})
|
||||
|
||||
var _ = Describe("isVRAMRegion", func() {
|
||||
DescribeTable("recognises device-local regions",
|
||||
func(region string, want bool) {
|
||||
Expect(isVRAMRegion(region)).To(Equal(want))
|
||||
},
|
||||
Entry("local0", "local0", true),
|
||||
Entry("local1", "local1", true),
|
||||
Entry("vram0", "vram0", true),
|
||||
Entry("vram1", "vram1", true),
|
||||
Entry("system0", "system0", false),
|
||||
Entry("gtt0", "gtt0", false),
|
||||
Entry("stolen-local0", "stolen-local0", false),
|
||||
Entry("stolen-system0", "stolen-system0", false),
|
||||
Entry("cpu", "cpu", false),
|
||||
)
|
||||
})
|
||||
|
||||
var _ = Describe("applyDRMUsage", func() {
|
||||
const total = uint64(16225243136)
|
||||
base := GPUMemoryInfo{Name: "Arc A770", TotalVRAM: total, FreeVRAM: total}
|
||||
|
||||
It("leaves defaults untouched when there is no usage", func() {
|
||||
got := applyDRMUsage(base, 0)
|
||||
Expect(got.UsedVRAM).To(Equal(uint64(0)))
|
||||
Expect(got.FreeVRAM).To(Equal(total))
|
||||
Expect(got.UsagePercent).To(Equal(float64(0)))
|
||||
})
|
||||
|
||||
It("rederives free and percent from usage", func() {
|
||||
used := uint64(5_396_348 * 1024)
|
||||
got := applyDRMUsage(base, used)
|
||||
Expect(got.UsedVRAM).To(Equal(used))
|
||||
Expect(got.FreeVRAM).To(Equal(total - used))
|
||||
Expect(got.UsagePercent).To(Equal(float64(used) / float64(total) * 100))
|
||||
})
|
||||
|
||||
It("clamps over-commit to total", func() {
|
||||
got := applyDRMUsage(base, total*2)
|
||||
Expect(got.UsedVRAM).To(Equal(total))
|
||||
Expect(got.FreeVRAM).To(Equal(uint64(0)))
|
||||
})
|
||||
|
||||
It("guards against div-by-zero on zero total", func() {
|
||||
got := applyDRMUsage(GPUMemoryInfo{}, 1024)
|
||||
Expect(got.UsagePercent).To(Equal(float64(0)))
|
||||
})
|
||||
})
|
||||
@@ -41,6 +41,13 @@ type GPUMemoryInfo struct {
|
||||
Index int `json:"index"`
|
||||
Name string `json:"name"`
|
||||
Vendor string `json:"vendor"`
|
||||
// BDF is the canonical PCI bus address (dddd:bb:dd.f) when known.
|
||||
// Populated by detection paths that can attribute the device to a
|
||||
// PCI location (clinfo, future amdgpu/nvidia paths); empty for
|
||||
// non-PCI devices (Apple, integrated SoCs) or detection paths
|
||||
// that don't surface it (nvidia-smi --query-gpu doesn't include
|
||||
// pci.bus_id by default).
|
||||
BDF string `json:"bdf,omitempty"`
|
||||
TotalVRAM uint64 `json:"total_vram"` // Total VRAM in bytes
|
||||
UsedVRAM uint64 `json:"used_vram"` // Used VRAM in bytes
|
||||
FreeVRAM uint64 `json:"free_vram"` // Free VRAM in bytes
|
||||
@@ -515,16 +522,48 @@ func getAMDGPUMemory() []GPUMemoryInfo {
|
||||
return gpus
|
||||
}
|
||||
|
||||
// getIntelGPUMemory queries Intel GPUs using xpu-smi or intel_gpu_top
|
||||
// getIntelGPUMemory queries Intel GPUs via xpu-smi, intel_gpu_top, or
|
||||
// clinfo (in that order). xpu-smi is the canonical Intel tool but
|
||||
// requires the separate xpumanager package; clinfo ships with the
|
||||
// OpenCL ICD loader and is present in most oneAPI base images, so it
|
||||
// serves as the last-resort fallback.
|
||||
func getIntelGPUMemory() []GPUMemoryInfo {
|
||||
// Try xpu-smi first (Intel's official GPU management tool)
|
||||
gpus := getIntelXPUSMI()
|
||||
if len(gpus) > 0 {
|
||||
if gpus := getIntelXPUSMI(); len(gpus) > 0 {
|
||||
return gpus
|
||||
}
|
||||
if gpus := getIntelGPUTop(); len(gpus) > 0 {
|
||||
return gpus
|
||||
}
|
||||
// clinfo enumerates every OpenCL platform, so guard the
|
||||
// subprocess with the cached ghw GPU list: non-Intel hosts skip
|
||||
// it entirely.
|
||||
if !hasGHWVendor(VendorIntel) {
|
||||
return nil
|
||||
}
|
||||
var out []GPUMemoryInfo
|
||||
for _, g := range getCLInfoGPUMemory() {
|
||||
if g.Vendor == VendorIntel {
|
||||
out = append(out, g)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// Fallback to intel_gpu_top
|
||||
return getIntelGPUTop()
|
||||
// hasGHWVendor reports whether ghw observed any GPU whose vendor name
|
||||
// matches (case-insensitive substring). Uses the package-level cache
|
||||
// in GPUs() so the call is free after the first invocation.
|
||||
func hasGHWVendor(vendor string) bool {
|
||||
gpus, _ := GPUs()
|
||||
target := strings.ToUpper(vendor)
|
||||
for _, g := range gpus {
|
||||
if g.DeviceInfo == nil || g.DeviceInfo.Vendor == nil {
|
||||
continue
|
||||
}
|
||||
if strings.Contains(strings.ToUpper(g.DeviceInfo.Vendor.Name), target) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// getIntelXPUSMI queries Intel GPUs using xpu-smi
|
||||
|
||||
13
pkg/xsysinfo/xsysinfo_suite_test.go
Normal file
13
pkg/xsysinfo/xsysinfo_suite_test.go
Normal file
@@ -0,0 +1,13 @@
|
||||
package xsysinfo
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
// TestXsysinfo is the `go test` entry point for this package's Ginkgo
// specs: it registers Ginkgo's Fail as the Gomega failure handler and
// then runs the specs under the suite name given below.
func TestXsysinfo(t *testing.T) {
|
||||
RegisterFailHandler(Fail)
|
||||
RunSpecs(t, "xsysinfo test suite")
|
||||
}
|
||||
Reference in New Issue
Block a user