diff --git a/pkg/xsysinfo/clinfo.go b/pkg/xsysinfo/clinfo.go new file mode 100644 index 000000000..9d8694b47 --- /dev/null +++ b/pkg/xsysinfo/clinfo.go @@ -0,0 +1,221 @@ +package xsysinfo + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "os/exec" + "strings" + "sync" + "time" + + "github.com/mudler/xlog" +) + +const ( + clDeviceTypeGPU = "CL_DEVICE_TYPE_GPU" + clinfoTimeout = 2 * time.Second +) + +// clinfoOutput is the subset of `clinfo --json` we read. clinfo emits +// one entry under "devices" per platform, in the same order as +// "platforms"; live devices are under "online". +type clinfoOutput struct { + Devices []struct { + Online []clinfoDevice `json:"online"` + } `json:"devices"` +} + +type clinfoDevice struct { + Name string `json:"CL_DEVICE_NAME"` + Vendor string `json:"CL_DEVICE_VENDOR"` + VendorID uint32 `json:"CL_DEVICE_VENDOR_ID"` + Type clinfoTypeProp `json:"CL_DEVICE_TYPE"` + HostUnifiedMemory bool `json:"CL_DEVICE_HOST_UNIFIED_MEMORY"` + GlobalMemSize uint64 `json:"CL_DEVICE_GLOBAL_MEM_SIZE"` + PCIBusInfoKHR string `json:"CL_DEVICE_PCI_BUS_INFO_KHR"` + PCIDomainNV int `json:"CL_DEVICE_PCI_DOMAIN_ID_NV"` + PCIBusNV int `json:"CL_DEVICE_PCI_BUS_ID_NV"` + PCISlotNV int `json:"CL_DEVICE_PCI_SLOT_ID_NV"` +} + +// clinfoTypeProp matches against the type-string array rather than +// CL_DEVICE_TYPE.raw so a future CL_DEVICE_TYPE_CUSTOM can't sneak +// past as a GPU. +type clinfoTypeProp struct { + Raw uint32 `json:"raw"` + Type []string `json:"type"` +} + +func (t clinfoTypeProp) isGPU() bool { + for _, s := range t.Type { + if s == clDeviceTypeGPU { + return true + } + } + return false +} + +// clinfoOnce caches the result for the process lifetime. GPU hardware +// doesn't change between calls and the subprocess is ~150 ms. 
+var clinfoOnce = sync.OnceValue(runCLInfo) + +func runCLInfo() []GPUMemoryInfo { + if _, err := exec.LookPath("clinfo"); err != nil { + return nil + } + ctx, cancel := context.WithTimeout(context.Background(), clinfoTimeout) + defer cancel() + cmd := exec.CommandContext(ctx, "clinfo", "--json") + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + if err := cmd.Run(); err != nil { + xlog.Debug("clinfo failed", "error", err, "stderr", stderr.String()) + return nil + } + return parseCLInfoJSON(stdout.Bytes()) +} + +// getCLInfoGPUMemory is a best-effort fallback for hosts where the +// vendor's own management binary (nvidia-smi / xpu-smi / rocm-smi) +// isn't installed but the OpenCL ICD is. Live used/free aren't exposed +// via standard CL_ properties; we synthesise them by attributing +// per-process VRAM allocations from the kernel DRM fdinfo interface +// to each clinfo-reported GPU via the shared PCI BDF. +func getCLInfoGPUMemory() []GPUMemoryInfo { + gpus := clinfoOnce() + if len(gpus) == 0 { + return nil + } + usage := drmFdInfoUsageByBDF() + for i := range gpus { + gpus[i] = applyDRMUsage(gpus[i], usage[gpus[i].BDF]) + } + return gpus +} + +// applyDRMUsage stamps live VRAM accounting onto a GPUMemoryInfo +// whose TotalVRAM came from a static source (e.g. clinfo). Caller +// already populated TotalVRAM and FreeVRAM=TotalVRAM as defaults; if +// DRM accounting reports usage, we trust it and rederive free/percent. +func applyDRMUsage(g GPUMemoryInfo, used uint64) GPUMemoryInfo { + if used == 0 || g.TotalVRAM == 0 { + return g + } + if used > g.TotalVRAM { + // Process-private DRM total can momentarily exceed device + // VRAM (over-commit via host memory mirror). Clamp so the UI + // doesn't display absurd percentages. 
+ used = g.TotalVRAM + } + g.UsedVRAM = used + g.FreeVRAM = g.TotalVRAM - used + g.UsagePercent = float64(used) / float64(g.TotalVRAM) * 100 + return g +} + +// parseCLInfoJSON returns one GPUMemoryInfo per discrete GPU. UMA +// devices (iGPU/APU) are dropped because their "VRAM" is system RAM +// and would double-count against the capability gate. When the same +// physical device is enumerated by multiple ICDs (Intel OpenCL + POCL, +// for example), the BDF dedup keeps the largest reported size — some +// ICDs cap at 4 GiB for legacy alloc-size compatibility. +func parseCLInfoJSON(raw []byte) []GPUMemoryInfo { + var out clinfoOutput + if err := json.Unmarshal(raw, &out); err != nil { + xlog.Debug("clinfo: failed to parse --json output", "error", err) + return nil + } + + byBDF := map[string]GPUMemoryInfo{} + var noBDF []GPUMemoryInfo + + for _, plat := range out.Devices { + for _, d := range plat.Online { + if !d.Type.isGPU() || d.HostUnifiedMemory || d.GlobalMemSize == 0 { + continue + } + bdf := clinfoBDF(d) + info := GPUMemoryInfo{ + Name: strings.TrimSpace(d.Name), + Vendor: clinfoVendor(d.VendorID, d.Vendor), + BDF: bdf, + TotalVRAM: d.GlobalMemSize, + FreeVRAM: d.GlobalMemSize, + } + if bdf == "" { + noBDF = append(noBDF, info) + continue + } + if existing, ok := byBDF[bdf]; !ok || info.TotalVRAM > existing.TotalVRAM { + byBDF[bdf] = info + } + } + } + + all := make([]GPUMemoryInfo, 0, len(byBDF)+len(noBDF)) + for _, g := range byBDF { + all = append(all, g) + } + all = append(all, noBDF...) + for i := range all { + all[i].Index = i + } + return all +} + +func clinfoVendor(vendorID uint32, name string) string { + switch vendorID { + case 0x10de: + return VendorNVIDIA + case 0x1002, 0x1022: // 0x1022 is the AMD CPU vendor ID, also reported by some APU OpenCL devices. 
+ return VendorAMD + case 0x8086: + return VendorIntel + case 0x106B: + return VendorApple + } + n := strings.ToLower(name) + switch { + case strings.Contains(n, "nvidia"): + return VendorNVIDIA + case strings.Contains(n, "advanced micro devices"), strings.Contains(n, "amd"): + return VendorAMD + case strings.Contains(n, "intel"): + return VendorIntel + case strings.Contains(n, "apple"): + return VendorApple + } + return VendorUnknown +} + +// clinfoBDF returns the device's canonical `dddd:bb:dd.f` PCI address, +// or "" when no PCI location is reported. The KHR form is `"PCI-E, +// 0000:01:00.0"` on NVIDIA and bare `"0000:01:00.0"` on most others. +func clinfoBDF(d clinfoDevice) string { + if d.PCIBusInfoKHR != "" { + s := d.PCIBusInfoKHR + if i := strings.LastIndex(s, " "); i >= 0 { + s = s[i+1:] + } + if c := strings.Count(s, ":"); c == 1 || c == 2 { + return normalizeBDF(s) + } + } + // NVIDIA pre-KHR per-axis fields. An all-zero result is + // indistinguishable from "fields absent", but no GPU sits at + // 0000:00:00.0 so the false negative is harmless. + if d.PCIBusNV != 0 || d.PCISlotNV != 0 || d.PCIDomainNV != 0 { + return fmt.Sprintf("%04x:%02x:%02x.0", d.PCIDomainNV, d.PCIBusNV, d.PCISlotNV) + } + return "" +} + +func normalizeBDF(s string) string { + if strings.Count(s, ":") == 1 { + return strings.ToLower("0000:" + s) + } + return strings.ToLower(s) +} diff --git a/pkg/xsysinfo/clinfo_test.go b/pkg/xsysinfo/clinfo_test.go new file mode 100644 index 000000000..c5dea34de --- /dev/null +++ b/pkg/xsysinfo/clinfo_test.go @@ -0,0 +1,191 @@ +package xsysinfo + +import ( + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +const nvidiaRTX5070TiJSON = `{ + "devices": [ + { + "online": [ + { + "CL_DEVICE_NAME": "NVIDIA GeForce RTX 5070 Ti", + "CL_DEVICE_VENDOR": "NVIDIA Corporation", + "CL_DEVICE_VENDOR_ID": 4318, + "CL_DEVICE_TYPE": {"raw": 4, "type": ["CL_DEVICE_TYPE_GPU"]}, + "CL_DEVICE_HOST_UNIFIED_MEMORY": false, + "CL_DEVICE_GLOBAL_MEM_SIZE": 16609378304, + "CL_DEVICE_PCI_BUS_INFO_KHR": "PCI-E, 0000:01:00.0", + "CL_DEVICE_PCI_BUS_ID_NV": 1, + "CL_DEVICE_PCI_SLOT_ID_NV": 0, + "CL_DEVICE_PCI_DOMAIN_ID_NV": 0 + } + ] + } + ] +}` + +// intelArcPlusIGPUJSON exercises the HOST_UNIFIED_MEMORY=true filter: +// the iGPU sibling on the same Intel platform must be dropped to +// avoid double-counting system RAM as VRAM. +const intelArcPlusIGPUJSON = `{ + "devices": [ + { + "online": [ + { + "CL_DEVICE_NAME": "Intel(R) Arc(TM) A770 Graphics", + "CL_DEVICE_VENDOR": "Intel(R) Corporation", + "CL_DEVICE_VENDOR_ID": 32902, + "CL_DEVICE_TYPE": {"raw": 4, "type": ["CL_DEVICE_TYPE_GPU"]}, + "CL_DEVICE_HOST_UNIFIED_MEMORY": false, + "CL_DEVICE_GLOBAL_MEM_SIZE": 16225243136, + "CL_DEVICE_PCI_BUS_INFO_KHR": "0000:03:00.0" + }, + { + "CL_DEVICE_NAME": "Intel(R) UHD Graphics 770", + "CL_DEVICE_VENDOR": "Intel(R) Corporation", + "CL_DEVICE_VENDOR_ID": 32902, + "CL_DEVICE_TYPE": {"raw": 4, "type": ["CL_DEVICE_TYPE_GPU"]}, + "CL_DEVICE_HOST_UNIFIED_MEMORY": true, + "CL_DEVICE_GLOBAL_MEM_SIZE": 25000000000, + "CL_DEVICE_PCI_BUS_INFO_KHR": "0000:00:02.0" + } + ] + } + ] +}` + +// dualICDSameDeviceJSON exercises BDF dedup when two ICDs enumerate +// the same physical device with different reported sizes (POCL caps +// at 4 GiB for legacy alloc-size compatibility). 
+const dualICDSameDeviceJSON = `{ + "devices": [ + { + "online": [ + { + "CL_DEVICE_NAME": "Intel(R) Arc(TM) A770 Graphics", + "CL_DEVICE_VENDOR_ID": 32902, + "CL_DEVICE_TYPE": {"raw": 4, "type": ["CL_DEVICE_TYPE_GPU"]}, + "CL_DEVICE_HOST_UNIFIED_MEMORY": false, + "CL_DEVICE_GLOBAL_MEM_SIZE": 16225243136, + "CL_DEVICE_PCI_BUS_INFO_KHR": "0000:03:00.0" + } + ] + }, + { + "online": [ + { + "CL_DEVICE_NAME": "pthread-Arc-A770", + "CL_DEVICE_VENDOR_ID": 32902, + "CL_DEVICE_TYPE": {"raw": 4, "type": ["CL_DEVICE_TYPE_GPU"]}, + "CL_DEVICE_HOST_UNIFIED_MEMORY": false, + "CL_DEVICE_GLOBAL_MEM_SIZE": 4294967296, + "CL_DEVICE_PCI_BUS_INFO_KHR": "0000:03:00.0" + } + ] + } + ] +}` + +// cpuOnlyJSON: a POCL-only host. Filtered by CL_DEVICE_TYPE — without +// this guard CPU memory would be mistakenly reported as VRAM. +const cpuOnlyJSON = `{ + "devices": [ + { + "online": [ + { + "CL_DEVICE_NAME": "pthread-x86_64", + "CL_DEVICE_VENDOR": "GenuineIntel", + "CL_DEVICE_VENDOR_ID": 32902, + "CL_DEVICE_TYPE": {"raw": 2, "type": ["CL_DEVICE_TYPE_CPU"]}, + "CL_DEVICE_HOST_UNIFIED_MEMORY": true, + "CL_DEVICE_GLOBAL_MEM_SIZE": 33324494848 + } + ] + } + ] +}` + +// noBDFJSON: an ICD that reports no PCI fields at all. Device is +// still counted but doesn't participate in dedup. 
+const noBDFJSON = `{ + "devices": [ + { + "online": [ + { + "CL_DEVICE_NAME": "Some Accelerator GPU", + "CL_DEVICE_VENDOR_ID": 0, + "CL_DEVICE_TYPE": {"raw": 4, "type": ["CL_DEVICE_TYPE_GPU"]}, + "CL_DEVICE_HOST_UNIFIED_MEMORY": false, + "CL_DEVICE_GLOBAL_MEM_SIZE": 8589934592 + } + ] + } + ] +}` + +var _ = Describe("parseCLInfoJSON", func() { + DescribeTable("classifies and dedups clinfo devices", + func(input string, wantCount int, want []GPUMemoryInfo) { + got := parseCLInfoJSON([]byte(input)) + Expect(got).To(HaveLen(wantCount)) + for i, w := range want { + Expect(got[i].Name).To(Equal(w.Name)) + Expect(got[i].Vendor).To(Equal(w.Vendor)) + Expect(got[i].TotalVRAM).To(Equal(w.TotalVRAM)) + } + }, + Entry("empty object returns nothing", `{}`, 0, nil), + Entry("malformed JSON returns nothing without panicking", `{not valid`, 0, nil), + Entry("CPU-only platform is filtered out", cpuOnlyJSON, 0, nil), + Entry("NVIDIA dGPU is recognised by vendor ID and BDF", + nvidiaRTX5070TiJSON, 1, []GPUMemoryInfo{{ + Name: "NVIDIA GeForce RTX 5070 Ti", + Vendor: VendorNVIDIA, + TotalVRAM: 16609378304, + }}), + Entry("Intel Arc with iGPU sibling: iGPU dropped, Arc reported", + intelArcPlusIGPUJSON, 1, []GPUMemoryInfo{{ + Name: "Intel(R) Arc(TM) A770 Graphics", + Vendor: VendorIntel, + TotalVRAM: 16225243136, + }}), + Entry("Dual ICD enumerating same Arc: deduped, larger size wins", + dualICDSameDeviceJSON, 1, []GPUMemoryInfo{{ + Name: "Intel(R) Arc(TM) A770 Graphics", + Vendor: VendorIntel, + TotalVRAM: 16225243136, // not the POCL 4 GiB cap + }}), + Entry("Device without PCI info is still counted", + noBDFJSON, 1, []GPUMemoryInfo{{ + Name: "Some Accelerator GPU", + Vendor: VendorUnknown, + TotalVRAM: 8589934592, + }}), + ) +}) + +var _ = Describe("normalizeBDF", func() { + DescribeTable("canonicalises PCI bus addresses", + func(in, want string) { + Expect(normalizeBDF(in)).To(Equal(want)) + }, + Entry("already canonical", "0000:03:00.0", "0000:03:00.0"), + Entry("missing 
domain", "03:00.0", "0000:03:00.0"), + Entry("uppercase hex", "AB:CD.0", "0000:ab:cd.0"), + ) +}) + +var _ = Describe("clinfoBDF", func() { + It("synthesises a canonical BDF from NVIDIA pre-KHR integer fields", func() { + // Older NVIDIA OpenCL exposes BDF via three integer fields instead + // of the KHR string; the synthesised result must be canonical. + d := clinfoDevice{ + PCIBusNV: 1, + PCISlotNV: 0, + PCIDomainNV: 0, + } + Expect(clinfoBDF(d)).To(Equal("0000:01:00.0")) + }) +}) diff --git a/pkg/xsysinfo/drmfdinfo.go b/pkg/xsysinfo/drmfdinfo.go new file mode 100644 index 000000000..9f96d6e40 --- /dev/null +++ b/pkg/xsysinfo/drmfdinfo.go @@ -0,0 +1,147 @@ +package xsysinfo + +import ( + "bufio" + "bytes" + "os" + "path/filepath" + "strconv" + "strings" +) + +// drmFdInfoUsageByBDF walks /proc/<pid>/fdinfo/<fd> for every fd that +// points at /dev/dri/render* and aggregates per-GPU VRAM allocations. +// Keyed by the PCI BDF (dddd:bb:dd.f) of the render node so callers +// can match against any GPU detection result. +// +// The kernel exposes per-process DRM accounting via standardised +// fdinfo keys (Documentation/gpu/drm-usage-stats.rst, kernel ≥5.19): +// +// drm-total-<region>: bytes the process has bound to <region> +// drm-resident-<region>: bytes currently resident in <region> +// +// Region names are driver-defined: i915 uses "local*" for device-local +// VRAM, amdgpu and xe use "vram*". We sum any region whose name +// starts with "local" or "vram"; "system*" / "gtt*" / "stolen-*" are +// excluded since they're host RAM mirrors. +// +// Returns an empty map when no process holds a DRM render fd or the +// kernel doesn't emit the accounting keys (older kernels, exotic +// drivers). The walker is read-only and survives unreadable proc +// entries (other users' processes, transient PIDs). 
func drmFdInfoUsageByBDF() map[string]uint64 {
	byRender := drmFdInfoUsageByRenderNode()
	if len(byRender) == 0 {
		return nil
	}
	out := make(map[string]uint64, len(byRender))
	for name, used := range byRender {
		bdf := renderNodeBDF(name)
		if bdf == "" {
			// Not a PCI device, or sysfs was unreadable: nothing to key on.
			continue
		}
		out[bdf] += used
	}
	return out
}

// drmFdInfoUsageByRenderNode scans every /proc/<pid>/fd directory for
// descriptors whose link target starts with /dev/dri/render and sums
// the VRAM reported by the matching fdinfo file, keyed by render-node
// basename (e.g. "renderD128").
//
// Each DRM client is counted once per render node: descriptors that
// report the same drm-client-id (dup()'d or inherited fds) describe the
// same allocations, and summing them all would over-report usage.
//
// Returns nil when no /proc/<pid>/fd directory can be listed.
// Unreadable entries are skipped silently.
func drmFdInfoUsageByRenderNode() map[string]uint64 {
	const renderPrefix = "/dev/dri/render"

	// Glob only fails on a malformed pattern; this one is a constant.
	procs, _ := filepath.Glob("/proc/[0-9]*/fd")
	if len(procs) == 0 {
		return nil
	}
	out := map[string]uint64{}
	seen := map[string]struct{}{}
	for _, fdDir := range procs {
		pidDir := filepath.Dir(fdDir)
		entries, err := os.ReadDir(fdDir)
		if err != nil {
			// /proc race: process exited or unreadable. Skip silently.
			continue
		}
		for _, entry := range entries {
			target, err := os.Readlink(filepath.Join(fdDir, entry.Name()))
			if err != nil {
				continue
			}
			if !strings.HasPrefix(target, renderPrefix) {
				continue
			}
			renderName := strings.TrimPrefix(target, "/dev/dri/")
			data, err := os.ReadFile(filepath.Join(pidDir, "fdinfo", entry.Name()))
			if err != nil {
				continue
			}
			addDRMFdInfoUsage(out, seen, renderName, data)
		}
	}
	return out
}

// addDRMFdInfoUsage folds one fdinfo blob into usage[renderName].
// seen records the (render node, drm-client-id) pairs already counted;
// a blob whose pair is already present is ignored. A blob with no
// drm-client-id line cannot be de-duplicated and is always counted.
func addDRMFdInfoUsage(usage map[string]uint64, seen map[string]struct{}, renderName string, data []byte) {
	clientID, vram := parseDRMFdInfo(data)
	if clientID != "" {
		// Scope the id by render node: the kernel only promises that
		// client ids are unique per device.
		key := renderName + "/" + clientID
		if _, dup := seen[key]; dup {
			return
		}
		seen[key] = struct{}{}
	}
	usage[renderName] += vram
}

// parseDRMFdInfo makes a single pass over an fdinfo blob and returns
// the drm-client-id value ("" when the line is absent) together with
// the summed `drm-total-<region>` bytes of all VRAM regions.
func parseDRMFdInfo(data []byte) (clientID string, vram uint64) {
	const totalPrefix = "drm-total-"
	sc := bufio.NewScanner(bytes.NewReader(data))
	for sc.Scan() {
		key, value, ok := strings.Cut(sc.Text(), ":")
		if !ok {
			continue
		}
		switch {
		case key == "drm-client-id":
			clientID = strings.TrimSpace(value)
		case strings.HasPrefix(key, totalPrefix):
			if isVRAMRegion(strings.TrimPrefix(key, totalPrefix)) {
				vram += parseDRMFdInfoBytes(value)
			}
		}
	}
	// The only possible scanner error on an in-memory reader is an
	// over-long line; whatever was parsed before it is still returned.
	return clientID, vram
}

// parseDRMFdInfoVRAM sums `drm-total-<region>` bytes across all VRAM
// regions in a single fdinfo blob. Values are formatted as
// "<n> <unit>" or bare bytes; both are accepted.
func parseDRMFdInfoVRAM(data []byte) uint64 {
	_, vram := parseDRMFdInfo(data)
	return vram
}

// isVRAMRegion reports whether a region name denotes device-local
// memory, i.e. starts with "local" or "vram". Names such as
// "stolen-local0" do not match because the prefix must be at the start.
func isVRAMRegion(region string) bool {
	return strings.HasPrefix(region, "local") || strings.HasPrefix(region, "vram")
}

// parseDRMFdInfoBytes converts an fdinfo value such as "\t312 KiB" into
// bytes. The first whitespace-separated field must be a base-10
// unsigned integer, otherwise 0 is returned. The optional second field
// scales it when it is KiB, MiB or GiB (case-insensitive); a missing or
// unrecognised unit leaves the number as raw bytes.
func parseDRMFdInfoBytes(value string) uint64 {
	fields := strings.Fields(value)
	if len(fields) == 0 {
		return 0
	}
	n, err := strconv.ParseUint(fields[0], 10, 64)
	if err != nil {
		return 0
	}
	if len(fields) < 2 {
		return n
	}
	switch strings.ToLower(fields[1]) {
	case "kib":
		return n * 1024
	case "mib":
		return n * 1024 * 1024
	case "gib":
		return n * 1024 * 1024 * 1024
	}
	return n
}

// renderNodeBDF resolves a DRM render-node basename (e.g. "renderD129")
// to its underlying PCI BDF by following /sys/class/drm/<name>/device.
// Returns "" for non-PCI devices or symlink read errors.
func renderNodeBDF(name string) string {
	link, err := os.Readlink(filepath.Join("/sys/class/drm", name, "device"))
	if err != nil {
		return ""
	}
	base := filepath.Base(link)
	// Sanity-check: BDF format is dddd:bb:dd.f
	if strings.Count(base, ":") != 2 || strings.Count(base, ".") != 1 {
		return ""
	}
	return strings.ToLower(base)
}

// Patch metadata that shares this source line with the functions above,
// preserved as a comment so the start of the next file is not lost:
//
//	diff --git a/pkg/xsysinfo/drmfdinfo_test.go b/pkg/xsysinfo/drmfdinfo_test.go
//	new file mode 100644
//	index 000000000..2729d39b0
//	--- /dev/null
//	+++ b/pkg/xsysinfo/drmfdinfo_test.go
//	@@ -0,0 +1,142 @@
//	+package xsysinfo
//	+
//	+import (
//	+	. "github.com/onsi/ginkgo/v2"
//	+	. "github.com/onsi/gomega"
//	+)
//	+
//	+// i915FdInfo is a captured /proc/<pid>/fdinfo/<fd> from a llama-cpp
//	+// process holding an Intel Arc render-node fd.
"local0" is i915's +// device-local VRAM region; system0 is host-visible buffer mirror. +const i915FdInfo = `pos: 0 +flags: 02100002 +mnt_id: 16 +ino: 1234 +drm-driver: i915 +drm-client-id: 42 +drm-pdev: 0000:03:00.0 +drm-total-system0: 312 KiB +drm-resident-system0: 312 KiB +drm-total-local0: 5396348 KiB +drm-resident-local0: 5396348 KiB +drm-total-stolen-local0: 0 +drm-resident-stolen-local0: 0 +drm-engine-render: 1234567 ns +drm-engine-copy: 2345 ns +drm-engine-video: 0 ns +drm-engine-capacity-video: 2 +` + +// amdgpuFdInfo mirrors the i915 schema with AMD's region names. amdgpu +// uses "vram0" for device-local and "gtt0" for host-pinned memory. +const amdgpuFdInfo = `pos: 0 +flags: 02100002 +mnt_id: 16 +drm-driver: amdgpu +drm-pdev: 0000:0a:00.0 +drm-total-vram0: 8589934592 B +drm-resident-vram0: 8589934592 B +drm-total-gtt0: 1048576 B +drm-resident-gtt0: 1048576 B +drm-engine-gfx: 123456 ns +` + +// systemOnlyFdInfo: a DRM client that only allocates host buffers +// (CPU-only fallback, GUI compositor, etc.). VRAM total must be 0. +const systemOnlyFdInfo = `drm-driver: i915 +drm-total-system0: 16384 KiB +drm-resident-system0: 16384 KiB +drm-total-local0: 0 +` + +// noDRMFdInfo: regular file fd (e.g. socket, pipe). Parser must return +// 0 without panicking. +const noDRMFdInfo = `pos: 0 +flags: 02100002 +mnt_id: 16 +ino: 5678 +` + +// bareBytesFdInfo: older kernels emit byte counts without a unit +// suffix. Must be parsed as raw bytes, not multiplied by 1024. 
+const bareBytesFdInfo = `drm-driver: xe +drm-total-vram0: 17179869184 +drm-resident-vram0: 17179869184 +` + +var _ = Describe("parseDRMFdInfoVRAM", func() { + DescribeTable("extracts device-local VRAM totals from fdinfo", + func(input string, want uint64) { + Expect(parseDRMFdInfoVRAM([]byte(input))).To(Equal(want)) + }, + Entry("empty input", "", uint64(0)), + Entry("non-DRM fdinfo", noDRMFdInfo, uint64(0)), + Entry("system-only client reports 0 VRAM", systemOnlyFdInfo, uint64(0)), + Entry("i915 local0 in KiB", i915FdInfo, uint64(5396348*1024)), + Entry("amdgpu vram0 in bytes", amdgpuFdInfo, uint64(8589934592)), + Entry("xe vram0 bare bytes", bareBytesFdInfo, uint64(17179869184)), + ) +}) + +var _ = Describe("parseDRMFdInfoBytes", func() { + DescribeTable("parses sizes with and without unit suffixes", + func(in string, want uint64) { + Expect(parseDRMFdInfoBytes(in)).To(Equal(want)) + }, + Entry("bare bytes", "\t1024", uint64(1024)), + Entry("KiB", "\t1024 KiB", uint64(1024*1024)), + Entry("MiB", "\t512 MiB", uint64(512*1024*1024)), + Entry("GiB", "\t2 GiB", uint64(2*1024*1024*1024)), + Entry("unrecognised unit falls through to raw bytes", "\t1024 B", uint64(1024)), + Entry("empty", "", uint64(0)), + Entry("not a number", "\tnotanumber KiB", uint64(0)), + ) +}) + +var _ = Describe("isVRAMRegion", func() { + DescribeTable("recognises device-local regions", + func(region string, want bool) { + Expect(isVRAMRegion(region)).To(Equal(want)) + }, + Entry("local0", "local0", true), + Entry("local1", "local1", true), + Entry("vram0", "vram0", true), + Entry("vram1", "vram1", true), + Entry("system0", "system0", false), + Entry("gtt0", "gtt0", false), + Entry("stolen-local0", "stolen-local0", false), + Entry("stolen-system0", "stolen-system0", false), + Entry("cpu", "cpu", false), + ) +}) + +var _ = Describe("applyDRMUsage", func() { + const total = uint64(16225243136) + base := GPUMemoryInfo{Name: "Arc A770", TotalVRAM: total, FreeVRAM: total} + + It("leaves defaults 
untouched when there is no usage", func() { + got := applyDRMUsage(base, 0) + Expect(got.UsedVRAM).To(Equal(uint64(0))) + Expect(got.FreeVRAM).To(Equal(total)) + Expect(got.UsagePercent).To(Equal(float64(0))) + }) + + It("rederives free and percent from usage", func() { + used := uint64(5_396_348 * 1024) + got := applyDRMUsage(base, used) + Expect(got.UsedVRAM).To(Equal(used)) + Expect(got.FreeVRAM).To(Equal(total - used)) + Expect(got.UsagePercent).To(Equal(float64(used) / float64(total) * 100)) + }) + + It("clamps over-commit to total", func() { + got := applyDRMUsage(base, total*2) + Expect(got.UsedVRAM).To(Equal(total)) + Expect(got.FreeVRAM).To(Equal(uint64(0))) + }) + + It("guards against div-by-zero on zero total", func() { + got := applyDRMUsage(GPUMemoryInfo{}, 1024) + Expect(got.UsagePercent).To(Equal(float64(0))) + }) +}) diff --git a/pkg/xsysinfo/gpu.go b/pkg/xsysinfo/gpu.go index a1c98091e..a5575edb8 100644 --- a/pkg/xsysinfo/gpu.go +++ b/pkg/xsysinfo/gpu.go @@ -41,6 +41,13 @@ type GPUMemoryInfo struct { Index int `json:"index"` Name string `json:"name"` Vendor string `json:"vendor"` + // BDF is the canonical PCI bus address (dddd:bb:dd.f) when known. + // Populated by detection paths that can attribute the device to a + // PCI location (clinfo, future amdgpu/nvidia paths); empty for + // non-PCI devices (Apple, integrated SoCs) or detection paths + // that don't surface it (nvidia-smi --query-gpu doesn't include + // pci.bus_id by default). + BDF string `json:"bdf,omitempty"` TotalVRAM uint64 `json:"total_vram"` // Total VRAM in bytes UsedVRAM uint64 `json:"used_vram"` // Used VRAM in bytes FreeVRAM uint64 `json:"free_vram"` // Free VRAM in bytes @@ -515,16 +522,48 @@ func getAMDGPUMemory() []GPUMemoryInfo { return gpus } -// getIntelGPUMemory queries Intel GPUs using xpu-smi or intel_gpu_top +// getIntelGPUMemory queries Intel GPUs via xpu-smi, intel_gpu_top, or +// clinfo (in that order). 
xpu-smi is the canonical Intel tool but +// requires the separate xpumanager package; clinfo ships with the +// OpenCL ICD loader and is present in most oneAPI base images, so it +// serves as the last-resort fallback. func getIntelGPUMemory() []GPUMemoryInfo { - // Try xpu-smi first (Intel's official GPU management tool) - gpus := getIntelXPUSMI() - if len(gpus) > 0 { + if gpus := getIntelXPUSMI(); len(gpus) > 0 { return gpus } + if gpus := getIntelGPUTop(); len(gpus) > 0 { + return gpus + } + // clinfo enumerates every OpenCL platform, so guard the + // subprocess with the cached ghw GPU list: non-Intel hosts skip + // it entirely. + if !hasGHWVendor(VendorIntel) { + return nil + } + var out []GPUMemoryInfo + for _, g := range getCLInfoGPUMemory() { + if g.Vendor == VendorIntel { + out = append(out, g) + } + } + return out +} - // Fallback to intel_gpu_top - return getIntelGPUTop() +// hasGHWVendor reports whether ghw observed any GPU whose vendor name +// matches (case-insensitive substring). Uses the package-level cache +// in GPUs() so the call is free after the first invocation. +func hasGHWVendor(vendor string) bool { + gpus, _ := GPUs() + target := strings.ToUpper(vendor) + for _, g := range gpus { + if g.DeviceInfo == nil || g.DeviceInfo.Vendor == nil { + continue + } + if strings.Contains(strings.ToUpper(g.DeviceInfo.Vendor.Name), target) { + return true + } + } + return false } // getIntelXPUSMI queries Intel GPUs using xpu-smi diff --git a/pkg/xsysinfo/xsysinfo_suite_test.go b/pkg/xsysinfo/xsysinfo_suite_test.go new file mode 100644 index 000000000..f9e0d1b01 --- /dev/null +++ b/pkg/xsysinfo/xsysinfo_suite_test.go @@ -0,0 +1,13 @@ +package xsysinfo + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestXsysinfo(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "xsysinfo test suite") +}