LocalAI/pkg/xsysinfo/clinfo.go

package xsysinfo

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"os/exec"
	"strings"
	"sync"
	"time"

	"github.com/mudler/xlog"
)

const (
	// clDeviceTypeGPU is the string isGPU looks for in a device's
	// CL_DEVICE_TYPE "type" list to classify it as a GPU.
	clDeviceTypeGPU = "CL_DEVICE_TYPE_GPU"
	// clinfoTimeout is the deadline runCLInfo places on the
	// `clinfo --json` subprocess.
	clinfoTimeout   = 2 * time.Second
)

// clinfoOutput is the subset of `clinfo --json` we read. clinfo emits
// one entry under "devices" per platform, in the same order as
// "platforms"; live devices are under "online".
//
// Only the "online" list of each "devices" entry is decoded here; every
// other key in the document has no matching field and is therefore
// skipped by encoding/json.
type clinfoOutput struct {
	Devices []struct {
		Online []clinfoDevice `json:"online"`
	} `json:"devices"`
}

// clinfoDevice holds the per-device properties decoded from one entry
// of an "online" list. How each field is consumed in this file:
//
//   - Name is whitespace-trimmed and becomes the reported GPU name.
//   - VendorID is matched against known vendor IDs by clinfoVendor;
//     Vendor is the free-text fallback when the ID is not recognised.
//   - Type decides whether the device counts as a GPU (see isGPU).
//   - HostUnifiedMemory, when true, causes the device to be dropped.
//   - GlobalMemSize is copied verbatim into TotalVRAM/FreeVRAM; a zero
//     value causes the device to be dropped.
//   - PCIBusInfoKHR is the preferred source of the PCI address; the
//     three *NV fields are only used when it yields nothing usable.
type clinfoDevice struct {
	Name              string         `json:"CL_DEVICE_NAME"`
	Vendor            string         `json:"CL_DEVICE_VENDOR"`
	VendorID          uint32         `json:"CL_DEVICE_VENDOR_ID"`
	Type              clinfoTypeProp `json:"CL_DEVICE_TYPE"`
	HostUnifiedMemory bool           `json:"CL_DEVICE_HOST_UNIFIED_MEMORY"`
	GlobalMemSize     uint64         `json:"CL_DEVICE_GLOBAL_MEM_SIZE"`
	PCIBusInfoKHR     string         `json:"CL_DEVICE_PCI_BUS_INFO_KHR"`
	PCIDomainNV       int            `json:"CL_DEVICE_PCI_DOMAIN_ID_NV"`
	PCIBusNV          int            `json:"CL_DEVICE_PCI_BUS_ID_NV"`
	PCISlotNV         int            `json:"CL_DEVICE_PCI_SLOT_ID_NV"`
}

// clinfoTypeProp is the decoded CL_DEVICE_TYPE property: an object
// carrying a numeric "raw" value and a "type" array of type-name
// strings.
//
// clinfoTypeProp matches against the type-string array rather than
// CL_DEVICE_TYPE.raw so a future CL_DEVICE_TYPE_CUSTOM can't sneak
// past as a GPU. Raw is decoded for completeness; isGPU does not
// consult it.
type clinfoTypeProp struct {
	Raw  uint32   `json:"raw"`
	Type []string `json:"type"`
}

// isGPU reports whether any entry of the decoded type-name list is
// exactly clDeviceTypeGPU. An empty or missing list yields false.
func (t clinfoTypeProp) isGPU() bool {
	for i := range t.Type {
		if t.Type[i] != clDeviceTypeGPU {
			continue
		}
		return true
	}
	return false
}

// clinfoOnce caches the result for the process lifetime. GPU hardware
// doesn't change between calls and the subprocess is ~150 ms.
//
// Because sync.OnceValue runs runCLInfo exactly once, a nil result
// (clinfo missing, failing, or timing out) is cached as well and is
// never retried. Every call returns the same slice value, so writes
// made through it are visible to all later callers.
var clinfoOnce = sync.OnceValue(runCLInfo)

// runCLInfo executes `clinfo --json` and hands its standard output to
// parseCLInfoJSON. It returns nil without logging when no clinfo
// binary is found on PATH, and nil with a debug log (including the
// captured stderr) when the command fails or exceeds clinfoTimeout.
func runCLInfo() []GPUMemoryInfo {
	_, lookErr := exec.LookPath("clinfo")
	if lookErr != nil {
		return nil
	}

	// The context deadline kills the subprocess if it hangs.
	ctx, cancel := context.WithTimeout(context.Background(), clinfoTimeout)
	defer cancel()

	var outBuf, errBuf bytes.Buffer
	proc := exec.CommandContext(ctx, "clinfo", "--json")
	proc.Stdout, proc.Stderr = &outBuf, &errBuf

	runErr := proc.Run()
	if runErr == nil {
		return parseCLInfoJSON(outBuf.Bytes())
	}
	xlog.Debug("clinfo failed", "error", runErr, "stderr", errBuf.String())
	return nil
}

// getCLInfoGPUMemory is a best-effort fallback for hosts where the
// vendor's own management binary (nvidia-smi / xpu-smi / rocm-smi)
// isn't installed but the OpenCL ICD is. Live used/free aren't exposed
// via standard CL_ properties; we synthesise them by attributing
// per-process VRAM allocations from the kernel DRM fdinfo interface
// to each clinfo-reported GPU via the shared PCI BDF.
//
// It returns nil when clinfo reported no usable GPU. Otherwise it
// returns a freshly allocated slice on every call: the cached clinfo
// entries are treated as a read-only baseline (FreeVRAM == TotalVRAM,
// nothing used) and the current DRM usage is layered onto a copy.
func getCLInfoGPUMemory() []GPUMemoryInfo {
	cached := clinfoOnce()
	if len(cached) == 0 {
		return nil
	}
	usage := drmFdInfoUsageByBDF()

	// Never write into the slice owned by clinfoOnce. applyDRMUsage
	// returns its input untouched when usage is zero, so stamping
	// results back into the cache would make a GPU that was busy once
	// keep reporting that stale usage after it went idle. Writing to
	// the shared backing array would also race between concurrent
	// callers.
	gpus := make([]GPUMemoryInfo, len(cached))
	for i, g := range cached {
		gpus[i] = applyDRMUsage(g, usage[g.BDF])
	}
	return gpus
}

// applyDRMUsage returns a copy of g with live VRAM accounting filled
// in from used. The caller is expected to have set TotalVRAM from a
// static source (e.g. clinfo) with FreeVRAM defaulting to TotalVRAM.
//
// When used is zero or TotalVRAM is zero, g is returned unchanged.
// Otherwise UsedVRAM, FreeVRAM and UsagePercent are rederived from
// used, which is first capped at TotalVRAM.
func applyDRMUsage(g GPUMemoryInfo, used uint64) GPUMemoryInfo {
	switch {
	case used == 0, g.TotalVRAM == 0:
		// Nothing to account for, or nothing to account against.
		return g
	case used > g.TotalVRAM:
		// Process-private DRM totals can briefly exceed device VRAM
		// (over-commit via host memory mirror); cap them so the UI
		// never shows more than 100%.
		used = g.TotalVRAM
	}

	g.UsedVRAM = used
	g.FreeVRAM = g.TotalVRAM - used
	g.UsagePercent = float64(used) / float64(g.TotalVRAM) * 100
	return g
}

// parseCLInfoJSON returns one GPUMemoryInfo per discrete GPU. UMA
// devices (iGPU/APU) are dropped because their "VRAM" is system RAM
// and would double-count against the capability gate. When the same
// physical device is enumerated by multiple ICDs (Intel OpenCL + POCL,
// for example), the BDF dedup keeps the largest reported size — some
// ICDs cap at 4 GiB for legacy alloc-size compatibility.
//
// Ordering is deterministic: devices with a PCI address come first, in
// the order clinfo first listed each address, followed by devices
// without one in listing order. Index is the position in the returned
// slice. Each entry starts with FreeVRAM equal to TotalVRAM.
//
// It returns nil when raw is not valid JSON for clinfoOutput, and an
// empty non-nil slice when no device qualifies.
func parseCLInfoJSON(raw []byte) []GPUMemoryInfo {
	var out clinfoOutput
	if err := json.Unmarshal(raw, &out); err != nil {
		xlog.Debug("clinfo: failed to parse --json output", "error", err)
		return nil
	}

	// slot maps a BDF to its position in withBDF. Collecting into a
	// slice (instead of ranging over a map at the end) keeps the
	// output order, and therefore Index, stable across runs.
	slot := map[string]int{}
	var withBDF, noBDF []GPUMemoryInfo

	for _, plat := range out.Devices {
		for _, d := range plat.Online {
			if !d.Type.isGPU() || d.HostUnifiedMemory || d.GlobalMemSize == 0 {
				continue
			}
			bdf := clinfoBDF(d)
			info := GPUMemoryInfo{
				Name:      strings.TrimSpace(d.Name),
				Vendor:    clinfoVendor(d.VendorID, d.Vendor),
				BDF:       bdf,
				TotalVRAM: d.GlobalMemSize,
				FreeVRAM:  d.GlobalMemSize,
			}
			if bdf == "" {
				// Without a PCI address there is nothing to dedup on.
				noBDF = append(noBDF, info)
				continue
			}
			pos, seen := slot[bdf]
			if !seen {
				slot[bdf] = len(withBDF)
				withBDF = append(withBDF, info)
				continue
			}
			// Same physical device seen through another ICD: keep
			// whichever reports the larger memory size.
			if info.TotalVRAM > withBDF[pos].TotalVRAM {
				withBDF[pos] = info
			}
		}
	}

	all := make([]GPUMemoryInfo, 0, len(withBDF)+len(noBDF))
	all = append(all, withBDF...)
	all = append(all, noBDF...)
	for i := range all {
		all[i].Index = i
	}
	return all
}

// clinfoVendor maps an OpenCL device to one of the package's Vendor*
// constants. The numeric vendorID is tried first; if it is not one of
// the known IDs, name is lower-cased and searched for a vendor
// substring. VendorUnknown is returned when neither matches.
func clinfoVendor(vendorID uint32, name string) string {
	switch {
	case vendorID == 0x10de:
		return VendorNVIDIA
	case vendorID == 0x1002 || vendorID == 0x1022:
		// 0x1022 is the AMD CPU vendor ID, also reported by some APU
		// OpenCL devices.
		return VendorAMD
	case vendorID == 0x8086:
		return VendorIntel
	case vendorID == 0x106B:
		return VendorApple
	}

	// Fall back to the vendor string. Entries are checked in order and
	// the first substring hit wins.
	hints := []struct {
		needle string
		vendor string
	}{
		{"nvidia", VendorNVIDIA},
		{"advanced micro devices", VendorAMD},
		{"amd", VendorAMD},
		{"intel", VendorIntel},
		{"apple", VendorApple},
	}
	lowered := strings.ToLower(name)
	for _, h := range hints {
		if strings.Contains(lowered, h.needle) {
			return h.vendor
		}
	}
	return VendorUnknown
}

// clinfoBDF derives the device's canonical `dddd:bb:dd.f` PCI address,
// returning "" when no PCI location is reported.
//
// The KHR property is preferred. It appears as `"PCI-E, 0000:01:00.0"`
// on NVIDIA and as a bare `"0000:01:00.0"` on most others, so only the
// text after the last space is kept, and it is accepted only if it
// contains one or two colons. Otherwise the NVIDIA per-axis fields are
// formatted with a fixed function number of 0.
func clinfoBDF(d clinfoDevice) string {
	if khr := d.PCIBusInfoKHR; khr != "" {
		// LastIndex yields -1 when there is no space, which makes the
		// slice start at 0 and keeps the whole string.
		addr := khr[strings.LastIndex(khr, " ")+1:]
		switch strings.Count(addr, ":") {
		case 1, 2:
			return normalizeBDF(addr)
		}
	}

	// NVIDIA pre-KHR per-axis fields. All zeros cannot be told apart
	// from "fields absent"; no GPU sits at 0000:00:00.0, so treating
	// that case as "no address" is harmless.
	if d.PCIDomainNV == 0 && d.PCIBusNV == 0 && d.PCISlotNV == 0 {
		return ""
	}
	return fmt.Sprintf("%04x:%02x:%02x.0", d.PCIDomainNV, d.PCIBusNV, d.PCISlotNV)
}

// normalizeBDF lower-cases a PCI address and, when it contains exactly
// one colon (i.e. the domain is missing), prefixes the default domain
// "0000:". Any other input is only lower-cased.
func normalizeBDF(s string) string {
	addr := s
	if strings.Count(addr, ":") == 1 {
		addr = "0000:" + addr
	}
	return strings.ToLower(addr)
}