feat(config): node-aware hardware defaults — larger physical batch on Blackwell

A larger physical batch (n_batch/n_ubatch) materially lifts MoE prefill on NVIDIA Blackwell consumer GPUs (sm_120/121, incl. GB10 / DGX Spark) — measured on a GB10 with Qwen3-Coder-30B-A3B, the prefill ceiling rises (ub512 ~2994 -> ub2048 ~3316 t/s) and saturates around 2048. The heuristic lives in core/config alongside the other config overriders (ApplyInferenceDefaults, guessDefaultsFromFile/NGPULayers) — they all fill the ModelConfig from heuristics, so hardware tuning is the same domain and stays in one place. It is parameterized on a GPU descriptor (not direct detection) so it works in both deployment shapes: - Single host: SetDefaults applies it with the LocalGPU. - Distributed: only the worker sees the GPU, so the worker reports its compute capability on registration (gpu_compute_capability -> BackendNode), and the router re-applies the SAME core/config heuristic for the SELECTED node before loading — fixing the case where the frontend has no GPU at all. Explicit `batch:` always wins (only managed default values are touched). xsysinfo gains NVIDIAComputeCapability() (detection only); all interpretation lives in core/config. Tests: core/config, pkg/xsysinfo, core/services/nodes. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-20 06:39:01 -04:00 · 2026-06-19 22:02:14 +00:00
parent 079ac0e15a
commit bca250e2bd
10 changed files with 390 additions and 32 deletions
--- a/pkg/xsysinfo/computecap_internal_test.go
+++ b/pkg/xsysinfo/computecap_internal_test.go
@@ -0,0 +1,23 @@
+package xsysinfo
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("parseComputeCap", func() {
+	DescribeTable("splits major.minor",
+		func(in string, maj, min int) {
+			m, n := parseComputeCap(in)
+			Expect(m).To(Equal(maj))
+			Expect(n).To(Equal(min))
+		},
+		Entry("GB10 / DGX Spark", "12.1", 12, 1),
+		Entry("RTX 50-series", "12.0", 12, 0),
+		Entry("Hopper", "9.0", 9, 0),
+		Entry("major only", "12", 12, 0),
+		Entry("whitespace", " 12.1 ", 12, 1),
+		Entry("empty", "", -1, -1),
+		Entry("garbage", "abc", -1, -1),
+	)
+})
--- a/pkg/xsysinfo/gpu.go
+++ b/pkg/xsysinfo/gpu.go
@@ -38,9 +38,9 @@ var UnifiedMemoryDevices = []string{

 // GPUMemoryInfo contains real-time GPU memory usage information
 type GPUMemoryInfo struct {
-	Index        int     `json:"index"`
-	Name         string  `json:"name"`
-	Vendor       string  `json:"vendor"`
+	Index  int    `json:"index"`
+	Name   string `json:"name"`
+	Vendor string `json:"vendor"`
 	// BDF is the canonical PCI bus address (dddd:bb:dd.f) when known.
 	// Populated by detection paths that can attribute the device to a
 	// PCI location (clinfo, future amdgpu/nvidia paths); empty for
@@ -307,6 +307,84 @@ func GetGPUAggregateInfo() GPUAggregateInfo {
 	return aggregate
 }

+var (
+	computeCapOnce   sync.Once
+	computeCapResult string
+)
+
+// NVIDIAComputeCapability returns the highest NVIDIA GPU compute capability on
+// this host as a "major.minor" string (e.g. "12.1" for GB10 / DGX Spark), or ""
+// when nvidia-smi is unavailable or reports none. Detected once and cached.
+//
+// This runs where the GPU actually is. In distributed mode it is reported by
+// each worker on registration so the router can make per-node decisions rather
+// than guessing from the (possibly GPU-less) frontend host.
+func NVIDIAComputeCapability() string {
+	computeCapOnce.Do(func() {
+		computeCapResult = detectNVIDIAComputeCapability()
+	})
+	return computeCapResult
+}
+
+func detectNVIDIAComputeCapability() string {
+	if _, err := exec.LookPath("nvidia-smi"); err != nil {
+		return ""
+	}
+
+	cmd := exec.Command("nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader")
+
+	var stdout, stderr bytes.Buffer
+	cmd.Stdout = &stdout
+	cmd.Stderr = &stderr
+
+	if err := cmd.Run(); err != nil {
+		xlog.Debug("nvidia-smi compute_cap query failed", "error", err, "stderr", stderr.String())
+		return ""
+	}
+
+	best := ""
+	bestMajor, bestMinor := -1, -1
+	for _, line := range strings.Split(strings.TrimSpace(stdout.String()), "\n") {
+		line = strings.TrimSpace(line)
+		if line == "" {
+			continue
+		}
+		maj, min := parseComputeCap(line)
+		if maj < 0 {
+			continue
+		}
+		if maj > bestMajor || (maj == bestMajor && min > bestMinor) {
+			bestMajor, bestMinor, best = maj, min, line
+		}
+	}
+	if best != "" {
+		xlog.Debug("NVIDIA compute capability detected", "compute_cap", best)
+	}
+	return best
+}
+
+// parseComputeCap splits a "major.minor" compute-capability string into its
+// integer parts. Returns (-1, -1) if it can't be parsed.
+func parseComputeCap(cc string) (int, int) {
+	cc = strings.TrimSpace(cc)
+	if cc == "" {
+		return -1, -1
+	}
+	majStr, minStr := cc, "0"
+	if dot := strings.IndexByte(cc, '.'); dot >= 0 {
+		majStr, minStr = cc[:dot], cc[dot+1:]
+	}
+	maj, err := strconv.Atoi(strings.TrimSpace(majStr))
+	if err != nil {
+		return -1, -1
+	}
+	min, err := strconv.Atoi(strings.TrimSpace(minStr))
+	if err != nil {
+		min = 0
+	}
+	return maj, min
+}
+
 // getNVIDIAGPUMemory queries NVIDIA GPUs using nvidia-smi
 func getNVIDIAGPUMemory() []GPUMemoryInfo {
 	// Check if nvidia-smi is available
@@ -866,12 +944,12 @@ func getVulkanGPUMemory() []GPUMemoryInfo {
 }

 type vulkanGPUTextInfo struct {
-	index        int
-	name         string
-	deviceType   string
-	totalVRAM    uint64
-	budgetVRAM   uint64
-	usageVRAM    uint64
+	index      int
+	name       string
+	deviceType string
+	totalVRAM  uint64
+	budgetVRAM uint64
+	usageVRAM  uint64
 }

 func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo {
@@ -909,7 +987,7 @@ func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo {
 		} else if current.usageVRAM != 0 && current.budgetVRAM == 0 {
 			current.budgetVRAM = current.totalVRAM - current.usageVRAM
 		} else if current.usageVRAM == 0 && current.budgetVRAM == 0 {
-			current.usageVRAM  = 0
+			current.usageVRAM = 0
 			current.budgetVRAM = current.totalVRAM
 		}