feat(backend): auto-default physical batch to 2048 on Blackwell GPUs

On NVIDIA Blackwell consumer GPUs (sm_120/121, incl. GB10/DGX Spark) a larger physical batch (n_ubatch) materially lifts MoE prefill throughput - measured on a GB10 with Qwen3-30B-A3B to lift the prefill ceiling and saturate at ~2048. When a model config leaves `batch:` unset, EffectiveBatchSize now picks 2048 on Blackwell instead of 512; explicit `batch:` always overrides. Detection is a shared, cached Go helper (xsysinfo.IsNVIDIABlackwell, nvidia-smi compute_cap >= 12). Logic is isolated in core/backend/hardware_defaults.go and applied at the common ModelOptions builder, so it covers the C++ llama.cpp backend too. Measured (GB10, Qwen3-Coder-30B-A3B MXFP4): prefill ub512 2994 -> ub2048 3316 t/s; saturates past 2048. Also recorded in the DGX gap plan: 4-bit quant alone captures the decode win (Q4_K_M 93.5 >= MXFP4 86.4 t/s), MXFP4's only edge is prefill via Blackwell FP4 tensor cores. Tests: hardware_defaults_internal_test.go; existing NBatch specs pinned to the no-Blackwell branch for determinism. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-23 16:19:07 -04:00 · 2026-06-19 20:46:45 +00:00
parent 7aa61d4c32
commit aba0bfd24f
6 changed files with 191 additions and 13 deletions
--- a/pkg/xsysinfo/gpu.go
+++ b/pkg/xsysinfo/gpu.go
@@ -38,9 +38,9 @@ var UnifiedMemoryDevices = []string{

 // GPUMemoryInfo contains real-time GPU memory usage information
 type GPUMemoryInfo struct {
-	Index        int     `json:"index"`
-	Name         string  `json:"name"`
-	Vendor       string  `json:"vendor"`
+	Index  int    `json:"index"`
+	Name   string `json:"name"`
+	Vendor string `json:"vendor"`
 	// BDF is the canonical PCI bus address (dddd:bb:dd.f) when known.
 	// Populated by detection paths that can attribute the device to a
 	// PCI location (clinfo, future amdgpu/nvidia paths); empty for
@@ -307,6 +307,61 @@ func GetGPUAggregateInfo() GPUAggregateInfo {
 	return aggregate
 }

+var (
+	blackwellOnce   sync.Once // guards the one-time nvidia-smi probe
+	blackwellResult bool      // probe outcome; assigned only inside blackwellOnce.Do
+)
+
+// IsNVIDIABlackwell reports whether an NVIDIA Blackwell-class consumer GPU is
+// present: nvidia-smi must list a device whose compute capability major is
+// 12 or higher (sm_120 RTX 50-series, sm_121 GB10 / DGX Spark, or newer).
+// Detection runs once; the result, including a failed probe, is cached.
+//
+// Note: datacenter Blackwell (B100/B200/GB200, sm_100 / cc 10.0) reports a
+// lower compute capability and is intentionally NOT matched here — this
+// targets the sm_12x family where we measured the larger-physical-batch MoE
+// prefill win. Returns false if nvidia-smi is missing, fails, or finds none.
+func IsNVIDIABlackwell() bool {
+	blackwellOnce.Do(func() {
+		blackwellResult = detectNVIDIABlackwell()
+	})
+	return blackwellResult
+}
+
+func detectNVIDIABlackwell() bool {
+	if _, err := exec.LookPath("nvidia-smi"); err != nil {
+		return false // nvidia-smi is not on PATH: report "no Blackwell"
+	}
+
+	cmd := exec.Command("nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader")
+
+	var stdout, stderr bytes.Buffer
+	cmd.Stdout = &stdout
+	cmd.Stderr = &stderr
+
+	if err := cmd.Run(); err != nil {
+		xlog.Debug("nvidia-smi compute_cap query failed", "error", err, "stderr", stderr.String())
+		return false // query failure is only logged at debug level, never surfaced
+	}
+
+	for _, line := range strings.Split(strings.TrimSpace(stdout.String()), "\n") {
+		line = strings.TrimSpace(line)
+		if line == "" {
+			continue
+		}
+		// compute_cap looks like "12.1"; any major version >= 12 matches, so this is sm_12x or newer.
+		major := line
+		if dot := strings.IndexByte(line, '.'); dot >= 0 {
+			major = line[:dot]
+		}
+		if m, err := strconv.Atoi(major); err == nil && m >= 12 {
+			xlog.Debug("NVIDIA Blackwell-class GPU detected", "compute_cap", line)
+			return true
+		}
+	}
+	return false // no output line parsed to a major version >= 12
+}
+
 // getNVIDIAGPUMemory queries NVIDIA GPUs using nvidia-smi
 func getNVIDIAGPUMemory() []GPUMemoryInfo {
 	// Check if nvidia-smi is available
@@ -866,12 +921,12 @@ func getVulkanGPUMemory() []GPUMemoryInfo {
 }

 type vulkanGPUTextInfo struct {
-	index        int
-	name         string
-	deviceType   string
-	totalVRAM    uint64
-	budgetVRAM   uint64
-	usageVRAM    uint64
+	index      int
+	name       string
+	deviceType string
+	totalVRAM  uint64
+	budgetVRAM uint64
+	usageVRAM  uint64
 }

 func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo {
@@ -909,7 +964,7 @@ func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo {
 		} else if current.usageVRAM != 0 && current.budgetVRAM == 0 {
 			current.budgetVRAM = current.totalVRAM - current.usageVRAM
 		} else if current.usageVRAM == 0 && current.budgetVRAM == 0 {
-			current.usageVRAM  = 0
+			current.usageVRAM = 0
 			current.budgetVRAM = current.totalVRAM
 		}