feat(backend): auto-default physical batch to 2048 on Blackwell GPUs

On NVIDIA Blackwell consumer GPUs (sm_120/121, incl. GB10/DGX Spark) a larger
physical batch (n_ubatch) materially lifts MoE prefill throughput - measured on
a GB10 with Qwen3-30B-A3B to lift the prefill ceiling and saturate at ~2048.

When a model config leaves `batch:` unset, EffectiveBatchSize now picks 2048 on
Blackwell instead of 512; explicit `batch:` always overrides. Detection is a
shared, cached Go helper (xsysinfo.IsNVIDIABlackwell, nvidia-smi compute_cap
>= 12). Logic is isolated in core/backend/hardware_defaults.go and applied at
the common ModelOptions builder, so it covers the C++ llama.cpp backend too.

Measured (GB10, Qwen3-Coder-30B-A3B MXFP4): prefill ub512 2994 -> ub2048 3316
t/s; saturates past 2048. Also recorded in the DGX gap plan: 4-bit quant alone
captures the decode win (Q4_K_M 93.5 >= MXFP4 86.4 t/s), MXFP4's only edge is
prefill via Blackwell FP4 tensor cores.

Tests: hardware_defaults_internal_test.go; existing NBatch specs pinned to the
no-Blackwell branch for determinism.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2026-06-19 20:46:45 +00:00
parent 7aa61d4c32
commit aba0bfd24f
6 changed files with 191 additions and 13 deletions

View File

@@ -38,9 +38,9 @@ var UnifiedMemoryDevices = []string{
// GPUMemoryInfo contains real-time GPU memory usage information
type GPUMemoryInfo struct {
Index int `json:"index"`
Name string `json:"name"`
Vendor string `json:"vendor"`
Index int `json:"index"`
Name string `json:"name"`
Vendor string `json:"vendor"`
// BDF is the canonical PCI bus address (dddd:bb:dd.f) when known.
// Populated by detection paths that can attribute the device to a
// PCI location (clinfo, future amdgpu/nvidia paths); empty for
@@ -307,6 +307,61 @@ func GetGPUAggregateInfo() GPUAggregateInfo {
return aggregate
}
var (
blackwellOnce sync.Once
blackwellResult bool
)
// IsNVIDIABlackwell reports whether an NVIDIA Blackwell-class consumer GPU is
// present, i.e. compute capability 12.x (sm_120 RTX 50-series, sm_121 GB10 /
// DGX Spark). The result is detected once via nvidia-smi and cached.
//
// Note: datacenter Blackwell (B100/B200/GB200, sm_100 / cc 10.0) reports a
// different compute capability and is intentionally NOT matched here — this
// targets the sm_12x family where we measured the larger-physical-batch MoE
// prefill win. Returns false when nvidia-smi is unavailable or reports no 12.x
// device.
func IsNVIDIABlackwell() bool {
blackwellOnce.Do(func() {
blackwellResult = detectNVIDIABlackwell()
})
return blackwellResult
}
func detectNVIDIABlackwell() bool {
if _, err := exec.LookPath("nvidia-smi"); err != nil {
return false
}
cmd := exec.Command("nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader")
var stdout, stderr bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr
if err := cmd.Run(); err != nil {
xlog.Debug("nvidia-smi compute_cap query failed", "error", err, "stderr", stderr.String())
return false
}
for _, line := range strings.Split(strings.TrimSpace(stdout.String()), "\n") {
line = strings.TrimSpace(line)
if line == "" {
continue
}
// compute_cap looks like "12.1"; match major version >= 12 (sm_12x).
major := line
if dot := strings.IndexByte(line, '.'); dot >= 0 {
major = line[:dot]
}
if m, err := strconv.Atoi(major); err == nil && m >= 12 {
xlog.Debug("NVIDIA Blackwell-class GPU detected", "compute_cap", line)
return true
}
}
return false
}
// getNVIDIAGPUMemory queries NVIDIA GPUs using nvidia-smi
func getNVIDIAGPUMemory() []GPUMemoryInfo {
// Check if nvidia-smi is available
@@ -866,12 +921,12 @@ func getVulkanGPUMemory() []GPUMemoryInfo {
}
type vulkanGPUTextInfo struct {
index int
name string
deviceType string
totalVRAM uint64
budgetVRAM uint64
usageVRAM uint64
index int
name string
deviceType string
totalVRAM uint64
budgetVRAM uint64
usageVRAM uint64
}
func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo {
@@ -909,7 +964,7 @@ func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo {
} else if current.usageVRAM != 0 && current.budgetVRAM == 0 {
current.budgetVRAM = current.totalVRAM - current.usageVRAM
} else if current.usageVRAM == 0 && current.budgetVRAM == 0 {
current.usageVRAM = 0
current.usageVRAM = 0
current.budgetVRAM = current.totalVRAM
}