mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-23 16:19:07 -04:00
feat(backend): auto-default physical batch to 2048 on Blackwell GPUs
On NVIDIA Blackwell consumer GPUs (sm_120/121, incl. GB10/DGX Spark) a larger physical batch (n_ubatch) materially lifts MoE prefill throughput: measured on a GB10 with Qwen3-30B-A3B, it raises the prefill ceiling, with the gain saturating at ~2048. When a model config leaves `batch:` unset, EffectiveBatchSize now picks 2048 on Blackwell instead of 512; explicit `batch:` always overrides. Detection is a shared, cached Go helper (xsysinfo.IsNVIDIABlackwell, nvidia-smi compute_cap >= 12). Logic is isolated in core/backend/hardware_defaults.go and applied at the common ModelOptions builder, so it covers the C++ llama.cpp backend too. Measured (GB10, Qwen3-Coder-30B-A3B MXFP4): prefill ub512 2994 -> ub2048 3316 t/s; saturates past 2048. Also recorded in the DGX gap plan: 4-bit quant alone captures the decode win (Q4_K_M 93.5 >= MXFP4 86.4 t/s), MXFP4's only edge is prefill via Blackwell FP4 tensor cores. Tests: hardware_defaults_internal_test.go; existing NBatch specs pinned to the no-Blackwell branch for determinism. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
@@ -38,9 +38,9 @@ var UnifiedMemoryDevices = []string{
|
||||
|
||||
// GPUMemoryInfo contains real-time GPU memory usage information
|
||||
type GPUMemoryInfo struct {
|
||||
Index int `json:"index"`
|
||||
Name string `json:"name"`
|
||||
Vendor string `json:"vendor"`
|
||||
Index int `json:"index"`
|
||||
Name string `json:"name"`
|
||||
Vendor string `json:"vendor"`
|
||||
// BDF is the canonical PCI bus address (dddd:bb:dd.f) when known.
|
||||
// Populated by detection paths that can attribute the device to a
|
||||
// PCI location (clinfo, future amdgpu/nvidia paths); empty for
|
||||
@@ -307,6 +307,61 @@ func GetGPUAggregateInfo() GPUAggregateInfo {
|
||||
return aggregate
|
||||
}
|
||||
|
||||
var (
|
||||
blackwellOnce sync.Once
|
||||
blackwellResult bool
|
||||
)
|
||||
|
||||
// IsNVIDIABlackwell reports whether an NVIDIA Blackwell-class consumer GPU is
|
||||
// present, i.e. compute capability 12.x (sm_120 RTX 50-series, sm_121 GB10 /
|
||||
// DGX Spark). The result is detected once via nvidia-smi and cached.
|
||||
//
|
||||
// Note: datacenter Blackwell (B100/B200/GB200, sm_100 / cc 10.0) reports a
|
||||
// different compute capability and is intentionally NOT matched here — this
|
||||
// targets the sm_12x family where we measured the larger-physical-batch MoE
|
||||
// prefill win. Returns false when nvidia-smi is unavailable or reports no 12.x
|
||||
// device.
|
||||
func IsNVIDIABlackwell() bool {
|
||||
blackwellOnce.Do(func() {
|
||||
blackwellResult = detectNVIDIABlackwell()
|
||||
})
|
||||
return blackwellResult
|
||||
}
|
||||
|
||||
func detectNVIDIABlackwell() bool {
|
||||
if _, err := exec.LookPath("nvidia-smi"); err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
cmd := exec.Command("nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader")
|
||||
|
||||
var stdout, stderr bytes.Buffer
|
||||
cmd.Stdout = &stdout
|
||||
cmd.Stderr = &stderr
|
||||
|
||||
if err := cmd.Run(); err != nil {
|
||||
xlog.Debug("nvidia-smi compute_cap query failed", "error", err, "stderr", stderr.String())
|
||||
return false
|
||||
}
|
||||
|
||||
for _, line := range strings.Split(strings.TrimSpace(stdout.String()), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
// compute_cap looks like "12.1"; match major version >= 12 (sm_12x).
|
||||
major := line
|
||||
if dot := strings.IndexByte(line, '.'); dot >= 0 {
|
||||
major = line[:dot]
|
||||
}
|
||||
if m, err := strconv.Atoi(major); err == nil && m >= 12 {
|
||||
xlog.Debug("NVIDIA Blackwell-class GPU detected", "compute_cap", line)
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// getNVIDIAGPUMemory queries NVIDIA GPUs using nvidia-smi
|
||||
func getNVIDIAGPUMemory() []GPUMemoryInfo {
|
||||
// Check if nvidia-smi is available
|
||||
@@ -866,12 +921,12 @@ func getVulkanGPUMemory() []GPUMemoryInfo {
|
||||
}
|
||||
|
||||
// vulkanGPUTextInfo groups the values recorded for a single device entry: its
// index, name and device type, plus total, budget and usage VRAM figures.
// NOTE(review): the units of the VRAM fields are not visible in this
// declaration — confirm against the code that fills them in.
type vulkanGPUTextInfo struct {
	index      int
	name       string
	deviceType string
	totalVRAM  uint64
	budgetVRAM uint64
	usageVRAM  uint64
}
|
||||
|
||||
func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo {
|
||||
@@ -909,7 +964,7 @@ func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo {
|
||||
} else if current.usageVRAM != 0 && current.budgetVRAM == 0 {
|
||||
current.budgetVRAM = current.totalVRAM - current.usageVRAM
|
||||
} else if current.usageVRAM == 0 && current.budgetVRAM == 0 {
|
||||
current.usageVRAM = 0
|
||||
current.usageVRAM = 0
|
||||
current.budgetVRAM = current.totalVRAM
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user