From aba0bfd24fec2ebee718337a37814c7ebad398b5 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 19 Jun 2026 20:46:45 +0000 Subject: [PATCH] feat(backend): auto-default physical batch to 2048 on Blackwell GPUs On NVIDIA Blackwell consumer GPUs (sm_120/121, incl. GB10/DGX Spark) a larger physical batch (n_ubatch) materially lifts MoE prefill throughput - measured on a GB10 with Qwen3-30B-A3B to lift the prefill ceiling and saturate at ~2048. When a model config leaves `batch:` unset, EffectiveBatchSize now picks 2048 on Blackwell instead of 512; explicit `batch:` always overrides. Detection is a shared, cached Go helper (xsysinfo.IsNVIDIABlackwell, nvidia-smi compute_cap >= 12). Logic is isolated in core/backend/hardware_defaults.go and applied at the common ModelOptions builder, so it covers the C++ llama.cpp backend too. Measured (GB10, Qwen3-Coder-30B-A3B MXFP4): prefill ub512 2994 -> ub2048 3316 t/s; saturates past 2048. Also recorded in the DGX gap plan: 4-bit quant alone captures the decode win (Q4_K_M 93.5 >= MXFP4 86.4 t/s), MXFP4's only edge is prefill via Blackwell FP4 tensor cores. Tests: hardware_defaults_internal_test.go; existing NBatch specs pinned to the no-Blackwell branch for determinism. 
Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto --- .../cpp/llama-cpp/paged/DGX_BLACKWELL_PLAN.md | 19 ++++- core/backend/hardware_defaults.go | 43 +++++++++++ .../hardware_defaults_internal_test.go | 50 +++++++++++++ core/backend/options.go | 5 +- core/backend/options_internal_test.go | 12 +++ pkg/xsysinfo/gpu.go | 75 ++++++++++++++++--- 6 files changed, 191 insertions(+), 13 deletions(-) create mode 100644 core/backend/hardware_defaults.go create mode 100644 core/backend/hardware_defaults_internal_test.go diff --git a/backend/cpp/llama-cpp/paged/DGX_BLACKWELL_PLAN.md b/backend/cpp/llama-cpp/paged/DGX_BLACKWELL_PLAN.md index adb6640a4..c49c95bfa 100644 --- a/backend/cpp/llama-cpp/paged/DGX_BLACKWELL_PLAN.md +++ b/backend/cpp/llama-cpp/paged/DGX_BLACKWELL_PLAN.md @@ -85,9 +85,24 @@ Concurrency (decode-phase aggregate `S_TG`, ub2048), MXFP4 vs Q8 vs vLLM-FP8: **Lever-1 verdict:** MXFP4 is a large, free win — decode +50–66% over Q8, prefill plateau +66% (2200→3650). MXFP4 decode **wins at B=1, near-parity at B=8** vs vLLM; only falls behind at high concurrency. **Prefill still plateaus (~3650)** — the MoE prefill GEMM doesn't scale with batch (no fused grouped GEMM; ubatch-limited). That plateau is the real remaining structural gap → Levers 2–3. Quality caveat unchanged (MXFP4 4-bit vs vLLM FP8 8-bit; quality not yet evaluated). ### Lever 2 — `n_ubatch` / `n_batch` tuning (standalone) -Status: **DONE** +Status: **DONE + SHIPPED (auto-default implemented)** MXFP4 pp4096 vs ubatch: ub512=2994, **ub2048=3316**, ub4096=2820(noisy), ub8192=3180. -**Verdict:** prefill saturates at ub=2048; larger ubatch gives nothing. The ~3300–3650 ceiling is the **MoE GEMM kernel**, not batch size. → No more free config wins; the rest is kernel work (Levers 3–5). Recommendation: ship `n_ubatch=2048` as the LocalAI default for MoE prefill on Blackwell. +**Verdict:** prefill saturates at ub=2048; larger ubatch gives nothing. 
The ~3300–3650 ceiling is the **MoE GEMM kernel**, not batch size. → No more free config wins; the rest is kernel work (Levers 3–5). +**Implemented:** `core/backend/hardware_defaults.go` — `EffectiveBatchSize` now defaults the physical batch +(n_batch→n_ubatch alias) to **2048 on Blackwell** (`xsysinfo.IsNVIDIABlackwell`, cc≥12 / sm_120/121) when the +config leaves `batch:` unset; explicit `batch:` always wins. Detection is a shared Go helper; placed at the +common ModelOptions builder so it covers the C++ llama.cpp backend too. Tests: `hardware_defaults_internal_test.go`. + +### Lever 1b — Standard Q4 vs MXFP4 (what's actually MXFP4-specific) +**Q4_K_M** (17.3 GiB) vs **MXFP4** (15.9 GiB), ub2048: +| metric | Q4_K_M | MXFP4 | Q8 | +|---|---|---|---| +| decode tg128 | **93.5** | 86.4 | 62.2 | +| prefill pp512 | 2164 | **3061** | 2215 | +| prefill pp2048 | 2953 | **3441** | ~2200 | +**Verdict:** the **decode win is just "4-bit"** — plain Q4_K_M matches/beats MXFP4 on decode (both memory-bound). +MXFP4's *only* real edge is **prefill (+41% at pp512, +17% at pp2048 over Q4_K_M)** via Blackwell FP4 tensor cores. So for shipping, +**"4-bit quant + ubatch=2048" captures most of the win portably**; MXFP4 is a Blackwell-only prefill extra. ### Lever 3 — Fused FP4/FP8 MoE grouped GEMM (+ activation-quant fusion) Status: **DESIGNED, not built** (multi-week kernel R&D). This is the single biggest remaining prefill win. diff --git a/core/backend/hardware_defaults.go b/core/backend/hardware_defaults.go new file mode 100644 index 000000000..4c915d69a --- /dev/null +++ b/core/backend/hardware_defaults.go @@ -0,0 +1,43 @@ +package backend + +// Hardware-specific backend defaults. +// +// This file centralizes tuning that depends on the *detected hardware* rather +// than on the model config. The model config (explicit `batch:`, `context_size:` +// …) always takes precedence; these helpers only fill values the user left +// unset, so behavior is unchanged unless the matching hardware is present. 
+// +// Placement note: this runs in the process that builds the gRPC ModelOptions +// sent to every backend (including the C++ llama.cpp grpc-server), so it is the +// one common point that covers all backends. For distributed setups where the +// backend runs on a different host than the orchestrator, worker-side detection +// (e.g. the C++ backend reading cudaGetDeviceProperties) would be more precise; +// this single-host default is the pragmatic common case. + +import ( + "github.com/mudler/LocalAI/pkg/xsysinfo" + "github.com/mudler/xlog" +) + +// BlackwellBatchSize is the physical batch (n_batch/n_ubatch) default on NVIDIA +// Blackwell consumer GPUs (sm_120/121, incl. GB10 / DGX Spark). A larger +// physical batch materially lifts MoE prefill throughput there (per-expert GEMM +// tiles fill better); measured on a GB10 with Qwen3-30B-A3B to lift the prefill +// ceiling ~+10-15% and saturate around 2048. Only applied when the model config +// does not set an explicit `batch:`. +const BlackwellBatchSize = 2048 + +// detectBlackwellGPU is a seam over xsysinfo.IsNVIDIABlackwell so tests can +// force the hardware branch deterministically. +var detectBlackwellGPU = xsysinfo.IsNVIDIABlackwell + +// hardwareDefaultBatchSize returns the physical-batch default for the detected +// hardware, falling back to the given value when no hardware-specific tuning +// applies. Used by EffectiveBatchSize only when the config leaves batch unset. 
+func hardwareDefaultBatchSize(fallback int) int { + if detectBlackwellGPU() { + xlog.Debug("Blackwell GPU detected; defaulting physical batch higher for MoE prefill", "batch", BlackwellBatchSize) + return BlackwellBatchSize + } + return fallback +} diff --git a/core/backend/hardware_defaults_internal_test.go b/core/backend/hardware_defaults_internal_test.go new file mode 100644 index 000000000..df621cded --- /dev/null +++ b/core/backend/hardware_defaults_internal_test.go @@ -0,0 +1,50 @@ +package backend + +import ( + "github.com/mudler/LocalAI/core/config" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("hardware-specific defaults", func() { + var origDetect func() bool + + BeforeEach(func() { + origDetect = detectBlackwellGPU + }) + AfterEach(func() { + detectBlackwellGPU = origDetect + }) + + Describe("hardwareDefaultBatchSize", func() { + It("returns the fallback when not Blackwell", func() { + detectBlackwellGPU = func() bool { return false } + Expect(hardwareDefaultBatchSize(512)).To(Equal(512)) + }) + + It("returns BlackwellBatchSize on Blackwell", func() { + detectBlackwellGPU = func() bool { return true } + Expect(hardwareDefaultBatchSize(512)).To(Equal(BlackwellBatchSize)) + }) + }) + + Describe("EffectiveBatchSize on Blackwell", func() { + threads := 1 + ctx := 4096 + + It("defaults an unset batch to 2048 on Blackwell", func() { + detectBlackwellGPU = func() bool { return true } + cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}} + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.NBatch).To(BeEquivalentTo(BlackwellBatchSize)) + }) + + It("keeps an explicit batch over the Blackwell default", func() { + detectBlackwellGPU = func() bool { return true } + cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}} + cfg.Batch = 256 + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.NBatch).To(BeEquivalentTo(256)) + }) + }) +}) diff 
--git a/core/backend/options.go b/core/backend/options.go index efe6c649f..d66b55049 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -122,7 +122,10 @@ func EffectiveBatchSize(c config.ModelConfig) int { if ctx := EffectiveContextSize(c); singlePass && ctx > DefaultBatchSize { return ctx } - return DefaultBatchSize + // Hardware-tuned default when the config leaves batch unset (e.g. a larger + // physical batch lifts MoE prefill on Blackwell). Explicit `batch:` (handled + // above) always overrides this. See hardware_defaults.go. + return hardwareDefaultBatchSize(DefaultBatchSize) } func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions { diff --git a/core/backend/options_internal_test.go b/core/backend/options_internal_test.go index 022d7b1d9..7c5b3dad6 100644 --- a/core/backend/options_internal_test.go +++ b/core/backend/options_internal_test.go @@ -103,6 +103,18 @@ var _ = Describe("grpcModelOpts NBatch", func() { threads := 1 ctx := 4096 + // Pin the hardware seam off so these baseline expectations are + // deterministic regardless of the host GPU. Blackwell behavior is covered + // in hardware_defaults_internal_test.go. 
+ var origDetect func() bool + BeforeEach(func() { + origDetect = detectBlackwellGPU + detectBlackwellGPU = func() bool { return false } + }) + AfterEach(func() { + detectBlackwellGPU = origDetect + }) + It("defaults to 512 for an ordinary model", func() { cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}} opts := grpcModelOpts(cfg, "/tmp/models") diff --git a/pkg/xsysinfo/gpu.go b/pkg/xsysinfo/gpu.go index a5575edb8..5cf7a2a9f 100644 --- a/pkg/xsysinfo/gpu.go +++ b/pkg/xsysinfo/gpu.go @@ -38,9 +38,9 @@ var UnifiedMemoryDevices = []string{ // GPUMemoryInfo contains real-time GPU memory usage information type GPUMemoryInfo struct { - Index int `json:"index"` - Name string `json:"name"` - Vendor string `json:"vendor"` + Index int `json:"index"` + Name string `json:"name"` + Vendor string `json:"vendor"` // BDF is the canonical PCI bus address (dddd:bb:dd.f) when known. // Populated by detection paths that can attribute the device to a // PCI location (clinfo, future amdgpu/nvidia paths); empty for @@ -307,6 +307,61 @@ func GetGPUAggregateInfo() GPUAggregateInfo { return aggregate } +var ( + blackwellOnce sync.Once + blackwellResult bool +) + +// IsNVIDIABlackwell reports whether an NVIDIA Blackwell-class consumer GPU is +// present, i.e. compute capability major version >= 12 (today sm_120 RTX 50-series, sm_121 GB10 / +// DGX Spark). The result is detected once via nvidia-smi and cached. +// +// Note: datacenter Blackwell (B100/B200/GB200, sm_100 / cc 10.0) reports a +// different compute capability and is intentionally NOT matched here — this +// targets the sm_12x family where we measured the larger-physical-batch MoE +// prefill win. Returns false when nvidia-smi is unavailable, the query fails, or no device reports +// a compute capability major version >= 12. 
+func IsNVIDIABlackwell() bool { + blackwellOnce.Do(func() { + blackwellResult = detectNVIDIABlackwell() + }) + return blackwellResult +} + +func detectNVIDIABlackwell() bool { + if _, err := exec.LookPath("nvidia-smi"); err != nil { + return false + } + + cmd := exec.Command("nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader") + + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + if err := cmd.Run(); err != nil { + xlog.Debug("nvidia-smi compute_cap query failed", "error", err, "stderr", stderr.String()) + return false + } + + for _, line := range strings.Split(strings.TrimSpace(stdout.String()), "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + // compute_cap looks like "12.1"; match major version >= 12 (sm_12x). + major := line + if dot := strings.IndexByte(line, '.'); dot >= 0 { + major = line[:dot] + } + if m, err := strconv.Atoi(major); err == nil && m >= 12 { + xlog.Debug("NVIDIA Blackwell-class GPU detected", "compute_cap", line) + return true + } + } + return false +} + // getNVIDIAGPUMemory queries NVIDIA GPUs using nvidia-smi func getNVIDIAGPUMemory() []GPUMemoryInfo { // Check if nvidia-smi is available @@ -866,12 +921,12 @@ func getVulkanGPUMemory() []GPUMemoryInfo { } type vulkanGPUTextInfo struct { - index int - name string - deviceType string - totalVRAM uint64 - budgetVRAM uint64 - usageVRAM uint64 + index int + name string + deviceType string + totalVRAM uint64 + budgetVRAM uint64 + usageVRAM uint64 } func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo { @@ -909,7 +964,7 @@ func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo { } else if current.usageVRAM != 0 && current.budgetVRAM == 0 { current.budgetVRAM = current.totalVRAM - current.usageVRAM } else if current.usageVRAM == 0 && current.budgetVRAM == 0 { - current.usageVRAM = 0 + current.usageVRAM = 0 current.budgetVRAM = current.totalVRAM }