feat(backend): auto-default physical batch to 2048 on Blackwell GPUs

On NVIDIA Blackwell consumer GPUs (sm_120/121, incl. GB10/DGX Spark) a larger physical batch (n_ubatch) materially lifts MoE prefill throughput: measured on a GB10 with Qwen3-30B-A3B, it raises the prefill ceiling, with gains saturating at ~2048. When a model config leaves `batch:` unset, EffectiveBatchSize now picks 2048 on Blackwell instead of 512; an explicit `batch:` always overrides. Detection is a shared, cached Go helper (xsysinfo.IsNVIDIABlackwell, nvidia-smi compute_cap >= 12). The logic is isolated in core/backend/hardware_defaults.go and applied at the common ModelOptions builder, so it covers the C++ llama.cpp backend too. Measured (GB10, Qwen3-Coder-30B-A3B MXFP4): prefill ub512 2994 -> ub2048 3316 t/s; gains saturate past 2048. Also recorded in the DGX gap plan: 4-bit quant alone captures the decode win (Q4_K_M 93.5 >= MXFP4 86.4 t/s); MXFP4's only edge is prefill via Blackwell FP4 tensor cores. Tests: hardware_defaults_internal_test.go; existing NBatch specs are pinned to the no-Blackwell branch for determinism. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-23 16:19:07 -04:00 · 2026-06-19 20:46:45 +00:00
parent 7aa61d4c32
commit aba0bfd24f
6 changed files with 191 additions and 13 deletions
--- a/core/backend/hardware_defaults.go
+++ b/core/backend/hardware_defaults.go
@@ -0,0 +1,43 @@
+package backend
+
+// Hardware-specific backend defaults.
+//
+// This file centralizes tuning that depends on the *detected hardware* rather
+// than on the model config. The model config (explicit `batch:`, `context_size:`
+// …) always takes precedence; these helpers only fill values the user left
+// unset, so behavior is unchanged unless the matching hardware is present.
+//
+// Placement note: this runs in the process that builds the gRPC ModelOptions
+// sent to every backend (including the C++ llama.cpp grpc-server), so it is the
+// one common point that covers all backends. For distributed setups where the
+// backend runs on a different host than the orchestrator, worker-side detection
+// (e.g. the C++ backend reading cudaGetDeviceProperties) would be more precise;
+// this single-host default is the pragmatic common case.
+
+import (
+	"github.com/mudler/LocalAI/pkg/xsysinfo"
+	"github.com/mudler/xlog"
+)
+
+// BlackwellBatchSize is the default physical batch (n_batch/n_ubatch) used on
+// NVIDIA Blackwell consumer GPUs (sm_120/121, incl. GB10 / DGX Spark). A larger
+// physical batch materially lifts MoE prefill throughput there (per-expert GEMM
+// tiles fill better): on a GB10 with Qwen3-30B-A3B it raised the prefill ceiling
+// by roughly 10-15%, with gains saturating around 2048. It is applied only when
+// the model config does not set an explicit `batch:`.
+const BlackwellBatchSize = 2048
+
+// detectBlackwellGPU is a test seam over xsysinfo.IsNVIDIABlackwell: specs swap
+// in a stub to force either hardware branch deterministically, then restore it.
+var detectBlackwellGPU = xsysinfo.IsNVIDIABlackwell
+
+// hardwareDefaultBatchSize picks the physical-batch default for the detected
+// hardware. On a Blackwell GPU it yields BlackwellBatchSize; otherwise the
+// caller-supplied fallback is returned untouched. Meant for unset-batch configs.
+func hardwareDefaultBatchSize(fallback int) int {
+	if !detectBlackwellGPU() {
+		return fallback
+	}
+	xlog.Debug("Blackwell GPU detected; defaulting physical batch higher for MoE prefill", "batch", BlackwellBatchSize)
+	return BlackwellBatchSize
+}
--- a/core/backend/hardware_defaults_internal_test.go
+++ b/core/backend/hardware_defaults_internal_test.go
@@ -0,0 +1,50 @@
+package backend
+
+import (
+	"github.com/mudler/LocalAI/core/config"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("hardware-specific defaults", func() {
+	// The detector seam is saved before and restored after every spec.
+	var savedDetect func() bool
+	BeforeEach(func() { savedDetect = detectBlackwellGPU })
+	AfterEach(func() { detectBlackwellGPU = savedDetect })
+
+	// stubDetect makes the seam report the given answer for the current spec.
+	stubDetect := func(blackwell bool) {
+		detectBlackwellGPU = func() bool { return blackwell }
+	}
+
+	Describe("hardwareDefaultBatchSize", func() {
+		It("returns the fallback when not Blackwell", func() {
+			stubDetect(false)
+			Expect(hardwareDefaultBatchSize(512)).To(Equal(512))
+		})
+
+		It("returns BlackwellBatchSize on Blackwell", func() {
+			stubDetect(true)
+			Expect(hardwareDefaultBatchSize(512)).To(Equal(BlackwellBatchSize))
+		})
+	})
+
+	Describe("EffectiveBatchSize on Blackwell", func() {
+		threads, ctx := 1, 4096
+		BeforeEach(func() { stubDetect(true) })
+		// baseConfig leaves batch unset so the hardware default can apply.
+		baseConfig := func() config.ModelConfig {
+			return config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}}
+		}
+
+		It("defaults an unset batch to 2048 on Blackwell", func() {
+			Expect(grpcModelOpts(baseConfig(), "/tmp/models").NBatch).To(BeEquivalentTo(BlackwellBatchSize))
+		})
+
+		It("keeps an explicit batch over the Blackwell default", func() {
+			cfg := baseConfig()
+			cfg.Batch = 256
+			Expect(grpcModelOpts(cfg, "/tmp/models").NBatch).To(BeEquivalentTo(256))
+		})
+	})
+})
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -122,7 +122,10 @@ func EffectiveBatchSize(c config.ModelConfig) int {
 	if ctx := EffectiveContextSize(c); singlePass && ctx > DefaultBatchSize {
 		return ctx
 	}
-	return DefaultBatchSize
+	// Hardware-tuned default when the config leaves batch unset (e.g. a larger
+	// physical batch lifts MoE prefill on Blackwell). Explicit `batch:` (handled
+	// above) always overrides this. See hardware_defaults.go.
+	return hardwareDefaultBatchSize(DefaultBatchSize)
 }

 func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
--- a/core/backend/options_internal_test.go
+++ b/core/backend/options_internal_test.go
@@ -103,6 +103,18 @@ var _ = Describe("grpcModelOpts NBatch", func() {
 	threads := 1
 	ctx := 4096

+	// Pin the hardware seam off so these baseline expectations are
+	// deterministic regardless of the host GPU. Blackwell behavior is covered
+	// in hardware_defaults_internal_test.go.
+	var origDetect func() bool
+	BeforeEach(func() {
+		origDetect = detectBlackwellGPU
+		detectBlackwellGPU = func() bool { return false }
+	})
+	AfterEach(func() {
+		detectBlackwellGPU = origDetect
+	})
+
 	It("defaults to 512 for an ordinary model", func() {
 		cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}}
 		opts := grpcModelOpts(cfg, "/tmp/models")