Files
LocalAI/core/config/hardware_defaults_test.go
LocalAI [bot] b081247d95 feat(config): hardware-tuned defaults — Blackwell batch + VRAM-scaled concurrency (#10411)
* feat(config): node-aware hardware defaults — larger physical batch on Blackwell

A larger physical batch (n_batch/n_ubatch) materially lifts MoE prefill on
NVIDIA Blackwell consumer GPUs (sm_120/121, incl. GB10 / DGX Spark) — measured
on a GB10 with Qwen3-Coder-30B-A3B, the prefill ceiling rises (ub512 ~2994 ->
ub2048 ~3316 t/s) and saturates around 2048.

The heuristic lives in core/config alongside the other config overriders
(ApplyInferenceDefaults, guessDefaultsFromFile/NGPULayers) — they all fill the
ModelConfig from heuristics, so hardware tuning is the same domain and stays in
one place. It is parameterized on a GPU descriptor (not direct detection) so it
works in both deployment shapes:

- Single host: SetDefaults applies it with the LocalGPU.
- Distributed: only the worker sees the GPU, so the worker reports its compute
  capability on registration (gpu_compute_capability -> BackendNode), and the
  router re-applies the SAME core/config heuristic for the SELECTED node before
  loading — fixing the case where the frontend has no GPU at all.

Explicit `batch:` always wins (only managed default values are touched).
xsysinfo gains NVIDIAComputeCapability() (detection only); all interpretation
lives in core/config. Tests: core/config, pkg/xsysinfo, core/services/nodes.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* test(config): injectable local-GPU seam + single-instance coverage

Make local GPU detection an injectable package var (localGPU) so the
single-instance path (SetDefaults -> ApplyHardwareDefaults) is deterministically
testable without a real GPU, mirroring the distributed override's coverage.
Adds specs asserting SetDefaults sets the Blackwell physical batch, leaves it
unset on non-Blackwell, and never overrides an explicit batch.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(config): default concurrent serving (n_parallel) by GPU VRAM

The llama.cpp backend defaults n_parallel=1, which serializes multi-user requests
and leaves continuous batching off (it auto-enables only at n_parallel>1). Fold a
VRAM-scaled parallel-slot default into the hardware-config path so multi-user
serving works out of the box: >=32GiB->8, >=8GiB->4, >=4GiB->2, else unchanged.
With the backend's unified KV the slots SHARE the context budget, so this adds
concurrency without multiplying KV memory. Explicit parallel/n_parallel always
wins. EnsureParallelOption is shared by the single-host path (ApplyHardwareDefaults
with the local GPU) and the distributed router (per selected node's reported VRAM,
since the frontend may have no GPU). LocalGPU now also reports VRAM.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-20 14:45:59 +02:00

98 lines
3.4 KiB
Go

package config_test
import (
. "github.com/mudler/LocalAI/core/config"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
var _ = Describe("Hardware-driven config defaults", func() {
// Table-driven check of GPU.IsNVIDIABlackwell: each entry supplies a
// compute-capability string and the verdict the method is expected to
// return for a GPU carrying that capability.
DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
	func(capability string, expected bool) {
		gpu := GPU{ComputeCapability: capability}
		Expect(gpu.IsNVIDIABlackwell()).To(Equal(expected))
	},
	// Capabilities expected to be reported as Blackwell.
	Entry("GB10 12.1", "12.1", true),
	Entry("RTX 50 12.0", "12.0", true),
	Entry("future 13.0", "13.0", true),
	// Capabilities expected NOT to be reported as Blackwell, including
	// the empty (undetected) capability string.
	Entry("Hopper 9.0", "9.0", false),
	Entry("Ada 8.9", "8.9", false),
	Entry("datacenter Blackwell sm_100 10.0", "10.0", false),
	Entry("unknown", "", false),
)
// Specs for the physical-batch helpers: PhysicalBatch picks a batch size from
// a GPU descriptor, IsManagedPhysicalBatch reports whether a batch value is
// one of the known defaults.
Describe("PhysicalBatch / IsManagedPhysicalBatch", func() {
	It("returns the Blackwell batch on Blackwell", func() {
		blackwell := GPU{ComputeCapability: "12.1"}
		Expect(PhysicalBatch(blackwell)).To(Equal(BlackwellPhysicalBatch))
	})

	It("returns the default batch otherwise", func() {
		// A non-Blackwell capability and a zero-value descriptor must both
		// resolve to the default batch.
		others := []GPU{
			{ComputeCapability: "9.0"},
			{},
		}
		for _, gpu := range others {
			Expect(PhysicalBatch(gpu)).To(Equal(DefaultPhysicalBatch))
		}
	})

	It("recognizes managed defaults but not explicit values", func() {
		defaultIsManaged := IsManagedPhysicalBatch(DefaultPhysicalBatch)
		blackwellIsManaged := IsManagedPhysicalBatch(BlackwellPhysicalBatch)
		explicitIsManaged := IsManagedPhysicalBatch(1024)

		Expect(defaultIsManaged).To(BeTrue())
		Expect(blackwellIsManaged).To(BeTrue())
		Expect(explicitIsManaged).To(BeFalse())
	})
})
Describe("ApplyHardwareDefaults", func() {
It("raises an unset batch to 2048 on Blackwell", func() {
cfg := &ModelConfig{}
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
})
It("leaves batch unset on non-Blackwell", func() {
cfg := &ModelConfig{}
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0"})
Expect(cfg.Batch).To(Equal(0))
})
It("never overrides an explicit batch", func() {
cfg := &ModelConfig{}
cfg.Batch = 1024
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
Expect(cfg.Batch).To(Equal(1024))
})
It("no-ops on nil", func() {
Expect(func() { ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"}) }).ToNot(Panic())
})
})
// gib is the number of bytes in one GiB; the specs below express VRAM sizes
// as multiples of it.
const gib = uint64(1) << 30

// Table-driven check of DefaultParallelSlots: each entry supplies a VRAM size
// in GiB and the slot count expected for a GPU reporting that much VRAM.
// Every tier boundary (32, 8 and 4 GiB) is exercised exactly, in addition to
// values inside each tier, so an off-by-one in a threshold comparison
// (> instead of >=) is caught.
DescribeTable("DefaultParallelSlots (by VRAM)",
	func(vramGiB uint64, want int) {
		Expect(DefaultParallelSlots(GPU{VRAM: vramGiB * gib})).To(Equal(want))
	},
	Entry("GB10 119 GiB", uint64(119), 8),
	Entry("48 GiB", uint64(48), 8),
	Entry("32 GiB (tier boundary)", uint64(32), 8),
	Entry("24 GiB", uint64(24), 4),
	Entry("8 GiB", uint64(8), 4),
	Entry("6 GiB", uint64(6), 2),
	Entry("4 GiB (tier boundary)", uint64(4), 2),
	Entry("2 GiB", uint64(2), 1),
	Entry("unknown 0", uint64(0), 1),
)
// Specs for the parallel-slot option that ApplyHardwareDefaults writes into
// ModelConfig.Options: added with a VRAM-dependent value on capable GPUs,
// omitted on small or unknown VRAM, and never replacing an explicit option.
Describe("ApplyHardwareDefaults parallel slots", func() {
	It("adds a VRAM-scaled parallel option on a capable GPU", func() {
		cfg := &ModelConfig{}
		ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
		Expect(cfg.Options).To(ContainElement("parallel:8"))
	})

	It("scales the slot count down with VRAM", func() {
		cfg := &ModelConfig{}
		ApplyHardwareDefaults(cfg, GPU{VRAM: 24 * gib})
		Expect(cfg.Options).To(ContainElement("parallel:4"))
	})

	It("adds no parallel option on small/unknown VRAM", func() {
		// Cover both halves of the spec title: a small GPU (2 GiB) and an
		// unknown one (zero-value descriptor, VRAM == 0). Each case gets a
		// fresh config so one cannot mask the other.
		cases := []GPU{
			{VRAM: 2 * gib},
			{},
		}
		for _, gpu := range cases {
			cfg := &ModelConfig{}
			ApplyHardwareDefaults(cfg, gpu)
			Expect(cfg.Options).ToNot(
				ContainElement(ContainSubstring("parallel")),
				"VRAM=%d bytes", gpu.VRAM,
			)
		}
	})

	It("never overrides an explicit parallel option", func() {
		cfg := &ModelConfig{Options: []string{"parallel:2"}}
		ApplyHardwareDefaults(cfg, GPU{VRAM: 119 * gib})
		Expect(cfg.Options).To(Equal([]string{"parallel:2"}))
	})
})
})