mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-22 15:49:12 -04:00
* feat(config): node-aware hardware defaults — larger physical batch on Blackwell A larger physical batch (n_batch/n_ubatch) materially lifts MoE prefill on NVIDIA Blackwell consumer GPUs (sm_120/121, incl. GB10 / DGX Spark) — measured on a GB10 with Qwen3-Coder-30B-A3B, the prefill ceiling rises (ub512 ~2994 -> ub2048 ~3316 t/s) and saturates around 2048. The heuristic lives in core/config alongside the other config overriders (ApplyInferenceDefaults, guessDefaultsFromFile/NGPULayers) — they all fill the ModelConfig from heuristics, so hardware tuning is the same domain and stays in one place. It is parameterized on a GPU descriptor (not direct detection) so it works in both deployment shapes: - Single host: SetDefaults applies it with the LocalGPU. - Distributed: only the worker sees the GPU, so the worker reports its compute capability on registration (gpu_compute_capability -> BackendNode), and the router re-applies the SAME core/config heuristic for the SELECTED node before loading — fixing the case where the frontend has no GPU at all. Explicit `batch:` always wins (only managed default values are touched). xsysinfo gains NVIDIAComputeCapability() (detection only); all interpretation lives in core/config. Tests: core/config, pkg/xsysinfo, core/services/nodes. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * test(config): injectable local-GPU seam + single-instance coverage Make local GPU detection an injectable package var (localGPU) so the single-instance path (SetDefaults -> ApplyHardwareDefaults) is deterministically testable without a real GPU, mirroring the distributed override's coverage. Adds specs asserting SetDefaults sets the Blackwell physical batch, leaves it unset on non-Blackwell, and never overrides an explicit batch. 
Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(config): default concurrent serving (n_parallel) by GPU VRAM The llama.cpp backend defaults n_parallel=1, which serializes multi-user requests and leaves continuous batching off (it auto-enables only at n_parallel>1). Fold a VRAM-scaled parallel-slot default into the hardware-config path so multi-user serving works out of the box: >=32GiB->8, >=8GiB->4, >=4GiB->2, else unchanged. With the backend's unified KV the slots SHARE the context budget, so this adds concurrency without multiplying KV memory. Explicit parallel/n_parallel always wins. EnsureParallelOption is shared by the single-host path (ApplyHardwareDefaults with the local GPU) and the distributed router (per selected node's reported VRAM, since the frontend may have no GPU). LocalGPU now also reports VRAM. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
98 lines
3.4 KiB
Go
98 lines
3.4 KiB
Go
package config_test
|
|
|
|
import (
|
|
. "github.com/mudler/LocalAI/core/config"
|
|
. "github.com/onsi/ginkgo/v2"
|
|
. "github.com/onsi/gomega"
|
|
)
|
|
|
|
var _ = Describe("Hardware-driven config defaults", func() {
|
|
// Table-driven check of GPU.IsNVIDIABlackwell: every entry supplies a
// compute-capability string and the verdict the method must return for it.
DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
	func(capability string, expected bool) {
		gpu := GPU{ComputeCapability: capability}
		Expect(gpu.IsNVIDIABlackwell()).To(Equal(expected))
	},
	// Capabilities that must be reported as Blackwell.
	Entry("GB10 12.1", "12.1", true),
	Entry("RTX 50 12.0", "12.0", true),
	Entry("future 13.0", "13.0", true),
	// Capabilities that must be rejected, including the empty string.
	Entry("Hopper 9.0", "9.0", false),
	Entry("Ada 8.9", "8.9", false),
	Entry("datacenter Blackwell sm_100 10.0", "10.0", false),
	Entry("unknown", "", false),
)
|
|
|
|
// Specs for PhysicalBatch (the batch picked for a given GPU) and
// IsManagedPhysicalBatch (whether a batch value is one of the managed defaults).
Describe("PhysicalBatch / IsManagedPhysicalBatch", func() {
	It("returns the Blackwell batch on Blackwell", func() {
		blackwell := GPU{ComputeCapability: "12.1"}
		Expect(PhysicalBatch(blackwell)).To(Equal(BlackwellPhysicalBatch))
	})

	It("returns the default batch otherwise", func() {
		// A 9.0 capability and a zero-value GPU must both yield the default.
		for _, gpu := range []GPU{{ComputeCapability: "9.0"}, {}} {
			Expect(PhysicalBatch(gpu)).To(Equal(DefaultPhysicalBatch))
		}
	})

	It("recognizes managed defaults but not explicit values", func() {
		defaultIsManaged := IsManagedPhysicalBatch(DefaultPhysicalBatch)
		Expect(defaultIsManaged).To(BeTrue())

		blackwellIsManaged := IsManagedPhysicalBatch(BlackwellPhysicalBatch)
		Expect(blackwellIsManaged).To(BeTrue())

		// 1024 is neither of the two managed constants checked above.
		explicitIsManaged := IsManagedPhysicalBatch(1024)
		Expect(explicitIsManaged).To(BeFalse())
	})
})
|
|
|
|
Describe("ApplyHardwareDefaults", func() {
|
|
It("raises an unset batch to 2048 on Blackwell", func() {
|
|
cfg := &ModelConfig{}
|
|
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
|
|
Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
|
|
})
|
|
It("leaves batch unset on non-Blackwell", func() {
|
|
cfg := &ModelConfig{}
|
|
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0"})
|
|
Expect(cfg.Batch).To(Equal(0))
|
|
})
|
|
It("never overrides an explicit batch", func() {
|
|
cfg := &ModelConfig{}
|
|
cfg.Batch = 1024
|
|
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
|
|
Expect(cfg.Batch).To(Equal(1024))
|
|
})
|
|
It("no-ops on nil", func() {
|
|
Expect(func() { ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"}) }).ToNot(Panic())
|
|
})
|
|
})
|
|
|
|
// gib is 2^30, i.e. one GiB when counting bytes. The specs in this file
// multiply it by a GiB count to fill GPU.VRAM.
// NOTE(review): presumably GPU.VRAM is a byte count — confirm against the GPU type.
const gib = uint64(1) << 30
|
|
|
|
// Table-driven check of DefaultParallelSlots: every entry supplies a VRAM size
// in GiB (scaled by gib before the call) and the slot count expected for it.
DescribeTable("DefaultParallelSlots (by VRAM)",
	func(sizeGiB uint64, expectedSlots int) {
		gpu := GPU{VRAM: sizeGiB * gib}
		Expect(DefaultParallelSlots(gpu)).To(Equal(expectedSlots))
	},
	// Entries are listed from the largest VRAM size down to zero.
	Entry("GB10 119 GiB", uint64(119), 8),
	Entry("48 GiB", uint64(48), 8),
	Entry("24 GiB", uint64(24), 4),
	Entry("8 GiB", uint64(8), 4),
	Entry("6 GiB", uint64(6), 2),
	Entry("2 GiB", uint64(2), 1),
	Entry("unknown 0", uint64(0), 1),
)
|
|
|
|
Describe("ApplyHardwareDefaults parallel slots", func() {
|
|
It("adds a VRAM-scaled parallel option on a capable GPU", func() {
|
|
cfg := &ModelConfig{}
|
|
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
|
|
Expect(cfg.Options).To(ContainElement("parallel:8"))
|
|
})
|
|
It("scales the slot count down with VRAM", func() {
|
|
cfg := &ModelConfig{}
|
|
ApplyHardwareDefaults(cfg, GPU{VRAM: 24 * gib})
|
|
Expect(cfg.Options).To(ContainElement("parallel:4"))
|
|
})
|
|
It("adds no parallel option on small/unknown VRAM", func() {
|
|
cfg := &ModelConfig{}
|
|
ApplyHardwareDefaults(cfg, GPU{VRAM: 2 * gib})
|
|
Expect(cfg.Options).ToNot(ContainElement(ContainSubstring("parallel")))
|
|
})
|
|
It("never overrides an explicit parallel option", func() {
|
|
cfg := &ModelConfig{Options: []string{"parallel:2"}}
|
|
ApplyHardwareDefaults(cfg, GPU{VRAM: 119 * gib})
|
|
Expect(cfg.Options).To(Equal([]string{"parallel:2"}))
|
|
})
|
|
})
|
|
})
|