Files
LocalAI/core/config/hardware_defaults_test.go
LocalAI [bot] 79783120dd fix(config): gate parallel-slot default on per-device VRAM too (#10485) (#10507)
The first #10485 fix (#10494) made the Blackwell physical-batch boost
per-device/context-aware, which neutralized the big compute-buffer OOM, but
the reporter's 2x16 GiB consumer Blackwell still OOM'd. Tracing the post-fix
log: the model now loads its weights, builds the main context and warms up
fine, and dies only on the *last* allocation — the MTP draft context's 800 MiB
KV cache on the tighter device.

#10411 changed only two defaults: the physical batch (now gated) and a
VRAM-scaled parallel-slot count. The KV cache is unified (n_ctx_seq == full
context proves slots share the budget, so parallel doesn't multiply KV), but
n_seq_max=4 still adds per-slot compute-graph / context-checkpoint / output
scratch. On a device packed ~99% by a 27B model spanning both cards, that
overhead is the few-hundred-MiB straw — which is why reverting #10411 (and only
#10411) restores a working load.

Gate the parallel-slot default on the same per-device headroom predicate as the
batch boost: when a large context already fills a single card
(largeContextForDevice), keep n_parallel=1. A user running one big-context model
that barely fits across two consumer GPUs is not serving four concurrent
tenants. Small contexts and large unified-memory devices (GB10) keep full
concurrency. Applied on both the single-host path and the distributed router.

Also make the auto-tuning visible and reversible (the debugging here needed
DEBUG logs and a git bisect):

  - Log the effective performance-relevant runtime options at INFO once per
    model load ("effective runtime tuning …": context, n_batch, n_gpu_layers,
    parallel, flash_attention, f16) so an admin can see what will run and pin or
    override any value in the model YAML.
  - LOCALAI_DISABLE_HARDWARE_DEFAULTS=true skips the hardware auto-tuning
    entirely (mirrors LOCALAI_DISABLE_GUESSING) for stock llama.cpp behavior.


Assisted-by: Claude:opus-4.8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-25 15:48:23 +02:00

174 lines
7.3 KiB
Go

package config_test
import (
. "github.com/mudler/LocalAI/core/config"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Specs for the hardware-aware model defaults exposed by the config package:
// Blackwell detection from the compute capability string, the physical-batch
// selection (with and without a context-size headroom check), the VRAM-scaled
// parallel-slot count, and how ApplyHardwareDefaults writes those choices
// into a ModelConfig without touching values the user set explicitly.
var _ = Describe("Hardware-driven config defaults", func() {
	const (
		// gib is one gibibyte; the VRAM fixtures below are written as N * gib.
		gib = uint64(1) << 30

		// Context sizes reused by the headroom specs. They stay untyped so
		// they behave exactly like the literals they stand in for.
		smallContext = 8192
		largeContext = 204800 // the ~200k context reported in issue #10485
	)

	// Only the sm_12x family (and newer majors) is expected to match;
	// datacenter Blackwell (10.0) and an empty capability must not.
	DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
		func(capability string, expected bool) {
			gpu := GPU{ComputeCapability: capability}
			Expect(gpu.IsNVIDIABlackwell()).To(Equal(expected))
		},
		Entry("GB10 12.1", "12.1", true),
		Entry("RTX 50 12.0", "12.0", true),
		Entry("future 13.0", "13.0", true),
		Entry("Hopper 9.0", "9.0", false),
		Entry("Ada 8.9", "8.9", false),
		Entry("datacenter Blackwell sm_100 10.0", "10.0", false),
		Entry("unknown", "", false),
	)

	Describe("PhysicalBatch / IsManagedPhysicalBatch", func() {
		It("returns the Blackwell batch on Blackwell", func() {
			blackwell := GPU{ComputeCapability: "12.1"}
			Expect(PhysicalBatch(blackwell)).To(Equal(BlackwellPhysicalBatch))
		})

		It("returns the default batch otherwise", func() {
			// A non-Blackwell capability first, then a zero-value GPU.
			hopper := GPU{ComputeCapability: "9.0"}
			Expect(PhysicalBatch(hopper)).To(Equal(DefaultPhysicalBatch))

			unknown := GPU{}
			Expect(PhysicalBatch(unknown)).To(Equal(DefaultPhysicalBatch))
		})

		It("recognizes managed defaults but not explicit values", func() {
			// Both batch sizes the package hands out count as "managed";
			// an arbitrary user-chosen value does not.
			Expect(IsManagedPhysicalBatch(DefaultPhysicalBatch)).To(BeTrue())
			Expect(IsManagedPhysicalBatch(BlackwellPhysicalBatch)).To(BeTrue())
			Expect(IsManagedPhysicalBatch(1024)).To(BeFalse())
		})
	})

	Describe("PhysicalBatchForContext (per-device VRAM headroom)", func() {
		It("raises the batch when the compute buffer fits the device", func() {
			// Consumer Blackwell with 16 GiB and a small context.
			card := GPU{ComputeCapability: "12.0", VRAM: 16 * gib}
			got := PhysicalBatchForContext(card, smallContext)
			Expect(got).To(Equal(BlackwellPhysicalBatch))
		})

		It("keeps the default batch when a large context would overflow one device", func() {
			// Issue #10485 scenario: same 16 GiB card, ~200k context.
			card := GPU{ComputeCapability: "12.0", VRAM: 16 * gib}
			got := PhysicalBatchForContext(card, largeContext)
			Expect(got).To(Equal(DefaultPhysicalBatch))
		})

		It("still raises the batch on a large unified-memory device (GB10)", func() {
			// GB10 exposes ~119 GiB of system RAM as the VRAM of its one device.
			gb10 := GPU{ComputeCapability: "12.1", VRAM: 119 * gib}
			got := PhysicalBatchForContext(gb10, largeContext)
			Expect(got).To(Equal(BlackwellPhysicalBatch))
		})

		It("stays conservative when VRAM is unknown", func() {
			// VRAM left at its zero value.
			noVRAM := GPU{ComputeCapability: "12.1"}
			got := PhysicalBatchForContext(noVRAM, smallContext)
			Expect(got).To(Equal(DefaultPhysicalBatch))
		})

		It("never raises the batch on non-Blackwell", func() {
			hopper := GPU{ComputeCapability: "9.0", VRAM: 80 * gib}
			got := PhysicalBatchForContext(hopper, smallContext)
			Expect(got).To(Equal(DefaultPhysicalBatch))
		})
	})

	Describe("ApplyHardwareDefaults", func() {
		It("raises an unset batch to 2048 on Blackwell with headroom", func() {
			mc := &ModelConfig{}
			gb10 := GPU{ComputeCapability: "12.1", VRAM: 119 * gib}
			ApplyHardwareDefaults(mc, gb10)
			Expect(mc.Batch).To(Equal(BlackwellPhysicalBatch))
		})

		It("leaves batch unset when a large context would overflow one device", func() {
			// Guards against a regression of issue #10485 (16 GiB card,
			// ~200k context).
			ctxSize := largeContext
			mc := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctxSize}}
			card := GPU{ComputeCapability: "12.0", VRAM: 16 * gib}
			ApplyHardwareDefaults(mc, card)
			Expect(mc.Batch).To(Equal(0))
		})

		It("leaves batch unset on non-Blackwell", func() {
			mc := &ModelConfig{}
			hopper := GPU{ComputeCapability: "9.0", VRAM: 119 * gib}
			ApplyHardwareDefaults(mc, hopper)
			Expect(mc.Batch).To(Equal(0))
		})

		It("never overrides an explicit batch", func() {
			mc := &ModelConfig{}
			mc.Batch = 1024
			gb10 := GPU{ComputeCapability: "12.1", VRAM: 119 * gib}
			ApplyHardwareDefaults(mc, gb10)
			Expect(mc.Batch).To(Equal(1024))
		})

		It("no-ops on nil", func() {
			applyToNil := func() {
				ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"})
			}
			Expect(applyToNil).ToNot(Panic())
		})

		It("applies nothing when hardware defaults are disabled via env", func() {
			// The variable is set through GinkgoT() rather than os.Setenv.
			GinkgoT().Setenv("LOCALAI_DISABLE_HARDWARE_DEFAULTS", "true")
			Expect(HardwareDefaultsDisabled()).To(BeTrue())

			mc := &ModelConfig{}
			gb10 := GPU{ComputeCapability: "12.1", VRAM: 119 * gib}
			ApplyHardwareDefaults(mc, gb10)

			// Neither the batch nor the option list may have been touched.
			Expect(mc.Batch).To(Equal(0))
			Expect(mc.Options).To(BeEmpty())
		})
	})

	// Expected slot count for a given amount of VRAM, expressed in GiB.
	DescribeTable("DefaultParallelSlots (by VRAM)",
		func(sizeGiB uint64, expected int) {
			gpu := GPU{VRAM: sizeGiB * gib}
			Expect(DefaultParallelSlots(gpu)).To(Equal(expected))
		},
		Entry("GB10 119 GiB", uint64(119), 8),
		Entry("48 GiB", uint64(48), 8),
		Entry("24 GiB", uint64(24), 4),
		Entry("8 GiB", uint64(8), 4),
		Entry("6 GiB", uint64(6), 2),
		Entry("2 GiB", uint64(2), 1),
		Entry("unknown 0", uint64(0), 1),
	)

	Describe("ParallelSlotsForContext (per-device VRAM headroom)", func() {
		It("keeps the VRAM-scaled slot count when the context fits the device", func() {
			// Small context on a 16 GiB card: concurrency is kept.
			card := GPU{VRAM: 16 * gib}
			Expect(ParallelSlotsForContext(card, smallContext)).To(Equal(4))
		})

		It("drops to a single slot when a large context already fills the device", func() {
			// Guards against a regression of issue #10485: on a 16 GiB card
			// with a ~200k context, the extra per-slot scratch of four slots
			// was enough to overflow the tighter device, unified KV or not.
			card := GPU{VRAM: 16 * gib}
			Expect(ParallelSlotsForContext(card, largeContext)).To(Equal(1))
		})

		It("keeps concurrency on a large unified-memory device (GB10)", func() {
			// With ~119 GiB reported, a 200k context still leaves headroom.
			gb10 := GPU{VRAM: 119 * gib}
			Expect(ParallelSlotsForContext(gb10, largeContext)).To(Equal(8))
		})

		It("keeps concurrency on a big datacenter card with a large context", func() {
			// 80 GiB card: a 200k context is a small share of the device.
			datacenter := GPU{VRAM: 80 * gib}
			Expect(ParallelSlotsForContext(datacenter, largeContext)).To(Equal(8))
		})

		It("stays a single slot on small/unknown VRAM regardless of context", func() {
			tiny := GPU{VRAM: 2 * gib}
			Expect(ParallelSlotsForContext(tiny, smallContext)).To(Equal(1))

			unknown := GPU{}
			Expect(ParallelSlotsForContext(unknown, smallContext)).To(Equal(1))
		})
	})

	Describe("ApplyHardwareDefaults parallel slots", func() {
		It("adds a VRAM-scaled parallel option on a capable GPU", func() {
			mc := &ModelConfig{}
			gb10 := GPU{ComputeCapability: "12.1", VRAM: 119 * gib}
			ApplyHardwareDefaults(mc, gb10)
			Expect(mc.Options).To(ContainElement("parallel:8"))
		})

		It("adds no parallel option when a large context already fills one device", func() {
			// Guards against a regression of issue #10485: with a 16 GiB card
			// and a ~200k context the model barely fits, and a defaulted
			// slot count pushed the tighter GPU into CUDA OOM on the last
			// (MTP draft) KV allocation.
			ctxSize := largeContext
			mc := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctxSize}}
			card := GPU{ComputeCapability: "12.0", VRAM: 16 * gib}
			ApplyHardwareDefaults(mc, card)
			Expect(mc.Options).ToNot(ContainElement(ContainSubstring("parallel")))
		})

		It("scales the slot count down with VRAM", func() {
			mc := &ModelConfig{}
			midRange := GPU{VRAM: 24 * gib}
			ApplyHardwareDefaults(mc, midRange)
			Expect(mc.Options).To(ContainElement("parallel:4"))
		})

		It("adds no parallel option on small/unknown VRAM", func() {
			mc := &ModelConfig{}
			tiny := GPU{VRAM: 2 * gib}
			ApplyHardwareDefaults(mc, tiny)
			Expect(mc.Options).ToNot(ContainElement(ContainSubstring("parallel")))
		})

		It("never overrides an explicit parallel option", func() {
			// The user-supplied option must remain the only entry.
			mc := &ModelConfig{Options: []string{"parallel:2"}}
			roomy := GPU{VRAM: 119 * gib}
			ApplyHardwareDefaults(mc, roomy)
			Expect(mc.Options).To(Equal([]string{"parallel:2"}))
		})
	})
})