mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-25 00:59:28 -04:00
The hardware-tuned defaults from #10411 were measured on a GB10 / DGX Spark (128 GiB unified memory). On multi-GPU consumer Blackwell (e.g. 2x16 GiB RTX 50-series) they over-provisioned memory and caused CUDA OOM during model init:

- The Blackwell physical batch (512 -> 2048) sets both n_batch and n_ubatch. The compute buffer scales ~n_ubatch * n_ctx and is allocated PER DEVICE (it can't be split across GPUs), so a large context turns ub2048 into multi-GiB of scratch that must fit one 16 GiB card.
- The VRAM-scaled parallel-slot default tiered off TotalAvailableVRAM(), which SUMS all GPUs (2x16 -> "32 GiB" -> 8 slots), but the allocations are per-device.

Make both decisions per-device and context-aware:

- xsysinfo.MinPerGPUVRAM() reports the smallest device's VRAM; localGPU() uses it so the parallel tier and batch guard reason about one card.
- PhysicalBatchForContext(gpu, ctx) raises the batch only when the extra compute buffer fits VRAM/4 at this model's context (16 GiB crosses over ~174k ctx, 32 GiB ~349k; GB10 reports system RAM so it still clears it).
- Apply hardware defaults AFTER runBackendHooks in SetDefaults so the GGUF-guessed context is resolved before the batch decision.
- The distributed router gates the node batch the same way.

Unified-memory devices (GB10, Apple) report system RAM as their single device's VRAM, so they keep the prefill win.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
131 lines
5.1 KiB
Go
131 lines
5.1 KiB
Go
package config_test
|
|
|
|
import (
|
|
. "github.com/mudler/LocalAI/core/config"
|
|
. "github.com/onsi/ginkgo/v2"
|
|
. "github.com/onsi/gomega"
|
|
)
|
|
|
|
var _ = Describe("Hardware-driven config defaults", func() {
|
|
const gib = uint64(1) << 30
|
|
|
|
DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
|
|
func(cc string, want bool) {
|
|
Expect(GPU{ComputeCapability: cc}.IsNVIDIABlackwell()).To(Equal(want))
|
|
},
|
|
Entry("GB10 12.1", "12.1", true),
|
|
Entry("RTX 50 12.0", "12.0", true),
|
|
Entry("future 13.0", "13.0", true),
|
|
Entry("Hopper 9.0", "9.0", false),
|
|
Entry("Ada 8.9", "8.9", false),
|
|
Entry("datacenter Blackwell sm_100 10.0", "10.0", false),
|
|
Entry("unknown", "", false),
|
|
)
|
|
|
|
Describe("PhysicalBatch / IsManagedPhysicalBatch", func() {
|
|
It("returns the Blackwell batch on Blackwell", func() {
|
|
Expect(PhysicalBatch(GPU{ComputeCapability: "12.1"})).To(Equal(BlackwellPhysicalBatch))
|
|
})
|
|
It("returns the default batch otherwise", func() {
|
|
Expect(PhysicalBatch(GPU{ComputeCapability: "9.0"})).To(Equal(DefaultPhysicalBatch))
|
|
Expect(PhysicalBatch(GPU{})).To(Equal(DefaultPhysicalBatch))
|
|
})
|
|
It("recognizes managed defaults but not explicit values", func() {
|
|
Expect(IsManagedPhysicalBatch(DefaultPhysicalBatch)).To(BeTrue())
|
|
Expect(IsManagedPhysicalBatch(BlackwellPhysicalBatch)).To(BeTrue())
|
|
Expect(IsManagedPhysicalBatch(1024)).To(BeFalse())
|
|
})
|
|
})
|
|
|
|
Describe("PhysicalBatchForContext (per-device VRAM headroom)", func() {
|
|
It("raises the batch when the compute buffer fits the device", func() {
|
|
// 16 GiB Blackwell with a small context: the extra scratch is tiny.
|
|
Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.0", VRAM: 16 * gib}, 8192)).
|
|
To(Equal(BlackwellPhysicalBatch))
|
|
})
|
|
It("keeps the default batch when a large context would overflow one device", func() {
|
|
// The issue #10485 case: 16 GiB consumer Blackwell, ~200k context.
|
|
Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.0", VRAM: 16 * gib}, 204800)).
|
|
To(Equal(DefaultPhysicalBatch))
|
|
})
|
|
It("still raises the batch on a large unified-memory device (GB10)", func() {
|
|
// GB10 reports system RAM (~119 GiB) as its single device's VRAM.
|
|
Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.1", VRAM: 119 * gib}, 204800)).
|
|
To(Equal(BlackwellPhysicalBatch))
|
|
})
|
|
It("stays conservative when VRAM is unknown", func() {
|
|
Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.1"}, 8192)).
|
|
To(Equal(DefaultPhysicalBatch))
|
|
})
|
|
It("never raises the batch on non-Blackwell", func() {
|
|
Expect(PhysicalBatchForContext(GPU{ComputeCapability: "9.0", VRAM: 80 * gib}, 8192)).
|
|
To(Equal(DefaultPhysicalBatch))
|
|
})
|
|
})
|
|
|
|
Describe("ApplyHardwareDefaults", func() {
|
|
It("raises an unset batch to 2048 on Blackwell with headroom", func() {
|
|
cfg := &ModelConfig{}
|
|
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
|
|
Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
|
|
})
|
|
It("leaves batch unset when a large context would overflow one device", func() {
|
|
// Regression guard for issue #10485: 16 GiB card + ~200k context.
|
|
ctx := 204800
|
|
cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}}
|
|
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.0", VRAM: 16 * gib})
|
|
Expect(cfg.Batch).To(Equal(0))
|
|
})
|
|
It("leaves batch unset on non-Blackwell", func() {
|
|
cfg := &ModelConfig{}
|
|
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0", VRAM: 119 * gib})
|
|
Expect(cfg.Batch).To(Equal(0))
|
|
})
|
|
It("never overrides an explicit batch", func() {
|
|
cfg := &ModelConfig{}
|
|
cfg.Batch = 1024
|
|
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
|
|
Expect(cfg.Batch).To(Equal(1024))
|
|
})
|
|
It("no-ops on nil", func() {
|
|
Expect(func() { ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"}) }).ToNot(Panic())
|
|
})
|
|
})
|
|
|
|
DescribeTable("DefaultParallelSlots (by VRAM)",
|
|
func(vramGiB uint64, want int) {
|
|
Expect(DefaultParallelSlots(GPU{VRAM: vramGiB * gib})).To(Equal(want))
|
|
},
|
|
Entry("GB10 119 GiB", uint64(119), 8),
|
|
Entry("48 GiB", uint64(48), 8),
|
|
Entry("24 GiB", uint64(24), 4),
|
|
Entry("8 GiB", uint64(8), 4),
|
|
Entry("6 GiB", uint64(6), 2),
|
|
Entry("2 GiB", uint64(2), 1),
|
|
Entry("unknown 0", uint64(0), 1),
|
|
)
|
|
|
|
Describe("ApplyHardwareDefaults parallel slots", func() {
|
|
It("adds a VRAM-scaled parallel option on a capable GPU", func() {
|
|
cfg := &ModelConfig{}
|
|
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
|
|
Expect(cfg.Options).To(ContainElement("parallel:8"))
|
|
})
|
|
It("scales the slot count down with VRAM", func() {
|
|
cfg := &ModelConfig{}
|
|
ApplyHardwareDefaults(cfg, GPU{VRAM: 24 * gib})
|
|
Expect(cfg.Options).To(ContainElement("parallel:4"))
|
|
})
|
|
It("adds no parallel option on small/unknown VRAM", func() {
|
|
cfg := &ModelConfig{}
|
|
ApplyHardwareDefaults(cfg, GPU{VRAM: 2 * gib})
|
|
Expect(cfg.Options).ToNot(ContainElement(ContainSubstring("parallel")))
|
|
})
|
|
It("never overrides an explicit parallel option", func() {
|
|
cfg := &ModelConfig{Options: []string{"parallel:2"}}
|
|
ApplyHardwareDefaults(cfg, GPU{VRAM: 119 * gib})
|
|
Expect(cfg.Options).To(Equal([]string{"parallel:2"}))
|
|
})
|
|
})
|
|
})
|