From 87d5734c332d3879207145b2073838ce3df3d835 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 25 Jun 2026 12:56:01 +0000 Subject: [PATCH] fix(config): gate parallel-slot default on per-device VRAM too (#10485) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first #10485 fix (#10494) made the Blackwell physical-batch boost per-device/context-aware, which neutralized the big compute-buffer OOM, but the reporter's 2x16 GiB consumer Blackwell still OOM'd. Tracing the post-fix log: the model now loads its weights, builds the main context and warms up fine, and dies only on the *last* allocation — the MTP draft context's 800 MiB KV cache on the tighter device. #10411 changed only two defaults: the physical batch (now gated) and a VRAM-scaled parallel-slot count. The KV cache is unified (n_ctx_seq == full context proves slots share the budget, so parallel doesn't multiply KV), but n_seq_max=4 still adds per-slot compute-graph / context-checkpoint / output scratch. On a device packed ~99% by a 27B model spanning both cards, that overhead is the few-hundred-MiB straw — which is why reverting #10411 (and only #10411) restores a working load. Gate the parallel-slot default on the same per-device headroom predicate as the batch boost: when a large context already fills a single card (largeContextForDevice), keep n_parallel=1. A user running one big-context model that barely fits across two consumer GPUs is not serving four concurrent tenants. Small contexts and large unified-memory devices (GB10) keep full concurrency. Applied on both the single-host path and the distributed router. 
Also make the auto-tuning visible and reversible (the debugging here needed DEBUG logs and a git bisect): - Log the effective performance-relevant runtime options at INFO once per model load ("effective runtime tuning …": context, n_batch, n_gpu_layers, parallel, flash_attention, f16) so an admin can see what will run and pin or override any value in the model YAML. - LOCALAI_DISABLE_HARDWARE_DEFAULTS=true skips the hardware auto-tuning entirely (mirrors LOCALAI_DISABLE_GUESSING) for stock llama.cpp behavior. Signed-off-by: Ettore Di Giacinto Assisted-by: Claude:opus-4.8 [Claude Code] --- core/config/hardware_defaults.go | 111 ++++++++++++++---- core/config/hardware_defaults_test.go | 43 +++++++ core/services/nodes/router.go | 9 +- .../nodes/router_hardware_internal_test.go | 8 ++ docs/content/features/text-generation.md | 10 ++ pkg/model/initializers.go | 30 +++++ pkg/model/initializers_internal_test.go | 19 +++ 7 files changed, 205 insertions(+), 25 deletions(-) create mode 100644 pkg/model/initializers_internal_test.go diff --git a/core/config/hardware_defaults.go b/core/config/hardware_defaults.go index b4e0e74c6..81bc9fc7f 100644 --- a/core/config/hardware_defaults.go +++ b/core/config/hardware_defaults.go @@ -2,6 +2,7 @@ package config import ( "fmt" + "os" "strconv" "strings" @@ -9,6 +10,19 @@ import ( "github.com/mudler/xlog" ) +// HardwareDefaultsDisabled reports whether hardware auto-tuning is turned off via +// LOCALAI_DISABLE_HARDWARE_DEFAULTS=true (mirrors LOCALAI_DISABLE_GUESSING). When +// set, ApplyHardwareDefaults and the distributed router's node tuning are +// skipped entirely, so the backend runs llama.cpp's stock batch/parallel +// behavior — an escape hatch for users who want predictable, un-tuned defaults. 
+func HardwareDefaultsDisabled() bool { + // Read directly like the sibling LOCALAI_DISABLE_GUESSING toggle in + // hooks_llamacpp.go: these config-layer heuristic switches run deep in the + // defaults pipeline with no ApplicationConfig in scope to plumb through. + //nolint:forbidigo // config-layer heuristic toggle, mirrors LOCALAI_DISABLE_GUESSING + return os.Getenv("LOCALAI_DISABLE_HARDWARE_DEFAULTS") == "true" +} + // Hardware-driven model-config defaults. // // This sits alongside the other config overriders (ApplyInferenceDefaults for @@ -103,17 +117,36 @@ func PhysicalBatchForContext(g GPU, ctx int) int { if !g.IsNVIDIABlackwell() { return DefaultPhysicalBatch } - if ctx <= 0 { - ctx = DefaultContextSize - } if g.VRAM == 0 { return DefaultPhysicalBatch } - extra := uint64(ctx) * uint64(BlackwellPhysicalBatch-DefaultPhysicalBatch) * computeBufferBytesPerCell - if extra <= g.VRAM/blackwellBatchHeadroomDivisor { - return BlackwellPhysicalBatch + if largeContextForDevice(g, ctx) { + return DefaultPhysicalBatch } - return DefaultPhysicalBatch + return BlackwellPhysicalBatch +} + +// largeContextForDevice reports whether the given context is large relative to +// the per-device VRAM ceiling — the shared "tight single-model fit" signal that +// suppresses BOTH throughput-oriented defaults (the Blackwell batch boost and +// the concurrency slot count). It sizes the extra compute-buffer scratch a +// raised batch would need at this context (which grows ~n_ubatch * n_ctx and +// is allocated per device) and asks whether it overflows a fraction of the +// device VRAM; when it does, the device has no headroom to spend on throughput +// and the conservative defaults must hold (issue #10485). +// +// g.VRAM must be the PER-DEVICE ceiling (the smallest device on a multi-GPU +// host). VRAM 0 (unknown) is treated as not-large so detection gaps don't +// silently disable the defaults. 
+func largeContextForDevice(g GPU, ctx int) bool { + if g.VRAM == 0 { + return false + } + if ctx <= 0 { + ctx = DefaultContextSize + } + extra := uint64(ctx) * uint64(BlackwellPhysicalBatch-DefaultPhysicalBatch) * computeBufferBytesPerCell + return extra > g.VRAM/blackwellBatchHeadroomDivisor } // IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns. @@ -152,17 +185,50 @@ func DefaultParallelSlots(g GPU) int { } } -// EnsureParallelOption appends a VRAM-scaled "parallel:N" backend option when the -// model doesn't already set one (and the GPU warrants concurrency). Returns the -// possibly-extended options. Shared by the single-host config path -// (ApplyHardwareDefaults) and the distributed router (per selected node). -func EnsureParallelOption(opts []string, gpu GPU) []string { - if slots := DefaultParallelSlots(gpu); slots > 1 && !hasParallelOption(opts) { +// ParallelSlotsForContext is DefaultParallelSlots gated on per-device VRAM +// headroom for the given context. A large context already claims most of a +// single device's VRAM (the KV cache plus the per-slot compute/checkpoint +// scratch that scales with n_seq_max), so defaulting multiple slots there +// pushes a tight single-model fit into per-device CUDA OOM (issue #10485): the +// model loads but the final allocation (e.g. an MTP draft context's KV cache) +// overflows the tighter card by a few hundred MiB. Returns 1 (no concurrency) +// in that tight regime, otherwise the VRAM-scaled DefaultParallelSlots. +// +// g.VRAM must be the PER-DEVICE ceiling (smallest device on a multi-GPU host). +// It shares largeContextForDevice with the batch boost so both throughput +// defaults are suppressed together; the GB10 / unified-memory path reports +// system RAM and so keeps full concurrency even at large contexts. 
+func ParallelSlotsForContext(g GPU, ctx int) int { + slots := DefaultParallelSlots(g) + if slots <= 1 || g.VRAM == 0 { + return slots + } + if largeContextForDevice(g, ctx) { + return 1 + } + return slots +} + +// EnsureParallelOptionForContext appends a VRAM-scaled "parallel:N" backend +// option when the model doesn't already set one and the GPU warrants (and has +// headroom for) concurrency at this context. Returns the possibly-extended +// options. Shared by the single-host config path (ApplyHardwareDefaults) and +// the distributed router (per selected node). +func EnsureParallelOptionForContext(opts []string, gpu GPU, ctx int) []string { + if slots := ParallelSlotsForContext(gpu, ctx); slots > 1 && !hasParallelOption(opts) { return append(opts, fmt.Sprintf("parallel:%d", slots)) } return opts } +// EnsureParallelOption is EnsureParallelOptionForContext with no known context +// (defaults to DefaultContextSize, which clears the headroom gate on any device +// large enough to warrant concurrency). Kept for callers without a model +// context. +func EnsureParallelOption(opts []string, gpu GPU) []string { + return EnsureParallelOptionForContext(opts, gpu, 0) +} + // hasParallelOption reports whether the model already sets parallel/n_parallel // so we never override an explicit value (helper shared with serving_defaults.go). func hasParallelOption(opts []string) bool { @@ -192,18 +258,18 @@ var localGPU = func() GPU { // and were left unset by the user. Currently: a larger physical batch on // Blackwell. Explicit config always wins (we only touch zero values). func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) { - if cfg == nil { + if cfg == nil || HardwareDefaultsDisabled() { return } // Raise the physical batch on Blackwell only when the resulting compute // buffer fits the per-device VRAM at THIS model's context. 
Leaving Batch at 0 // (rather than writing the default 512) preserves the downstream single-pass // sizing in core/backend.EffectiveBatchSize for embedding/score/rerank. + ctx := DefaultContextSize + if cfg.ContextSize != nil { + ctx = *cfg.ContextSize + } if cfg.Batch == 0 { - ctx := DefaultContextSize - if cfg.ContextSize != nil { - ctx = *cfg.ContextSize - } if PhysicalBatchForContext(gpu, ctx) == BlackwellPhysicalBatch { cfg.Batch = BlackwellPhysicalBatch xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch", @@ -214,13 +280,14 @@ func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) { // Enable concurrent serving by default on a capable GPU: without this the // llama.cpp backend runs n_parallel=1 and serializes multi-user requests // (continuous batching stays off). Unified KV means the slots share the - // context budget, so this is concurrency without extra KV memory. Explicit - // parallel/n_parallel in the model options always wins. + // context budget, but a context large enough to fill a single device leaves + // no room for the per-slot scratch, so the slot count is gated on per-device + // headroom too (issue #10485). Explicit parallel/n_parallel always wins. 
if before := len(cfg.Options); true { - cfg.Options = EnsureParallelOption(cfg.Options, gpu) + cfg.Options = EnsureParallelOptionForContext(cfg.Options, gpu, ctx) if len(cfg.Options) > before { xlog.Debug("[hardware_defaults] defaulting parallel slots for concurrent serving", - "option", cfg.Options[len(cfg.Options)-1], "vram_gib", gpu.VRAM>>30) + "option", cfg.Options[len(cfg.Options)-1], "context", ctx, "vram_gib", gpu.VRAM>>30) } } } diff --git a/core/config/hardware_defaults_test.go b/core/config/hardware_defaults_test.go index 3bc1bf297..452a5a884 100644 --- a/core/config/hardware_defaults_test.go +++ b/core/config/hardware_defaults_test.go @@ -90,6 +90,15 @@ var _ = Describe("Hardware-driven config defaults", func() { It("no-ops on nil", func() { Expect(func() { ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"}) }).ToNot(Panic()) }) + + It("applies nothing when hardware defaults are disabled via env", func() { + GinkgoT().Setenv("LOCALAI_DISABLE_HARDWARE_DEFAULTS", "true") + Expect(HardwareDefaultsDisabled()).To(BeTrue()) + cfg := &ModelConfig{} + ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib}) + Expect(cfg.Batch).To(Equal(0)) + Expect(cfg.Options).To(BeEmpty()) + }) }) DescribeTable("DefaultParallelSlots (by VRAM)", @@ -105,12 +114,46 @@ var _ = Describe("Hardware-driven config defaults", func() { Entry("unknown 0", uint64(0), 1), ) + Describe("ParallelSlotsForContext (per-device VRAM headroom)", func() { + It("keeps the VRAM-scaled slot count when the context fits the device", func() { + // 16 GiB card, small context: plenty of room for concurrency. + Expect(ParallelSlotsForContext(GPU{VRAM: 16 * gib}, 8192)).To(Equal(4)) + }) + It("drops to a single slot when a large context already fills the device", func() { + // Regression guard for issue #10485: 16 GiB consumer Blackwell, ~200k + // context. Even with unified KV, the per-slot compute/checkpoint + // scratch from 4 slots is the straw that overflows the tighter device. 
+ Expect(ParallelSlotsForContext(GPU{VRAM: 16 * gib}, 204800)).To(Equal(1)) + }) + It("keeps concurrency on a large unified-memory device (GB10)", func() { + // GB10 reports system RAM (~119 GiB): a 200k context leaves headroom. + Expect(ParallelSlotsForContext(GPU{VRAM: 119 * gib}, 204800)).To(Equal(8)) + }) + It("keeps concurrency on a big datacenter card with a large context", func() { + // 80 GiB A100: 200k context is a small fraction, concurrency stays. + Expect(ParallelSlotsForContext(GPU{VRAM: 80 * gib}, 204800)).To(Equal(8)) + }) + It("stays a single slot on small/unknown VRAM regardless of context", func() { + Expect(ParallelSlotsForContext(GPU{VRAM: 2 * gib}, 8192)).To(Equal(1)) + Expect(ParallelSlotsForContext(GPU{}, 8192)).To(Equal(1)) + }) + }) + Describe("ApplyHardwareDefaults parallel slots", func() { It("adds a VRAM-scaled parallel option on a capable GPU", func() { cfg := &ModelConfig{} ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib}) Expect(cfg.Options).To(ContainElement("parallel:8")) }) + It("adds no parallel option when a large context already fills one device", func() { + // Regression guard for issue #10485: 16 GiB card + ~200k context. The + // model barely fits; defaulting concurrency tips the tighter GPU into + // CUDA OOM during the final (MTP draft) KV allocation. 
+ ctx := 204800 + cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}} + ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.0", VRAM: 16 * gib}) + Expect(cfg.Options).ToNot(ContainElement(ContainSubstring("parallel"))) + }) It("scales the slot count down with VRAM", func() { cfg := &ModelConfig{} ApplyHardwareDefaults(cfg, GPU{VRAM: 24 * gib}) diff --git a/core/services/nodes/router.go b/core/services/nodes/router.go index 6ad550cf1..ce3de3290 100644 --- a/core/services/nodes/router.go +++ b/core/services/nodes/router.go @@ -147,7 +147,7 @@ type scheduleLoadResult struct { // Only values the heuristics themselves manage are touched, so an explicit user // batch (e.g. 1024) is never overridden. func applyNodeHardwareDefaults(opts *pb.ModelOptions, node *BackendNode) { - if opts == nil || node == nil { + if opts == nil || node == nil || config.HardwareDefaultsDisabled() { return } gpu := config.GPU{ @@ -162,8 +162,11 @@ func applyNodeHardwareDefaults(opts *pb.ModelOptions, node *BackendNode) { opts.NBatch = int32(config.PhysicalBatchForContext(gpu, int(opts.ContextSize))) } // Default concurrent serving for the selected node (the frontend that built - // the options may have no GPU). Only adds when no parallel option is set. - opts.Options = config.EnsureParallelOption(opts.Options, gpu) + // the options may have no GPU). Gated on the node's per-device VRAM at this + // model's context, so a large context that already fills the device can't + // tip it into OOM by adding slot scratch (issue #10485). Only adds when no + // parallel option is set. + opts.Options = config.EnsureParallelOptionForContext(opts.Options, gpu, int(opts.ContextSize)) } // scheduleAndLoad is the shared core for loading a model on a new node. 
diff --git a/core/services/nodes/router_hardware_internal_test.go b/core/services/nodes/router_hardware_internal_test.go index d8576c4e4..084222fee 100644 --- a/core/services/nodes/router_hardware_internal_test.go +++ b/core/services/nodes/router_hardware_internal_test.go @@ -41,6 +41,14 @@ var _ = Describe("applyNodeHardwareDefaults", func() { Expect(opts.Options).To(ContainElement("parallel:8")) }) + It("adds no parallel option when a large context already fills the node device", func() { + // Regression guard for issue #10485: a 16 GiB node with a ~200k context + // is a tight single-model fit — the slot scratch would tip it into OOM. + opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, ContextSize: 204800} + applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.0", TotalVRAM: 16 << 30}) + Expect(opts.Options).ToNot(ContainElement(ContainSubstring("parallel"))) + }) + It("never overrides an explicit parallel option on the node path", func() { opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, Options: []string{"parallel:2"}} applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1", TotalVRAM: 119 << 30}) diff --git a/docs/content/features/text-generation.md b/docs/content/features/text-generation.md index c09717a3f..cadc67808 100644 --- a/docs/content/features/text-generation.md +++ b/docs/content/features/text-generation.md @@ -537,6 +537,16 @@ options: **Note:** The `parallel` option can also be set via the `LLAMACPP_PARALLEL` environment variable, and `grpc_servers` can be set via the `LLAMACPP_GRPC_SERVERS` environment variable. Options specified in the YAML file take precedence over environment variables. +##### Hardware auto-tuning (and how to override it) + +On a detected GPU, LocalAI fills a few performance-relevant defaults the model config leaves unset — a larger physical batch on NVIDIA Blackwell, and a VRAM-scaled `parallel` slot count for concurrent serving. 
Both are gated on **per-device** VRAM at the model's context: when a large context already fills a single card (e.g. a 27B model with a 200k context across 2×16 GiB), the batch boost and the extra parallel slots are suppressed so they can't tip the tighter GPU into CUDA out-of-memory. + +Anything you set explicitly in the model YAML always wins, so to pin a value just set it (e.g. `batch: 512` or `options: ["parallel:1"]`). The effective values are logged at `INFO` when a model loads (`effective runtime tuning …`). To turn the hardware auto-tuning off entirely and run llama.cpp's stock behavior, set the following environment variable (the value must be exactly `true`; other spellings such as `1` or `TRUE` are not recognized): + +``` +LOCALAI_DISABLE_HARDWARE_DEFAULTS=true +``` + ##### Server-side prompt cache (repeated system prompts) Agents, coding assistants, and Anthropic/OpenAI-compatible CLIs typically resend the same large system prompt on every turn. The llama.cpp server can short-circuit prefill for the matching prefix by stashing idle slot KV states in host RAM and reloading them on a hit. Three settings interact: diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index fdae562fe..509e58e68 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -169,11 +169,41 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string } } +// parallelSlotsFromOptions returns the effective n_parallel from the backend +// option strings ("parallel:N" / "n_parallel:N"), or "1" when unset — the +// llama.cpp default. Used only for the effective-tuning load log. +func parallelSlotsFromOptions(opts []string) string { + for _, o := range opts { + k, v, ok := strings.Cut(o, ":") + if ok && (k == "parallel" || k == "n_parallel") { + return strings.TrimSpace(v) + } + } + return "1" +} + func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err error) { o := NewOptions(opts...) 
xlog.Info("BackendLoader starting", "modelID", o.modelID, "backend", o.backendString, "model", o.model) + // Surface the effective performance-relevant runtime options at load (some of + // these are auto-tuned for the detected hardware). Logged once per load so an + // admin can see what will actually run and pin or override any value in the + // model YAML — or set LOCALAI_DISABLE_HARDWARE_DEFAULTS=true to turn the + // hardware auto-tuning off entirely. Gated on an LLM-ish load (context set) so + // TTS/audio/other backends stay quiet. + if opt := o.gRPCOptions; opt != nil && opt.ContextSize > 0 { + xlog.Info("effective runtime tuning (override in the model YAML; LOCALAI_DISABLE_HARDWARE_DEFAULTS=true disables hardware auto-tuning)", + "modelID", o.modelID, + "context", opt.ContextSize, + "n_batch", opt.NBatch, + "n_gpu_layers", opt.NGPULayers, + "parallel", parallelSlotsFromOptions(opt.Options), + "flash_attention", opt.FlashAttention, + "f16", opt.F16Memory) + } + backend := strings.ToLower(o.backendString) if realBackend, exists := Aliases[backend]; exists { typeAlias, exists := TypeAlias[backend] diff --git a/pkg/model/initializers_internal_test.go b/pkg/model/initializers_internal_test.go new file mode 100644 index 000000000..6988f1aa2 --- /dev/null +++ b/pkg/model/initializers_internal_test.go @@ -0,0 +1,19 @@ +package model + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("parallelSlotsFromOptions", func() { + It("reads the parallel slot count from the backend options", func() { + Expect(parallelSlotsFromOptions([]string{"use_jinja:true", "parallel:4"})).To(Equal("4")) + }) + It("accepts the n_parallel alias", func() { + Expect(parallelSlotsFromOptions([]string{"n_parallel:8"})).To(Equal("8")) + }) + It("defaults to a single slot when unset", func() { + Expect(parallelSlotsFromOptions([]string{"use_jinja:true"})).To(Equal("1")) + Expect(parallelSlotsFromOptions(nil)).To(Equal("1")) + }) +})