Compare commits

..

1 Commits

Author SHA1 Message Date
Ettore Di Giacinto
87d5734c33 fix(config): gate parallel-slot default on per-device VRAM too (#10485)
The first #10485 fix (#10494) made the Blackwell physical-batch boost
per-device/context-aware, which neutralized the big compute-buffer OOM, but
the reporter's 2x16 GiB consumer Blackwell still OOM'd. Tracing the post-fix
log: the model now loads its weights, builds the main context and warms up
fine, and dies only on the *last* allocation — the MTP draft context's 800 MiB
KV cache on the tighter device.

#10411 changed only two defaults: the physical batch (now gated) and a
VRAM-scaled parallel-slot count. The KV cache is unified (n_ctx_seq == full
context proves slots share the budget, so parallel doesn't multiply KV), but
n_seq_max=4 still adds per-slot compute-graph / context-checkpoint / output
scratch. On a device packed ~99% by a 27B model spanning both cards, that
overhead is the few-hundred-MiB straw — which is why reverting #10411 (and only
#10411) restores a working load.

Gate the parallel-slot default on the same per-device headroom predicate as the
batch boost: when a large context already fills a single card
(largeContextForDevice), keep n_parallel=1. A user running one big-context model
that barely fits across two consumer GPUs is not serving four concurrent
tenants. Small contexts and large unified-memory devices (GB10) keep full
concurrency. Applied on both the single-host path and the distributed router.

Also make the auto-tuning visible and reversible (the debugging here needed
DEBUG logs and a git bisect):

  - Log the effective performance-relevant runtime options at INFO once per
    model load ("effective runtime tuning …": context, n_batch, n_gpu_layers,
    parallel, flash_attention, f16) so an admin can see what will run and pin or
    override any value in the model YAML.
  - LOCALAI_DISABLE_HARDWARE_DEFAULTS=true skips the hardware auto-tuning
    entirely (mirrors LOCALAI_DISABLE_GUESSING) for stock llama.cpp behavior.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:opus-4.8 [Claude Code]
2026-06-25 12:57:19 +00:00
7 changed files with 205 additions and 25 deletions

View File

@@ -2,6 +2,7 @@ package config
import (
"fmt"
"os"
"strconv"
"strings"
@@ -9,6 +10,19 @@ import (
"github.com/mudler/xlog"
)
// HardwareDefaultsDisabled reports whether the operator has opted out of
// hardware auto-tuning by exporting LOCALAI_DISABLE_HARDWARE_DEFAULTS=true
// (the counterpart of LOCALAI_DISABLE_GUESSING). Only the exact value "true"
// counts; an unset variable or any other value leaves the tuning enabled.
//
// ApplyHardwareDefaults and the distributed router's per-node tuning both
// return early when this is true, so none of the hardware-driven batch/parallel
// defaults are applied — an escape hatch for predictable, un-tuned behavior.
func HardwareDefaultsDisabled() bool {
	const envName = "LOCALAI_DISABLE_HARDWARE_DEFAULTS"
	// The environment is consulted directly, the same way the sibling
	// LOCALAI_DISABLE_GUESSING toggle in hooks_llamacpp.go is: this switch is
	// evaluated deep in the defaults pipeline where no ApplicationConfig is
	// available to carry the setting.
	//nolint:forbidigo // config-layer heuristic toggle, mirrors LOCALAI_DISABLE_GUESSING
	raw := os.Getenv(envName)
	return raw == "true"
}
// Hardware-driven model-config defaults.
//
// This sits alongside the other config overriders (ApplyInferenceDefaults for
@@ -103,17 +117,36 @@ func PhysicalBatchForContext(g GPU, ctx int) int {
if !g.IsNVIDIABlackwell() {
return DefaultPhysicalBatch
}
if ctx <= 0 {
ctx = DefaultContextSize
}
if g.VRAM == 0 {
return DefaultPhysicalBatch
}
extra := uint64(ctx) * uint64(BlackwellPhysicalBatch-DefaultPhysicalBatch) * computeBufferBytesPerCell
if extra <= g.VRAM/blackwellBatchHeadroomDivisor {
return BlackwellPhysicalBatch
if largeContextForDevice(g, ctx) {
return DefaultPhysicalBatch
}
return DefaultPhysicalBatch
return BlackwellPhysicalBatch
}
// largeContextForDevice is the shared "tight single-model fit" predicate used
// to suppress BOTH throughput-oriented defaults: the Blackwell batch boost and
// the concurrency slot count (issue #10485).
//
// It estimates the additional compute-buffer scratch that raising the physical
// batch from DefaultPhysicalBatch to BlackwellPhysicalBatch would cost at the
// given context (ctx * batch delta * computeBufferBytesPerCell) and reports
// true when that estimate exceeds g.VRAM / blackwellBatchHeadroomDivisor. A
// true result means the device has no headroom to spend on throughput and the
// conservative defaults should hold.
//
// g.VRAM is expected to be the PER-DEVICE ceiling (the smallest device on a
// multi-GPU host). An unknown VRAM of 0 yields false so that a detection gap
// does not silently switch the defaults off. A non-positive ctx is replaced by
// DefaultContextSize.
func largeContextForDevice(g GPU, ctx int) bool {
	if g.VRAM == 0 {
		return false
	}
	cells := ctx
	if cells <= 0 {
		cells = DefaultContextSize
	}
	batchDelta := uint64(BlackwellPhysicalBatch - DefaultPhysicalBatch)
	needed := uint64(cells) * batchDelta * computeBufferBytesPerCell
	budget := g.VRAM / blackwellBatchHeadroomDivisor
	return needed > budget
}
// IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns.
@@ -152,17 +185,50 @@ func DefaultParallelSlots(g GPU) int {
}
}
// EnsureParallelOption appends a VRAM-scaled "parallel:N" backend option when the
// model doesn't already set one (and the GPU warrants concurrency). Returns the
// possibly-extended options. Shared by the single-host config path
// (ApplyHardwareDefaults) and the distributed router (per selected node).
func EnsureParallelOption(opts []string, gpu GPU) []string {
if slots := DefaultParallelSlots(gpu); slots > 1 && !hasParallelOption(opts) {
// ParallelSlotsForContext returns the default number of parallel slots for the
// given GPU and context: the VRAM-scaled DefaultParallelSlots value, collapsed
// to 1 when largeContextForDevice says the context already leaves the device
// without headroom.
//
// Rationale (issue #10485): a large context claims most of a single device's
// VRAM, and extra slots add per-slot compute/checkpoint scratch on top of it.
// On a tight single-model fit the model loads, but a late allocation (e.g. an
// MTP draft context's KV cache) overflows the tighter card by a few hundred
// MiB. Sharing the predicate with the batch boost means both throughput
// defaults are suppressed together.
//
// g.VRAM is expected to be the PER-DEVICE ceiling (smallest device on a
// multi-GPU host). When DefaultParallelSlots already yields 1 or less, or VRAM
// is unknown (0), that value is returned untouched and the headroom predicate
// is never consulted.
func ParallelSlotsForContext(g GPU, ctx int) int {
	slots := DefaultParallelSlots(g)
	// Short-circuit order matters: the headroom predicate only runs for a GPU
	// that would otherwise get concurrency and reports a known VRAM.
	tightFit := slots > 1 && g.VRAM != 0 && largeContextForDevice(g, ctx)
	if tightFit {
		return 1
	}
	return slots
}
// EnsureParallelOptionForContext returns opts with a "parallel:N" backend
// option appended when, and only when, ParallelSlotsForContext grants more than
// one slot for this GPU and context AND the model has not already chosen a
// parallel/n_parallel value itself. In every other case opts is returned
// unchanged, so an explicit user setting is never overridden.
//
// It is the common entry point for the single-host config path
// (ApplyHardwareDefaults) and the distributed router (per selected node).
func EnsureParallelOptionForContext(opts []string, gpu GPU, ctx int) []string {
	slots := ParallelSlotsForContext(gpu, ctx)
	if slots <= 1 {
		// No concurrency warranted (or no headroom for it): nothing to add.
		return opts
	}
	if hasParallelOption(opts) {
		// The model config already pins a slot count; leave it alone.
		return opts
	}
	option := fmt.Sprintf("parallel:%d", slots)
	return append(opts, option)
}
// EnsureParallelOption is the context-less form of
// EnsureParallelOptionForContext, retained for callers that have no model
// context at hand. It forwards a context of 0, which the headroom check
// replaces with DefaultContextSize.
func EnsureParallelOption(opts []string, gpu GPU) []string {
	const unknownContext = 0
	return EnsureParallelOptionForContext(opts, gpu, unknownContext)
}
// hasParallelOption reports whether the model already sets parallel/n_parallel
// so we never override an explicit value (helper shared with serving_defaults.go).
func hasParallelOption(opts []string) bool {
@@ -192,18 +258,18 @@ var localGPU = func() GPU {
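// and were left unset by the user. Currently: a larger physical batch on
// Blackwell and a VRAM-scaled "parallel:N" option for concurrent serving, both
// gated on per-device VRAM headroom at the model's context. Explicit config
// always wins (we only touch zero values and never add a parallel option when
// the model already sets one).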
func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
if cfg == nil {
if cfg == nil || HardwareDefaultsDisabled() {
return
}
// Raise the physical batch on Blackwell only when the resulting compute
// buffer fits the per-device VRAM at THIS model's context. Leaving Batch at 0
// (rather than writing the default 512) preserves the downstream single-pass
// sizing in core/backend.EffectiveBatchSize for embedding/score/rerank.
ctx := DefaultContextSize
if cfg.ContextSize != nil {
ctx = *cfg.ContextSize
}
if cfg.Batch == 0 {
ctx := DefaultContextSize
if cfg.ContextSize != nil {
ctx = *cfg.ContextSize
}
if PhysicalBatchForContext(gpu, ctx) == BlackwellPhysicalBatch {
cfg.Batch = BlackwellPhysicalBatch
xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
@@ -214,13 +280,14 @@ func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
// Enable concurrent serving by default on a capable GPU: without this the
// llama.cpp backend runs n_parallel=1 and serializes multi-user requests
// (continuous batching stays off). Unified KV means the slots share the
// context budget, so this is concurrency without extra KV memory. Explicit
// parallel/n_parallel in the model options always wins.
// context budget, but a context large enough to fill a single device leaves
// no room for the per-slot scratch, so the slot count is gated on per-device
// headroom too (issue #10485). Explicit parallel/n_parallel always wins.
if before := len(cfg.Options); true {
cfg.Options = EnsureParallelOption(cfg.Options, gpu)
cfg.Options = EnsureParallelOptionForContext(cfg.Options, gpu, ctx)
if len(cfg.Options) > before {
xlog.Debug("[hardware_defaults] defaulting parallel slots for concurrent serving",
"option", cfg.Options[len(cfg.Options)-1], "vram_gib", gpu.VRAM>>30)
"option", cfg.Options[len(cfg.Options)-1], "context", ctx, "vram_gib", gpu.VRAM>>30)
}
}
}

View File

@@ -90,6 +90,15 @@ var _ = Describe("Hardware-driven config defaults", func() {
It("no-ops on nil", func() {
Expect(func() { ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"}) }).ToNot(Panic())
})
It("applies nothing when hardware defaults are disabled via env", func() {
GinkgoT().Setenv("LOCALAI_DISABLE_HARDWARE_DEFAULTS", "true")
Expect(HardwareDefaultsDisabled()).To(BeTrue())
cfg := &ModelConfig{}
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
Expect(cfg.Batch).To(Equal(0))
Expect(cfg.Options).To(BeEmpty())
})
})
DescribeTable("DefaultParallelSlots (by VRAM)",
@@ -105,12 +114,46 @@ var _ = Describe("Hardware-driven config defaults", func() {
Entry("unknown 0", uint64(0), 1),
)
Describe("ParallelSlotsForContext (per-device VRAM headroom)", func() {
It("keeps the VRAM-scaled slot count when the context fits the device", func() {
// 16 GiB card, small context: plenty of room for concurrency.
Expect(ParallelSlotsForContext(GPU{VRAM: 16 * gib}, 8192)).To(Equal(4))
})
It("drops to a single slot when a large context already fills the device", func() {
// Regression guard for issue #10485: 16 GiB consumer Blackwell, ~200k
// context. Even with unified KV, the per-slot compute/checkpoint
// scratch from 4 slots is the straw that overflows the tighter device.
Expect(ParallelSlotsForContext(GPU{VRAM: 16 * gib}, 204800)).To(Equal(1))
})
It("keeps concurrency on a large unified-memory device (GB10)", func() {
// GB10 reports system RAM (~119 GiB): a 200k context leaves headroom.
Expect(ParallelSlotsForContext(GPU{VRAM: 119 * gib}, 204800)).To(Equal(8))
})
It("keeps concurrency on a big datacenter card with a large context", func() {
// 80 GiB A100: 200k context is a small fraction, concurrency stays.
Expect(ParallelSlotsForContext(GPU{VRAM: 80 * gib}, 204800)).To(Equal(8))
})
It("stays a single slot on small/unknown VRAM regardless of context", func() {
Expect(ParallelSlotsForContext(GPU{VRAM: 2 * gib}, 8192)).To(Equal(1))
Expect(ParallelSlotsForContext(GPU{}, 8192)).To(Equal(1))
})
})
Describe("ApplyHardwareDefaults parallel slots", func() {
It("adds a VRAM-scaled parallel option on a capable GPU", func() {
cfg := &ModelConfig{}
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
Expect(cfg.Options).To(ContainElement("parallel:8"))
})
It("adds no parallel option when a large context already fills one device", func() {
// Regression guard for issue #10485: 16 GiB card + ~200k context. The
// model barely fits; defaulting concurrency tips the tighter GPU into
// CUDA OOM during the final (MTP draft) KV allocation.
ctx := 204800
cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}}
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.0", VRAM: 16 * gib})
Expect(cfg.Options).ToNot(ContainElement(ContainSubstring("parallel")))
})
It("scales the slot count down with VRAM", func() {
cfg := &ModelConfig{}
ApplyHardwareDefaults(cfg, GPU{VRAM: 24 * gib})

View File

@@ -147,7 +147,7 @@ type scheduleLoadResult struct {
// Only values the heuristics themselves manage are touched, so an explicit user
// batch (e.g. 1024) is never overridden.
func applyNodeHardwareDefaults(opts *pb.ModelOptions, node *BackendNode) {
if opts == nil || node == nil {
if opts == nil || node == nil || config.HardwareDefaultsDisabled() {
return
}
gpu := config.GPU{
@@ -162,8 +162,11 @@ func applyNodeHardwareDefaults(opts *pb.ModelOptions, node *BackendNode) {
opts.NBatch = int32(config.PhysicalBatchForContext(gpu, int(opts.ContextSize)))
}
// Default concurrent serving for the selected node (the frontend that built
// the options may have no GPU). Only adds when no parallel option is set.
opts.Options = config.EnsureParallelOption(opts.Options, gpu)
// the options may have no GPU). Gated on the node's per-device VRAM at this
// model's context, so a large context that already fills the device can't
// tip it into OOM by adding slot scratch (issue #10485). Only adds when no
// parallel option is set.
opts.Options = config.EnsureParallelOptionForContext(opts.Options, gpu, int(opts.ContextSize))
}
// scheduleAndLoad is the shared core for loading a model on a new node.

View File

@@ -41,6 +41,14 @@ var _ = Describe("applyNodeHardwareDefaults", func() {
Expect(opts.Options).To(ContainElement("parallel:8"))
})
It("adds no parallel option when a large context already fills the node device", func() {
// Regression guard for issue #10485: a 16 GiB node with a ~200k context
// is a tight single-model fit — the slot scratch would tip it into OOM.
opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, ContextSize: 204800}
applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.0", TotalVRAM: 16 << 30})
Expect(opts.Options).ToNot(ContainElement(ContainSubstring("parallel")))
})
It("never overrides an explicit parallel option on the node path", func() {
opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, Options: []string{"parallel:2"}}
applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1", TotalVRAM: 119 << 30})

View File

@@ -537,6 +537,16 @@ options:
**Note:** The `parallel` option can also be set via the `LLAMACPP_PARALLEL` environment variable, and `grpc_servers` can be set via the `LLAMACPP_GRPC_SERVERS` environment variable. Options specified in the YAML file take precedence over environment variables.
##### Hardware auto-tuning (and how to override it)
On a detected GPU, LocalAI fills a few performance-relevant defaults the model config leaves unset — a larger physical batch on NVIDIA Blackwell, and a VRAM-scaled `parallel` slot count for concurrent serving. Both are gated on **per-device** VRAM at the model's context: when a large context already fills a single card (e.g. a 27B model with a 200k context across 2×16 GiB), the batch boost and the extra parallel slots are suppressed so they can't tip the tighter GPU into CUDA out-of-memory.
Anything you set explicitly in the model YAML always wins, so to pin a value just set it (e.g. `batch: 512` or `options: ["parallel:1"]`). The effective values are logged at `INFO` when a model loads (`effective runtime tuning …`). To turn the hardware auto-tuning off entirely and run llama.cpp's stock behavior, set:
```
LOCALAI_DISABLE_HARDWARE_DEFAULTS=true
```
##### Server-side prompt cache (repeated system prompts)
Agents, coding assistants, and Anthropic/OpenAI-compatible CLIs typically resend the same large system prompt on every turn. The llama.cpp server can short-circuit prefill for the matching prefix by stashing idle slot KV states in host RAM and reloading them on a hit. Three settings interact:

View File

@@ -169,11 +169,41 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
}
}
// parallelSlotsFromOptions extracts the n_parallel value from the backend
// option strings for display in the effective-tuning load log. It scans opts in
// order and returns the whitespace-trimmed value of the first entry whose key
// (the text before the first ':') is exactly "parallel" or "n_parallel".
// Entries without a ':' are ignored. When no entry matches it returns "1", the
// llama.cpp default. The value is returned as written, without numeric
// validation.
func parallelSlotsFromOptions(opts []string) string {
	for i := range opts {
		key, value, found := strings.Cut(opts[i], ":")
		if !found {
			continue
		}
		switch key {
		case "parallel", "n_parallel":
			return strings.TrimSpace(value)
		}
	}
	return "1"
}
func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err error) {
o := NewOptions(opts...)
xlog.Info("BackendLoader starting", "modelID", o.modelID, "backend", o.backendString, "model", o.model)
// Surface the effective performance-relevant runtime options at load (some of
// these are auto-tuned for the detected hardware). Logged once per load so an
// admin can see what will actually run and pin or override any value in the
// model YAML — or set LOCALAI_DISABLE_HARDWARE_DEFAULTS=true to turn the
// hardware auto-tuning off entirely. Gated on an LLM-ish load (context set) so
// TTS/audio/other backends stay quiet.
if opt := o.gRPCOptions; opt != nil && opt.ContextSize > 0 {
xlog.Info("effective runtime tuning (override in the model YAML; LOCALAI_DISABLE_HARDWARE_DEFAULTS=true disables hardware auto-tuning)",
"modelID", o.modelID,
"context", opt.ContextSize,
"n_batch", opt.NBatch,
"n_gpu_layers", opt.NGPULayers,
"parallel", parallelSlotsFromOptions(opt.Options),
"flash_attention", opt.FlashAttention,
"f16", opt.F16Memory)
}
backend := strings.ToLower(o.backendString)
if realBackend, exists := Aliases[backend]; exists {
typeAlias, exists := TypeAlias[backend]

View File

@@ -0,0 +1,19 @@
package model
import (
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Specs for parallelSlotsFromOptions, the helper that renders the effective
// parallel-slot count from a model's backend option strings.
var _ = Describe("parallelSlotsFromOptions", func() {
// A "parallel:N" entry is found even when it follows unrelated options.
It("reads the parallel slot count from the backend options", func() {
Expect(parallelSlotsFromOptions([]string{"use_jinja:true", "parallel:4"})).To(Equal("4"))
})
// The "n_parallel" key is treated the same as "parallel".
It("accepts the n_parallel alias", func() {
Expect(parallelSlotsFromOptions([]string{"n_parallel:8"})).To(Equal("8"))
})
// With no matching entry (or a nil slice) the helper reports "1".
It("defaults to a single slot when unset", func() {
Expect(parallelSlotsFromOptions([]string{"use_jinja:true"})).To(Equal("1"))
Expect(parallelSlotsFromOptions(nil)).To(Equal("1"))
})
})