From 87d5734c332d3879207145b2073838ce3df3d835 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Thu, 25 Jun 2026 12:56:01 +0000
Subject: [PATCH] fix(config): gate parallel-slot default on per-device VRAM
 too (#10485)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The first #10485 fix (#10494) made the Blackwell physical-batch boost
per-device/context-aware, which neutralized the big compute-buffer OOM, but
the reporter's 2x16 GiB consumer Blackwell still OOM'd. Tracing the post-fix
log: the model now loads its weights, builds the main context and warms up
fine, and dies only on the *last* allocation — the MTP draft context's 800 MiB
KV cache on the tighter device.

#10411 changed only two defaults: the physical batch (now gated) and a
VRAM-scaled parallel-slot count. The KV cache is unified (n_ctx_seq equals the
full context, which proves the slots share one budget, so parallel doesn't
multiply KV), but n_seq_max=4 still adds per-slot compute-graph /
context-checkpoint / output scratch. On a device packed ~99% by a 27B model
spanning both cards, that overhead is the few-hundred-MiB last straw — which is
why reverting #10411 (and only #10411) restores a working load.

Gate the parallel-slot default on the same per-device headroom predicate as the
batch boost: when a large context already fills a single card
(largeContextForDevice), keep n_parallel=1. A user running one big-context model
that barely fits across two consumer GPUs is not serving four concurrent
tenants. Small contexts and large unified-memory devices (GB10) keep full
concurrency. Applied on both the single-host path and the distributed router.

Also make the auto-tuning visible and reversible (the debugging here needed
DEBUG logs and a git bisect):

  - Log the effective performance-relevant runtime options at INFO once per
    model load ("effective runtime tuning …": context, n_batch, n_gpu_layers,
    parallel, flash_attention, f16) so an admin can see what will run and pin or
    override any value in the model YAML.
  - LOCALAI_DISABLE_HARDWARE_DEFAULTS=true skips the hardware auto-tuning
    entirely (mirrors LOCALAI_DISABLE_GUESSING) for stock llama.cpp behavior.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:opus-4.8 [Claude Code]
---
 core/config/hardware_defaults.go              | 111 ++++++++++++++----
 core/config/hardware_defaults_test.go         |  43 +++++++
 core/services/nodes/router.go                 |   9 +-
 .../nodes/router_hardware_internal_test.go    |   8 ++
 docs/content/features/text-generation.md      |  10 ++
 pkg/model/initializers.go                     |  30 +++++
 pkg/model/initializers_internal_test.go       |  19 +++
 7 files changed, 205 insertions(+), 25 deletions(-)
 create mode 100644 pkg/model/initializers_internal_test.go

diff --git a/core/config/hardware_defaults.go b/core/config/hardware_defaults.go
index b4e0e74c6..81bc9fc7f 100644
--- a/core/config/hardware_defaults.go
+++ b/core/config/hardware_defaults.go
@@ -2,6 +2,7 @@ package config
 
 import (
 	"fmt"
+	"os"
 	"strconv"
 	"strings"
 
@@ -9,6 +10,19 @@ import (
 	"github.com/mudler/xlog"
 )
 
+// HardwareDefaultsDisabled reports whether hardware auto-tuning is turned off via
+// LOCALAI_DISABLE_HARDWARE_DEFAULTS (mirrors LOCALAI_DISABLE_GUESSING). Only the
+// exact value "true" counts; ApplyHardwareDefaults and the distributed router's
+// node tuning are then skipped entirely, so the backend runs llama.cpp's stock
+// batch/parallel behavior — an escape hatch for predictable, un-tuned defaults.
+func HardwareDefaultsDisabled() bool {
+	// Read directly like the sibling LOCALAI_DISABLE_GUESSING toggle in
+	// hooks_llamacpp.go: these config-layer heuristic switches run deep in the
+	// defaults pipeline with no ApplicationConfig in scope to plumb through.
+	//nolint:forbidigo // config-layer heuristic toggle, mirrors LOCALAI_DISABLE_GUESSING
+	return os.Getenv("LOCALAI_DISABLE_HARDWARE_DEFAULTS") == "true"
+}
+
 // Hardware-driven model-config defaults.
 //
 // This sits alongside the other config overriders (ApplyInferenceDefaults for
@@ -103,17 +117,36 @@ func PhysicalBatchForContext(g GPU, ctx int) int {
 	if !g.IsNVIDIABlackwell() {
 		return DefaultPhysicalBatch
 	}
-	if ctx <= 0 {
-		ctx = DefaultContextSize
-	}
 	if g.VRAM == 0 {
 		return DefaultPhysicalBatch
 	}
-	extra := uint64(ctx) * uint64(BlackwellPhysicalBatch-DefaultPhysicalBatch) * computeBufferBytesPerCell
-	if extra <= g.VRAM/blackwellBatchHeadroomDivisor {
-		return BlackwellPhysicalBatch
+	if largeContextForDevice(g, ctx) {
+		return DefaultPhysicalBatch
 	}
-	return DefaultPhysicalBatch
+	return BlackwellPhysicalBatch
+}
+
+// largeContextForDevice reports whether the given context is large relative to
+// the per-device VRAM ceiling — the shared "tight single-model fit" signal that
+// suppresses BOTH throughput-oriented defaults (the Blackwell batch boost and
+// the concurrency slot count). It sizes the extra compute-buffer scratch a
+// raised batch would need at this context (which grows ~n_ubatch * n_ctx and
+// is allocated per device) and asks whether it overflows a fraction of the
+// device VRAM; when it does, the device has no headroom to spend on throughput
+// and the conservative defaults must hold (issue #10485).
+//
+// g.VRAM must be the PER-DEVICE ceiling (the smallest device on a multi-GPU
+// host). VRAM 0 (unknown) is treated as not-large so detection gaps don't
+// silently disable the defaults.
+func largeContextForDevice(g GPU, ctx int) bool {
+	if g.VRAM == 0 {
+		return false
+	}
+	if ctx <= 0 {
+		ctx = DefaultContextSize
+	}
+	extra := uint64(ctx) * uint64(BlackwellPhysicalBatch-DefaultPhysicalBatch) * computeBufferBytesPerCell
+	return extra > g.VRAM/blackwellBatchHeadroomDivisor
 }
 
 // IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns.
@@ -152,17 +185,50 @@ func DefaultParallelSlots(g GPU) int {
 	}
 }
 
-// EnsureParallelOption appends a VRAM-scaled "parallel:N" backend option when the
-// model doesn't already set one (and the GPU warrants concurrency). Returns the
-// possibly-extended options. Shared by the single-host config path
-// (ApplyHardwareDefaults) and the distributed router (per selected node).
-func EnsureParallelOption(opts []string, gpu GPU) []string {
-	if slots := DefaultParallelSlots(gpu); slots > 1 && !hasParallelOption(opts) {
+// ParallelSlotsForContext is DefaultParallelSlots gated on per-device VRAM
+// headroom for the given context. A large context already claims most of a
+// single device's VRAM (the KV cache plus the per-slot compute/checkpoint
+// scratch that scales with n_seq_max), so defaulting multiple slots there
+// pushes a tight single-model fit into per-device CUDA OOM (issue #10485): the
+// model loads but the final allocation (e.g. an MTP draft context's KV cache)
+// overflows the tighter card by a few hundred MiB. Returns 1 (no concurrency)
+// in that tight regime, otherwise the VRAM-scaled DefaultParallelSlots.
+//
+// g.VRAM must be the PER-DEVICE ceiling (smallest device on a multi-GPU host).
+// It shares largeContextForDevice with the batch boost so both throughput
+// defaults are suppressed together; the GB10 / unified-memory path reports
+// system RAM and so keeps full concurrency even at large contexts.
+func ParallelSlotsForContext(g GPU, ctx int) int {
+	slots := DefaultParallelSlots(g)
+	if slots <= 1 || g.VRAM == 0 {
+		return slots
+	}
+	if largeContextForDevice(g, ctx) {
+		return 1
+	}
+	return slots
+}
+
+// EnsureParallelOptionForContext appends a VRAM-scaled "parallel:N" backend
+// option when the model doesn't already set one and the GPU warrants (and has
+// headroom for) concurrency at this context. Returns the possibly-extended
+// options. Shared by the single-host config path (ApplyHardwareDefaults) and
+// the distributed router (per selected node).
+func EnsureParallelOptionForContext(opts []string, gpu GPU, ctx int) []string {
+	if slots := ParallelSlotsForContext(gpu, ctx); slots > 1 && !hasParallelOption(opts) {
 		return append(opts, fmt.Sprintf("parallel:%d", slots))
 	}
 	return opts
 }
 
+// EnsureParallelOption is EnsureParallelOptionForContext with no known context
+// (defaults to DefaultContextSize, which clears the headroom gate on any device
+// large enough to warrant concurrency). Kept for callers without a model
+// context.
+func EnsureParallelOption(opts []string, gpu GPU) []string {
+	return EnsureParallelOptionForContext(opts, gpu, 0)
+}
+
 // hasParallelOption reports whether the model already sets parallel/n_parallel
 // so we never override an explicit value (helper shared with serving_defaults.go).
 func hasParallelOption(opts []string) bool {
@@ -192,18 +258,18 @@ var localGPU = func() GPU {
 // and were left unset by the user. Currently: a larger physical batch on
 // Blackwell. Explicit config always wins (we only touch zero values).
 func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
-	if cfg == nil {
+	if cfg == nil || HardwareDefaultsDisabled() {
 		return
 	}
 	// Raise the physical batch on Blackwell only when the resulting compute
 	// buffer fits the per-device VRAM at THIS model's context. Leaving Batch at 0
 	// (rather than writing the default 512) preserves the downstream single-pass
 	// sizing in core/backend.EffectiveBatchSize for embedding/score/rerank.
+	ctx := DefaultContextSize
+	if cfg.ContextSize != nil {
+		ctx = *cfg.ContextSize
+	}
 	if cfg.Batch == 0 {
-		ctx := DefaultContextSize
-		if cfg.ContextSize != nil {
-			ctx = *cfg.ContextSize
-		}
 		if PhysicalBatchForContext(gpu, ctx) == BlackwellPhysicalBatch {
 			cfg.Batch = BlackwellPhysicalBatch
 			xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
@@ -214,13 +280,14 @@ func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
 	// Enable concurrent serving by default on a capable GPU: without this the
 	// llama.cpp backend runs n_parallel=1 and serializes multi-user requests
 	// (continuous batching stays off). Unified KV means the slots share the
-	// context budget, so this is concurrency without extra KV memory. Explicit
-	// parallel/n_parallel in the model options always wins.
+	// context budget, but a context large enough to fill a single device leaves
+	// no room for the per-slot scratch, so the slot count is gated on per-device
+	// headroom too (issue #10485). Explicit parallel/n_parallel always wins.
 	if before := len(cfg.Options); true {
-		cfg.Options = EnsureParallelOption(cfg.Options, gpu)
+		cfg.Options = EnsureParallelOptionForContext(cfg.Options, gpu, ctx)
 		if len(cfg.Options) > before {
 			xlog.Debug("[hardware_defaults] defaulting parallel slots for concurrent serving",
-				"option", cfg.Options[len(cfg.Options)-1], "vram_gib", gpu.VRAM>>30)
+				"option", cfg.Options[len(cfg.Options)-1], "context", ctx, "vram_gib", gpu.VRAM>>30)
 		}
 	}
 }
diff --git a/core/config/hardware_defaults_test.go b/core/config/hardware_defaults_test.go
index 3bc1bf297..452a5a884 100644
--- a/core/config/hardware_defaults_test.go
+++ b/core/config/hardware_defaults_test.go
@@ -90,6 +90,15 @@ var _ = Describe("Hardware-driven config defaults", func() {
 		It("no-ops on nil", func() {
 			Expect(func() { ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"}) }).ToNot(Panic())
 		})
+
+		It("applies nothing when hardware defaults are disabled via env", func() {
+			GinkgoT().Setenv("LOCALAI_DISABLE_HARDWARE_DEFAULTS", "true")
+			Expect(HardwareDefaultsDisabled()).To(BeTrue())
+			cfg := &ModelConfig{}
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
+			Expect(cfg.Batch).To(Equal(0))
+			Expect(cfg.Options).To(BeEmpty())
+		})
 	})
 
 	DescribeTable("DefaultParallelSlots (by VRAM)",
@@ -105,12 +114,46 @@ var _ = Describe("Hardware-driven config defaults", func() {
 		Entry("unknown 0", uint64(0), 1),
 	)
 
+	Describe("ParallelSlotsForContext (per-device VRAM headroom)", func() {
+		It("keeps the VRAM-scaled slot count when the context fits the device", func() {
+			// 16 GiB card, small context: plenty of room for concurrency.
+			Expect(ParallelSlotsForContext(GPU{VRAM: 16 * gib}, 8192)).To(Equal(4))
+		})
+		It("drops to a single slot when a large context already fills the device", func() {
+			// Regression guard for issue #10485: 16 GiB consumer Blackwell, ~200k
+			// context. Even with unified KV, the per-slot compute/checkpoint
+			// scratch from 4 slots is the straw that overflows the tighter device.
+			Expect(ParallelSlotsForContext(GPU{VRAM: 16 * gib}, 204800)).To(Equal(1))
+		})
+		It("keeps concurrency on a large unified-memory device (GB10)", func() {
+			// GB10 reports system RAM (~119 GiB): a 200k context leaves headroom.
+			Expect(ParallelSlotsForContext(GPU{VRAM: 119 * gib}, 204800)).To(Equal(8))
+		})
+		It("keeps concurrency on a big datacenter card with a large context", func() {
+			// 80 GiB A100: 200k context is a small fraction, concurrency stays.
+			Expect(ParallelSlotsForContext(GPU{VRAM: 80 * gib}, 204800)).To(Equal(8))
+		})
+		It("stays a single slot on small/unknown VRAM regardless of context", func() {
+			Expect(ParallelSlotsForContext(GPU{VRAM: 2 * gib}, 8192)).To(Equal(1))
+			Expect(ParallelSlotsForContext(GPU{}, 8192)).To(Equal(1))
+		})
+	})
+
 	Describe("ApplyHardwareDefaults parallel slots", func() {
 		It("adds a VRAM-scaled parallel option on a capable GPU", func() {
 			cfg := &ModelConfig{}
 			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
 			Expect(cfg.Options).To(ContainElement("parallel:8"))
 		})
+		It("adds no parallel option when a large context already fills one device", func() {
+			// Regression guard for issue #10485: 16 GiB card + ~200k context. The
+			// model barely fits; defaulting concurrency tips the tighter GPU into
+			// CUDA OOM during the final (MTP draft) KV allocation.
+			ctx := 204800
+			cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}}
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.0", VRAM: 16 * gib})
+			Expect(cfg.Options).ToNot(ContainElement(ContainSubstring("parallel")))
+		})
 		It("scales the slot count down with VRAM", func() {
 			cfg := &ModelConfig{}
 			ApplyHardwareDefaults(cfg, GPU{VRAM: 24 * gib})
diff --git a/core/services/nodes/router.go b/core/services/nodes/router.go
index 6ad550cf1..ce3de3290 100644
--- a/core/services/nodes/router.go
+++ b/core/services/nodes/router.go
@@ -147,7 +147,7 @@ type scheduleLoadResult struct {
 // Only values the heuristics themselves manage are touched, so an explicit user
 // batch (e.g. 1024) is never overridden.
 func applyNodeHardwareDefaults(opts *pb.ModelOptions, node *BackendNode) {
-	if opts == nil || node == nil {
+	if opts == nil || node == nil || config.HardwareDefaultsDisabled() {
 		return
 	}
 	gpu := config.GPU{
@@ -162,8 +162,11 @@ func applyNodeHardwareDefaults(opts *pb.ModelOptions, node *BackendNode) {
 		opts.NBatch = int32(config.PhysicalBatchForContext(gpu, int(opts.ContextSize)))
 	}
 	// Default concurrent serving for the selected node (the frontend that built
-	// the options may have no GPU). Only adds when no parallel option is set.
-	opts.Options = config.EnsureParallelOption(opts.Options, gpu)
+	// the options may have no GPU). Gated on the node's per-device VRAM at this
+	// model's context, so a large context that already fills the device can't
+	// tip it into OOM by adding slot scratch (issue #10485). Only adds when no
+	// parallel option is set.
+	opts.Options = config.EnsureParallelOptionForContext(opts.Options, gpu, int(opts.ContextSize))
 }
 
 // scheduleAndLoad is the shared core for loading a model on a new node.
diff --git a/core/services/nodes/router_hardware_internal_test.go b/core/services/nodes/router_hardware_internal_test.go
index d8576c4e4..084222fee 100644
--- a/core/services/nodes/router_hardware_internal_test.go
+++ b/core/services/nodes/router_hardware_internal_test.go
@@ -41,6 +41,14 @@ var _ = Describe("applyNodeHardwareDefaults", func() {
 		Expect(opts.Options).To(ContainElement("parallel:8"))
 	})
 
+	It("adds no parallel option when a large context already fills the node device", func() {
+		// Regression guard for issue #10485: a 16 GiB node with a ~200k context
+		// is a tight single-model fit — the slot scratch would tip it into OOM.
+		opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, ContextSize: 204800}
+		applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.0", TotalVRAM: 16 << 30})
+		Expect(opts.Options).ToNot(ContainElement(ContainSubstring("parallel")))
+	})
+
 	It("never overrides an explicit parallel option on the node path", func() {
 		opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, Options: []string{"parallel:2"}}
 		applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1", TotalVRAM: 119 << 30})
diff --git a/docs/content/features/text-generation.md b/docs/content/features/text-generation.md
index c09717a3f..cadc67808 100644
--- a/docs/content/features/text-generation.md
+++ b/docs/content/features/text-generation.md
@@ -537,6 +537,16 @@ options:
 
 **Note:** The `parallel` option can also be set via the `LLAMACPP_PARALLEL` environment variable, and `grpc_servers` can be set via the `LLAMACPP_GRPC_SERVERS` environment variable. Options specified in the YAML file take precedence over environment variables.
 
+##### Hardware auto-tuning (and how to override it)
+
+On a detected GPU, LocalAI fills a few performance-relevant defaults the model config leaves unset — a larger physical batch on NVIDIA Blackwell, and a VRAM-scaled `parallel` slot count for concurrent serving. Both are gated on **per-device** VRAM at the model's context: when a large context already fills a single card (e.g. a 27B model with a 200k context across 2×16 GiB), the batch boost and the extra parallel slots are suppressed so they can't tip the tighter GPU into CUDA out-of-memory.
+
+Anything you set explicitly in the model YAML always wins, so to pin a value just set it (e.g. `batch: 512` or `options: ["parallel:1"]`). The effective values are logged at `INFO` when a model loads (`effective runtime tuning …`). To turn the hardware auto-tuning off entirely and run llama.cpp's stock behavior, set:
+
+```
+LOCALAI_DISABLE_HARDWARE_DEFAULTS=true
+```
+
 ##### Server-side prompt cache (repeated system prompts)
 
 Agents, coding assistants, and Anthropic/OpenAI-compatible CLIs typically resend the same large system prompt on every turn. The llama.cpp server can short-circuit prefill for the matching prefix by stashing idle slot KV states in host RAM and reloading them on a hit. Three settings interact:
diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go
index fdae562fe..509e58e68 100644
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -169,11 +169,41 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
 	}
 }
 
+// parallelSlotsFromOptions returns the effective n_parallel from the backend
+// option strings ("parallel:N" / "n_parallel:N"), or "1" when unset — the
+// llama.cpp default. Used only for the effective-tuning load log.
+func parallelSlotsFromOptions(opts []string) string {
+	for _, o := range opts {
+		k, v, ok := strings.Cut(o, ":")
+		if ok && (k == "parallel" || k == "n_parallel") {
+			return strings.TrimSpace(v)
+		}
+	}
+	return "1"
+}
+
 func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err error) {
 	o := NewOptions(opts...)
 
 	xlog.Info("BackendLoader starting", "modelID", o.modelID, "backend", o.backendString, "model", o.model)
 
+	// Surface the effective performance-relevant runtime options at load (some of
+	// these are auto-tuned for the detected hardware). Logged once per load so an
+	// admin can see what will actually run and pin or override any value in the
+	// model YAML — or set LOCALAI_DISABLE_HARDWARE_DEFAULTS=true to turn the
+	// hardware auto-tuning off entirely. Gated on an LLM-ish load (context set) so
+	// TTS/audio/other backends stay quiet.
+	if opt := o.gRPCOptions; opt != nil && opt.ContextSize > 0 {
+		xlog.Info("effective runtime tuning (override in the model YAML; LOCALAI_DISABLE_HARDWARE_DEFAULTS=true disables hardware auto-tuning)",
+			"modelID", o.modelID,
+			"context", opt.ContextSize,
+			"n_batch", opt.NBatch,
+			"n_gpu_layers", opt.NGPULayers,
+			"parallel", parallelSlotsFromOptions(opt.Options),
+			"flash_attention", opt.FlashAttention,
+			"f16", opt.F16Memory)
+	}
+
 	backend := strings.ToLower(o.backendString)
 	if realBackend, exists := Aliases[backend]; exists {
 		typeAlias, exists := TypeAlias[backend]
diff --git a/pkg/model/initializers_internal_test.go b/pkg/model/initializers_internal_test.go
new file mode 100644
index 000000000..6988f1aa2
--- /dev/null
+++ b/pkg/model/initializers_internal_test.go
@@ -0,0 +1,19 @@
+package model
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("parallelSlotsFromOptions", func() {
+	It("reads the parallel slot count from the backend options", func() {
+		Expect(parallelSlotsFromOptions([]string{"use_jinja:true", "parallel:4"})).To(Equal("4"))
+	})
+	It("accepts the n_parallel alias", func() {
+		Expect(parallelSlotsFromOptions([]string{"n_parallel:8"})).To(Equal("8"))
+	})
+	It("defaults to a single slot when unset", func() {
+		Expect(parallelSlotsFromOptions([]string{"use_jinja:true"})).To(Equal("1"))
+		Expect(parallelSlotsFromOptions(nil)).To(Equal("1"))
+	})
+})