mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-26 09:26:55 -04:00
The first #10485 fix (#10494) made the Blackwell physical-batch boost per-device/context-aware, which neutralized the big compute-buffer OOM, but the reporter's 2x16 GiB consumer Blackwell still OOM'd. Tracing the post-fix log: the model now loads its weights, builds the main context and warms up fine, and dies only on the *last* allocation — the MTP draft context's 800 MiB KV cache on the tighter device. #10411 changed only two defaults: the physical batch (now gated) and a VRAM-scaled parallel-slot count. The KV cache is unified (n_ctx_seq == full context proves slots share the budget, so parallel doesn't multiply KV), but n_seq_max=4 still adds per-slot compute-graph / context-checkpoint / output scratch. On a device packed ~99% by a 27B model spanning both cards, that overhead is the few-hundred-MiB straw — which is why reverting #10411 (and only #10411) restores a working load. Gate the parallel-slot default on the same per-device headroom predicate as the batch boost: when a large context already fills a single card (largeContextForDevice), keep n_parallel=1. A user running one big-context model that barely fits across two consumer GPUs is not serving four concurrent tenants. Small contexts and large unified-memory devices (GB10) keep full concurrency. Applied on both the single-host path and the distributed router. Also make the auto-tuning visible and reversible (the debugging here needed DEBUG logs and a git bisect): - Log the effective performance-relevant runtime options at INFO once per model load ("effective runtime tuning …": context, n_batch, n_gpu_layers, parallel, flash_attention, f16) so an admin can see what will run and pin or override any value in the model YAML. - LOCALAI_DISABLE_HARDWARE_DEFAULTS=true skips the hardware auto-tuning entirely (mirrors LOCALAI_DISABLE_GUESSING) for stock llama.cpp behavior. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
316 lines
14 KiB
Go
316 lines
14 KiB
Go
package config
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/mudler/LocalAI/pkg/xsysinfo"
|
|
"github.com/mudler/xlog"
|
|
)
|
|
|
|
// HardwareDefaultsDisabled reports whether hardware auto-tuning is turned off via
|
|
// LOCALAI_DISABLE_HARDWARE_DEFAULTS=true (mirrors LOCALAI_DISABLE_GUESSING). When
|
|
// set, ApplyHardwareDefaults and the distributed router's node tuning are
|
|
// skipped entirely, so the backend runs llama.cpp's stock batch/parallel
|
|
// behavior — an escape hatch for users who want predictable, un-tuned defaults.
|
|
func HardwareDefaultsDisabled() bool {
|
|
// Read directly like the sibling LOCALAI_DISABLE_GUESSING toggle in
|
|
// hooks_llamacpp.go: these config-layer heuristic switches run deep in the
|
|
// defaults pipeline with no ApplicationConfig in scope to plumb through.
|
|
//nolint:forbidigo // config-layer heuristic toggle, mirrors LOCALAI_DISABLE_GUESSING
|
|
return os.Getenv("LOCALAI_DISABLE_HARDWARE_DEFAULTS") == "true"
|
|
}
|
|
|
|
// Hardware-driven model-config defaults.
|
|
//
|
|
// This sits alongside the other config overriders (ApplyInferenceDefaults for
|
|
// model families, guessDefaultsFromFile for GGUF/NGPULayers): they all
|
|
// heuristically fill ModelConfig values the user left unset. Hardware tuning is
|
|
// the same domain — "adjust the config from the device that will run it" — so
|
|
// it lives here rather than scattered into the backend or a separate package.
|
|
//
|
|
// The heuristics are parameterized on a GPU descriptor (not on direct
|
|
// detection) so they apply in both deployment shapes: SetDefaults passes the
|
|
// LocalGPU on a single host, and the distributed router passes the *selected
|
|
// node's* reported GPU before loading there (the frontend that loaded the
|
|
// config may have no GPU at all).
|
|
|
|
// GPU describes the device that will run a model.
|
|
type GPU struct {
|
|
// Vendor is "nvidia", "amd", … (matches xsysinfo vendor constants).
|
|
Vendor string
|
|
// ComputeCapability is the NVIDIA compute capability as "major.minor"
|
|
// (e.g. "12.1" for GB10 / DGX Spark). Empty for non-NVIDIA / unknown.
|
|
ComputeCapability string
|
|
// VRAM is total device memory in bytes (0 = unknown).
|
|
VRAM uint64
|
|
}
|
|
|
|
// Physical batch (n_batch / n_ubatch) defaults.
|
|
const (
|
|
// DefaultPhysicalBatch is the conservative default when no hardware-specific
|
|
// tuning applies. core/backend.DefaultBatchSize references this (single source).
|
|
DefaultPhysicalBatch = 512
|
|
// BlackwellPhysicalBatch is the default on NVIDIA Blackwell consumer GPUs
|
|
// (sm_12x: sm_120 RTX 50-series, sm_121 GB10 / DGX Spark). A larger physical
|
|
// batch materially lifts MoE prefill there (per-expert GEMM tiles fill
|
|
// better); measured on a GB10 with Qwen3-30B-A3B to saturate around 2048.
|
|
BlackwellPhysicalBatch = 2048
|
|
)
|
|
|
|
// IsNVIDIABlackwell reports whether the GPU is in the NVIDIA Blackwell consumer
|
|
// family (sm_12x). Datacenter Blackwell (B100/B200/GB200, sm_100 / cc 10.0)
|
|
// reports a different compute capability and is intentionally not matched.
|
|
func (g GPU) IsNVIDIABlackwell() bool {
|
|
maj, _ := parseComputeCapability(g.ComputeCapability)
|
|
return maj >= 12
|
|
}
|
|
|
|
// Compute-buffer headroom guard for the raised physical batch.
|
|
//
|
|
// Raising n_ubatch grows the CUDA *compute buffer* (the scratch for the forward
|
|
// graph), which is allocated PER DEVICE — it does not benefit from a second GPU
|
|
// the way weights or KV (which are split across devices) do. The buffer scales
|
|
// ~linearly with n_ubatch * n_ctx, so a large context turns the GB10-tuned
|
|
// ub2048 into multi-GiB of extra scratch that must fit on a SINGLE card. On a
|
|
// 16 GiB consumer Blackwell with a 200k context that overflows (issue #10485),
|
|
// even though the GB10 it was measured on (128 GiB unified memory) had room.
|
|
//
|
|
// These constants size a conservative guard: only raise the batch when the
|
|
// extra scratch fits the per-device VRAM ceiling.
|
|
const (
|
|
// computeBufferBytesPerCell approximates the CUDA compute-buffer cost of one
|
|
// (n_ubatch * n_ctx) cell. Derived from an observed allocation (ub2048 *
|
|
// ctx204800 ~= 4.5 GiB => ~11 B/cell) and rounded up to 16 for margin, since
|
|
// the real cost also grows with model width (heads / embedding dim) which we
|
|
// don't know at config time.
|
|
computeBufferBytesPerCell = 16
|
|
// blackwellBatchHeadroomDivisor caps the extra compute buffer from raising the
|
|
// physical batch at VRAM/divisor. /4 keeps the bulk of a device for weights +
|
|
// KV, which already dominate VRAM use.
|
|
blackwellBatchHeadroomDivisor = 4
|
|
)
|
|
|
|
// PhysicalBatch returns the canonical physical batch (n_batch/n_ubatch) for the
|
|
// given hardware class, ignoring context/VRAM headroom. Use
|
|
// PhysicalBatchForContext when a model context and per-device VRAM are known
|
|
// (the load paths) so the raised batch can't overflow a single device.
|
|
func PhysicalBatch(g GPU) int {
|
|
if g.IsNVIDIABlackwell() {
|
|
return BlackwellPhysicalBatch
|
|
}
|
|
return DefaultPhysicalBatch
|
|
}
|
|
|
|
// PhysicalBatchForContext is PhysicalBatch gated on per-device VRAM headroom for
|
|
// the given context: it only raises the batch above the conservative default
|
|
// when the extra compute buffer (which is allocated on a single device and grows
|
|
// with n_ubatch * n_ctx) fits within blackwellBatchHeadroomDivisor of the GPU's
|
|
// VRAM. g.VRAM must be the PER-DEVICE ceiling (the smallest device on a
|
|
// multi-GPU host), not the summed total — the compute buffer can't be split.
|
|
//
|
|
// VRAM 0 (unknown) stays conservative rather than risk a per-device OOM; the
|
|
// GB10 / unified-memory path reports system RAM, so it still clears the guard.
|
|
func PhysicalBatchForContext(g GPU, ctx int) int {
|
|
if !g.IsNVIDIABlackwell() {
|
|
return DefaultPhysicalBatch
|
|
}
|
|
if g.VRAM == 0 {
|
|
return DefaultPhysicalBatch
|
|
}
|
|
if largeContextForDevice(g, ctx) {
|
|
return DefaultPhysicalBatch
|
|
}
|
|
return BlackwellPhysicalBatch
|
|
}
|
|
|
|
// largeContextForDevice reports whether the given context is large relative to
|
|
// the per-device VRAM ceiling — the shared "tight single-model fit" signal that
|
|
// suppresses BOTH throughput-oriented defaults (the Blackwell batch boost and
|
|
// the concurrency slot count). It sizes the extra compute-buffer scratch a
|
|
// raised batch would need at this context (which grows ~n_ubatch * n_ctx and
|
|
// is allocated per device) and asks whether it overflows a fraction of the
|
|
// device VRAM; when it does, the device has no headroom to spend on throughput
|
|
// and the conservative defaults must hold (issue #10485).
|
|
//
|
|
// g.VRAM must be the PER-DEVICE ceiling (the smallest device on a multi-GPU
|
|
// host). VRAM 0 (unknown) is treated as not-large so detection gaps don't
|
|
// silently disable the defaults.
|
|
func largeContextForDevice(g GPU, ctx int) bool {
|
|
if g.VRAM == 0 {
|
|
return false
|
|
}
|
|
if ctx <= 0 {
|
|
ctx = DefaultContextSize
|
|
}
|
|
extra := uint64(ctx) * uint64(BlackwellPhysicalBatch-DefaultPhysicalBatch) * computeBufferBytesPerCell
|
|
return extra > g.VRAM/blackwellBatchHeadroomDivisor
|
|
}
|
|
|
|
// IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns.
|
|
// Callers that re-tune a value chosen by an upstream host (the distributed
|
|
// router correcting the frontend's guess) use this to avoid clobbering an
|
|
// explicit user batch such as 1024.
|
|
func IsManagedPhysicalBatch(n int) bool {
|
|
return n == DefaultPhysicalBatch || n == BlackwellPhysicalBatch
|
|
}
|
|
|
|
// Parallel-slot (n_parallel) VRAM tiers. llama.cpp serializes requests at
|
|
// n_parallel=1 (the backend default) and only auto-enables continuous batching
|
|
// when n_parallel > 1 — so a single-slot default makes concurrent requests
|
|
// queue. We default a slot count by GPU size so multi-user serving works out of
|
|
// the box. With the backend's unified KV cache the slots SHARE the context
|
|
// budget, so more slots add concurrency without multiplying KV memory.
|
|
const (
|
|
parallelSlotsVRAMHigh = uint64(32) << 30 // >=32 GiB -> 8 slots
|
|
parallelSlotsVRAMMid = uint64(8) << 30 // >=8 GiB -> 4 slots
|
|
parallelSlotsVRAMLow = uint64(4) << 30 // >=4 GiB -> 2 slots
|
|
)
|
|
|
|
// DefaultParallelSlots returns the n_parallel default for the given GPU. Returns
|
|
// 1 (no concurrency) when VRAM is unknown or too small, so we never change
|
|
// behavior on CPU-only / tiny devices.
|
|
func DefaultParallelSlots(g GPU) int {
|
|
switch {
|
|
case g.VRAM >= parallelSlotsVRAMHigh:
|
|
return 8
|
|
case g.VRAM >= parallelSlotsVRAMMid:
|
|
return 4
|
|
case g.VRAM >= parallelSlotsVRAMLow:
|
|
return 2
|
|
default:
|
|
return 1
|
|
}
|
|
}
|
|
|
|
// ParallelSlotsForContext is DefaultParallelSlots gated on per-device VRAM
|
|
// headroom for the given context. A large context already claims most of a
|
|
// single device's VRAM (the KV cache plus the per-slot compute/checkpoint
|
|
// scratch that scales with n_seq_max), so defaulting multiple slots there
|
|
// pushes a tight single-model fit into per-device CUDA OOM (issue #10485): the
|
|
// model loads but the final allocation (e.g. an MTP draft context's KV cache)
|
|
// overflows the tighter card by a few hundred MiB. Returns 1 (no concurrency)
|
|
// in that tight regime, otherwise the VRAM-scaled DefaultParallelSlots.
|
|
//
|
|
// g.VRAM must be the PER-DEVICE ceiling (smallest device on a multi-GPU host).
|
|
// It shares largeContextForDevice with the batch boost so both throughput
|
|
// defaults are suppressed together; the GB10 / unified-memory path reports
|
|
// system RAM and so keeps full concurrency even at large contexts.
|
|
func ParallelSlotsForContext(g GPU, ctx int) int {
|
|
slots := DefaultParallelSlots(g)
|
|
if slots <= 1 || g.VRAM == 0 {
|
|
return slots
|
|
}
|
|
if largeContextForDevice(g, ctx) {
|
|
return 1
|
|
}
|
|
return slots
|
|
}
|
|
|
|
// EnsureParallelOptionForContext appends a VRAM-scaled "parallel:N" backend
|
|
// option when the model doesn't already set one and the GPU warrants (and has
|
|
// headroom for) concurrency at this context. Returns the possibly-extended
|
|
// options. Shared by the single-host config path (ApplyHardwareDefaults) and
|
|
// the distributed router (per selected node).
|
|
func EnsureParallelOptionForContext(opts []string, gpu GPU, ctx int) []string {
|
|
if slots := ParallelSlotsForContext(gpu, ctx); slots > 1 && !hasParallelOption(opts) {
|
|
return append(opts, fmt.Sprintf("parallel:%d", slots))
|
|
}
|
|
return opts
|
|
}
|
|
|
|
// EnsureParallelOption is EnsureParallelOptionForContext with no known context
|
|
// (defaults to DefaultContextSize, which clears the headroom gate on any device
|
|
// large enough to warrant concurrency). Kept for callers without a model
|
|
// context.
|
|
func EnsureParallelOption(opts []string, gpu GPU) []string {
|
|
return EnsureParallelOptionForContext(opts, gpu, 0)
|
|
}
|
|
|
|
// hasParallelOption reports whether the model already sets parallel/n_parallel
|
|
// so we never override an explicit value (helper shared with serving_defaults.go).
|
|
func hasParallelOption(opts []string) bool {
|
|
return backendOptionSet(opts, "parallel", "n_parallel")
|
|
}
|
|
|
|
// localGPU builds a GPU descriptor from local detection, used by SetDefaults on
|
|
// a single host (the distributed router builds it from the selected node's
|
|
// reported info instead). It is a package var so tests can inject a
|
|
// deterministic device — detection does a live nvidia-smi call.
|
|
var localGPU = func() GPU {
|
|
vendor, _ := xsysinfo.DetectGPUVendor()
|
|
// Use the SMALLEST device's VRAM, not the summed total: the parallel-slot
|
|
// tier and the batch headroom guard both reason about what fits on a single
|
|
// card, and per-device compute buffers can't be split across GPUs. Summing
|
|
// two 16 GiB cards into "32 GiB" is what over-provisioned multi-GPU hosts
|
|
// into OOM (issue #10485).
|
|
vram, _ := xsysinfo.MinPerGPUVRAM()
|
|
return GPU{
|
|
Vendor: vendor,
|
|
ComputeCapability: xsysinfo.NVIDIAComputeCapability(),
|
|
VRAM: vram,
|
|
}
|
|
}
|
|
|
|
// ApplyHardwareDefaults fills ModelConfig values that depend on the target GPU
|
|
// and were left unset by the user. Currently: a larger physical batch on
|
|
// Blackwell. Explicit config always wins (we only touch zero values).
|
|
func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
|
|
if cfg == nil || HardwareDefaultsDisabled() {
|
|
return
|
|
}
|
|
// Raise the physical batch on Blackwell only when the resulting compute
|
|
// buffer fits the per-device VRAM at THIS model's context. Leaving Batch at 0
|
|
// (rather than writing the default 512) preserves the downstream single-pass
|
|
// sizing in core/backend.EffectiveBatchSize for embedding/score/rerank.
|
|
ctx := DefaultContextSize
|
|
if cfg.ContextSize != nil {
|
|
ctx = *cfg.ContextSize
|
|
}
|
|
if cfg.Batch == 0 {
|
|
if PhysicalBatchForContext(gpu, ctx) == BlackwellPhysicalBatch {
|
|
cfg.Batch = BlackwellPhysicalBatch
|
|
xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
|
|
"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability, "context", ctx, "vram_gib", gpu.VRAM>>30)
|
|
}
|
|
}
|
|
|
|
// Enable concurrent serving by default on a capable GPU: without this the
|
|
// llama.cpp backend runs n_parallel=1 and serializes multi-user requests
|
|
// (continuous batching stays off). Unified KV means the slots share the
|
|
// context budget, but a context large enough to fill a single device leaves
|
|
// no room for the per-slot scratch, so the slot count is gated on per-device
|
|
// headroom too (issue #10485). Explicit parallel/n_parallel always wins.
|
|
if before := len(cfg.Options); true {
|
|
cfg.Options = EnsureParallelOptionForContext(cfg.Options, gpu, ctx)
|
|
if len(cfg.Options) > before {
|
|
xlog.Debug("[hardware_defaults] defaulting parallel slots for concurrent serving",
|
|
"option", cfg.Options[len(cfg.Options)-1], "context", ctx, "vram_gib", gpu.VRAM>>30)
|
|
}
|
|
}
|
|
}
|
|
|
|
// parseComputeCapability splits a "major.minor" string into integer parts.
|
|
// Returns (-1, -1) when it can't be parsed.
|
|
func parseComputeCapability(cc string) (int, int) {
|
|
cc = strings.TrimSpace(cc)
|
|
if cc == "" {
|
|
return -1, -1
|
|
}
|
|
majStr, minStr := cc, "0"
|
|
if dot := strings.IndexByte(cc, '.'); dot >= 0 {
|
|
majStr, minStr = cc[:dot], cc[dot+1:]
|
|
}
|
|
maj, err := strconv.Atoi(strings.TrimSpace(majStr))
|
|
if err != nil {
|
|
return -1, -1
|
|
}
|
|
min, err := strconv.Atoi(strings.TrimSpace(minStr))
|
|
if err != nil {
|
|
min = 0
|
|
}
|
|
return maj, min
|
|
}
|