mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-25 09:09:07 -04:00
Compare commits
1 Commits
master
...
fix/blackw
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
87d5734c33 |
@@ -2,6 +2,7 @@ package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
@@ -9,6 +10,19 @@ import (
|
||||
"github.com/mudler/xlog"
|
||||
)
|
||||
|
||||
// HardwareDefaultsDisabled reports whether hardware auto-tuning has been turned
// off through the LOCALAI_DISABLE_HARDWARE_DEFAULTS environment variable. When
// it returns true, ApplyHardwareDefaults and the distributed router's node
// tuning return without touching anything, so the backend keeps llama.cpp's
// stock batch/parallel behavior — an escape hatch for users who want
// predictable, un-tuned defaults.
//
// Any spelling strconv.ParseBool accepts as true ("1", "t", "T", "true",
// "TRUE", "True") disables the defaults. That is a superset of the literal
// "true" honored by the sibling LOCALAI_DISABLE_GUESSING toggle, so existing
// deployments keep working while "=1" or "=TRUE" no longer fail silently. An
// unset, empty, or unparsable value leaves the defaults enabled.
func HardwareDefaultsDisabled() bool {
	// Read the environment directly, like the sibling LOCALAI_DISABLE_GUESSING
	// toggle in hooks_llamacpp.go: these config-layer heuristic switches run
	// deep in the defaults pipeline with no ApplicationConfig in scope to plumb
	// the setting through.
	//nolint:forbidigo // config-layer heuristic toggle, mirrors LOCALAI_DISABLE_GUESSING
	raw := os.Getenv("LOCALAI_DISABLE_HARDWARE_DEFAULTS")

	// A parse error (empty or garbage value) means "not disabled": only an
	// explicit boolean true switches the auto-tuning off.
	disabled, err := strconv.ParseBool(raw)
	return err == nil && disabled
}
|
||||
|
||||
// Hardware-driven model-config defaults.
|
||||
//
|
||||
// This sits alongside the other config overriders (ApplyInferenceDefaults for
|
||||
@@ -103,17 +117,36 @@ func PhysicalBatchForContext(g GPU, ctx int) int {
|
||||
if !g.IsNVIDIABlackwell() {
|
||||
return DefaultPhysicalBatch
|
||||
}
|
||||
if ctx <= 0 {
|
||||
ctx = DefaultContextSize
|
||||
}
|
||||
if g.VRAM == 0 {
|
||||
return DefaultPhysicalBatch
|
||||
}
|
||||
extra := uint64(ctx) * uint64(BlackwellPhysicalBatch-DefaultPhysicalBatch) * computeBufferBytesPerCell
|
||||
if extra <= g.VRAM/blackwellBatchHeadroomDivisor {
|
||||
return BlackwellPhysicalBatch
|
||||
if largeContextForDevice(g, ctx) {
|
||||
return DefaultPhysicalBatch
|
||||
}
|
||||
return DefaultPhysicalBatch
|
||||
return BlackwellPhysicalBatch
|
||||
}
|
||||
|
||||
// largeContextForDevice reports whether the given context is large relative to
|
||||
// the per-device VRAM ceiling — the shared "tight single-model fit" signal that
|
||||
// suppresses BOTH throughput-oriented defaults (the Blackwell batch boost and
|
||||
// the concurrency slot count). It sizes the extra compute-buffer scratch a
|
||||
// raised batch would need at this context (which grows ~n_ubatch * n_ctx and
|
||||
// is allocated per device) and asks whether it overflows a fraction of the
|
||||
// device VRAM; when it does, the device has no headroom to spend on throughput
|
||||
// and the conservative defaults must hold (issue #10485).
|
||||
//
|
||||
// g.VRAM must be the PER-DEVICE ceiling (the smallest device on a multi-GPU
|
||||
// host). VRAM 0 (unknown) is treated as not-large so detection gaps don't
|
||||
// silently disable the defaults.
|
||||
func largeContextForDevice(g GPU, ctx int) bool {
|
||||
if g.VRAM == 0 {
|
||||
return false
|
||||
}
|
||||
if ctx <= 0 {
|
||||
ctx = DefaultContextSize
|
||||
}
|
||||
extra := uint64(ctx) * uint64(BlackwellPhysicalBatch-DefaultPhysicalBatch) * computeBufferBytesPerCell
|
||||
return extra > g.VRAM/blackwellBatchHeadroomDivisor
|
||||
}
|
||||
|
||||
// IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns.
|
||||
@@ -152,17 +185,50 @@ func DefaultParallelSlots(g GPU) int {
|
||||
}
|
||||
}
|
||||
|
||||
// EnsureParallelOption appends a VRAM-scaled "parallel:N" backend option when the
|
||||
// model doesn't already set one (and the GPU warrants concurrency). Returns the
|
||||
// possibly-extended options. Shared by the single-host config path
|
||||
// (ApplyHardwareDefaults) and the distributed router (per selected node).
|
||||
func EnsureParallelOption(opts []string, gpu GPU) []string {
|
||||
if slots := DefaultParallelSlots(gpu); slots > 1 && !hasParallelOption(opts) {
|
||||
// ParallelSlotsForContext is DefaultParallelSlots gated on per-device VRAM
|
||||
// headroom for the given context. A large context already claims most of a
|
||||
// single device's VRAM (the KV cache plus the per-slot compute/checkpoint
|
||||
// scratch that scales with n_seq_max), so defaulting multiple slots there
|
||||
// pushes a tight single-model fit into per-device CUDA OOM (issue #10485): the
|
||||
// model loads but the final allocation (e.g. an MTP draft context's KV cache)
|
||||
// overflows the tighter card by a few hundred MiB. Returns 1 (no concurrency)
|
||||
// in that tight regime, otherwise the VRAM-scaled DefaultParallelSlots.
|
||||
//
|
||||
// g.VRAM must be the PER-DEVICE ceiling (smallest device on a multi-GPU host).
|
||||
// It shares largeContextForDevice with the batch boost so both throughput
|
||||
// defaults are suppressed together; the GB10 / unified-memory path reports
|
||||
// system RAM and so keeps full concurrency even at large contexts.
|
||||
func ParallelSlotsForContext(g GPU, ctx int) int {
|
||||
slots := DefaultParallelSlots(g)
|
||||
if slots <= 1 || g.VRAM == 0 {
|
||||
return slots
|
||||
}
|
||||
if largeContextForDevice(g, ctx) {
|
||||
return 1
|
||||
}
|
||||
return slots
|
||||
}
|
||||
|
||||
// EnsureParallelOptionForContext appends a VRAM-scaled "parallel:N" backend
|
||||
// option when the model doesn't already set one and the GPU warrants (and has
|
||||
// headroom for) concurrency at this context. Returns the possibly-extended
|
||||
// options. Shared by the single-host config path (ApplyHardwareDefaults) and
|
||||
// the distributed router (per selected node).
|
||||
func EnsureParallelOptionForContext(opts []string, gpu GPU, ctx int) []string {
|
||||
if slots := ParallelSlotsForContext(gpu, ctx); slots > 1 && !hasParallelOption(opts) {
|
||||
return append(opts, fmt.Sprintf("parallel:%d", slots))
|
||||
}
|
||||
return opts
|
||||
}
|
||||
|
||||
// EnsureParallelOption is EnsureParallelOptionForContext with no known context
|
||||
// (defaults to DefaultContextSize, which clears the headroom gate on any device
|
||||
// large enough to warrant concurrency). Kept for callers without a model
|
||||
// context.
|
||||
func EnsureParallelOption(opts []string, gpu GPU) []string {
|
||||
return EnsureParallelOptionForContext(opts, gpu, 0)
|
||||
}
|
||||
|
||||
// hasParallelOption reports whether the model already sets parallel/n_parallel
|
||||
// so we never override an explicit value (helper shared with serving_defaults.go).
|
||||
func hasParallelOption(opts []string) bool {
|
||||
@@ -192,18 +258,18 @@ var localGPU = func() GPU {
|
||||
// and were left unset by the user. Currently: a larger physical batch on
// Blackwell and a VRAM-scaled parallel slot count for concurrent serving.
// Explicit config always wins (we only touch values the user left unset).
|
||||
func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
|
||||
if cfg == nil {
|
||||
if cfg == nil || HardwareDefaultsDisabled() {
|
||||
return
|
||||
}
|
||||
// Raise the physical batch on Blackwell only when the resulting compute
|
||||
// buffer fits the per-device VRAM at THIS model's context. Leaving Batch at 0
|
||||
// (rather than writing the default 512) preserves the downstream single-pass
|
||||
// sizing in core/backend.EffectiveBatchSize for embedding/score/rerank.
|
||||
ctx := DefaultContextSize
|
||||
if cfg.ContextSize != nil {
|
||||
ctx = *cfg.ContextSize
|
||||
}
|
||||
if cfg.Batch == 0 {
|
||||
ctx := DefaultContextSize
|
||||
if cfg.ContextSize != nil {
|
||||
ctx = *cfg.ContextSize
|
||||
}
|
||||
if PhysicalBatchForContext(gpu, ctx) == BlackwellPhysicalBatch {
|
||||
cfg.Batch = BlackwellPhysicalBatch
|
||||
xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
|
||||
@@ -214,13 +280,14 @@ func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
|
||||
// Enable concurrent serving by default on a capable GPU: without this the
|
||||
// llama.cpp backend runs n_parallel=1 and serializes multi-user requests
|
||||
// (continuous batching stays off). Unified KV means the slots share the
|
||||
// context budget, so this is concurrency without extra KV memory. Explicit
|
||||
// parallel/n_parallel in the model options always wins.
|
||||
// context budget, but a context large enough to fill a single device leaves
|
||||
// no room for the per-slot scratch, so the slot count is gated on per-device
|
||||
// headroom too (issue #10485). Explicit parallel/n_parallel always wins.
|
||||
if before := len(cfg.Options); true {
|
||||
cfg.Options = EnsureParallelOption(cfg.Options, gpu)
|
||||
cfg.Options = EnsureParallelOptionForContext(cfg.Options, gpu, ctx)
|
||||
if len(cfg.Options) > before {
|
||||
xlog.Debug("[hardware_defaults] defaulting parallel slots for concurrent serving",
|
||||
"option", cfg.Options[len(cfg.Options)-1], "vram_gib", gpu.VRAM>>30)
|
||||
"option", cfg.Options[len(cfg.Options)-1], "context", ctx, "vram_gib", gpu.VRAM>>30)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -90,6 +90,15 @@ var _ = Describe("Hardware-driven config defaults", func() {
|
||||
It("no-ops on nil", func() {
|
||||
Expect(func() { ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"}) }).ToNot(Panic())
|
||||
})
|
||||
|
||||
It("applies nothing when hardware defaults are disabled via env", func() {
|
||||
GinkgoT().Setenv("LOCALAI_DISABLE_HARDWARE_DEFAULTS", "true")
|
||||
Expect(HardwareDefaultsDisabled()).To(BeTrue())
|
||||
cfg := &ModelConfig{}
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
|
||||
Expect(cfg.Batch).To(Equal(0))
|
||||
Expect(cfg.Options).To(BeEmpty())
|
||||
})
|
||||
})
|
||||
|
||||
DescribeTable("DefaultParallelSlots (by VRAM)",
|
||||
@@ -105,12 +114,46 @@ var _ = Describe("Hardware-driven config defaults", func() {
|
||||
Entry("unknown 0", uint64(0), 1),
|
||||
)
|
||||
|
||||
Describe("ParallelSlotsForContext (per-device VRAM headroom)", func() {
|
||||
It("keeps the VRAM-scaled slot count when the context fits the device", func() {
|
||||
// 16 GiB card, small context: plenty of room for concurrency.
|
||||
Expect(ParallelSlotsForContext(GPU{VRAM: 16 * gib}, 8192)).To(Equal(4))
|
||||
})
|
||||
It("drops to a single slot when a large context already fills the device", func() {
|
||||
// Regression guard for issue #10485: 16 GiB consumer Blackwell, ~200k
|
||||
// context. Even with unified KV, the per-slot compute/checkpoint
|
||||
// scratch from 4 slots is the straw that overflows the tighter device.
|
||||
Expect(ParallelSlotsForContext(GPU{VRAM: 16 * gib}, 204800)).To(Equal(1))
|
||||
})
|
||||
It("keeps concurrency on a large unified-memory device (GB10)", func() {
|
||||
// GB10 reports system RAM (~119 GiB): a 200k context leaves headroom.
|
||||
Expect(ParallelSlotsForContext(GPU{VRAM: 119 * gib}, 204800)).To(Equal(8))
|
||||
})
|
||||
It("keeps concurrency on a big datacenter card with a large context", func() {
|
||||
// 80 GiB A100: 200k context is a small fraction, concurrency stays.
|
||||
Expect(ParallelSlotsForContext(GPU{VRAM: 80 * gib}, 204800)).To(Equal(8))
|
||||
})
|
||||
It("stays a single slot on small/unknown VRAM regardless of context", func() {
|
||||
Expect(ParallelSlotsForContext(GPU{VRAM: 2 * gib}, 8192)).To(Equal(1))
|
||||
Expect(ParallelSlotsForContext(GPU{}, 8192)).To(Equal(1))
|
||||
})
|
||||
})
|
||||
|
||||
Describe("ApplyHardwareDefaults parallel slots", func() {
|
||||
It("adds a VRAM-scaled parallel option on a capable GPU", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
|
||||
Expect(cfg.Options).To(ContainElement("parallel:8"))
|
||||
})
|
||||
It("adds no parallel option when a large context already fills one device", func() {
|
||||
// Regression guard for issue #10485: 16 GiB card + ~200k context. The
|
||||
// model barely fits; defaulting concurrency tips the tighter GPU into
|
||||
// CUDA OOM during the final (MTP draft) KV allocation.
|
||||
ctx := 204800
|
||||
cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}}
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.0", VRAM: 16 * gib})
|
||||
Expect(cfg.Options).ToNot(ContainElement(ContainSubstring("parallel")))
|
||||
})
|
||||
It("scales the slot count down with VRAM", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyHardwareDefaults(cfg, GPU{VRAM: 24 * gib})
|
||||
|
||||
@@ -147,7 +147,7 @@ type scheduleLoadResult struct {
|
||||
// Only values the heuristics themselves manage are touched, so an explicit user
|
||||
// batch (e.g. 1024) is never overridden.
|
||||
func applyNodeHardwareDefaults(opts *pb.ModelOptions, node *BackendNode) {
|
||||
if opts == nil || node == nil {
|
||||
if opts == nil || node == nil || config.HardwareDefaultsDisabled() {
|
||||
return
|
||||
}
|
||||
gpu := config.GPU{
|
||||
@@ -162,8 +162,11 @@ func applyNodeHardwareDefaults(opts *pb.ModelOptions, node *BackendNode) {
|
||||
opts.NBatch = int32(config.PhysicalBatchForContext(gpu, int(opts.ContextSize)))
|
||||
}
|
||||
// Default concurrent serving for the selected node (the frontend that built
|
||||
// the options may have no GPU). Only adds when no parallel option is set.
|
||||
opts.Options = config.EnsureParallelOption(opts.Options, gpu)
|
||||
// the options may have no GPU). Gated on the node's per-device VRAM at this
|
||||
// model's context, so a large context that already fills the device can't
|
||||
// tip it into OOM by adding slot scratch (issue #10485). Only adds when no
|
||||
// parallel option is set.
|
||||
opts.Options = config.EnsureParallelOptionForContext(opts.Options, gpu, int(opts.ContextSize))
|
||||
}
|
||||
|
||||
// scheduleAndLoad is the shared core for loading a model on a new node.
|
||||
|
||||
@@ -41,6 +41,14 @@ var _ = Describe("applyNodeHardwareDefaults", func() {
|
||||
Expect(opts.Options).To(ContainElement("parallel:8"))
|
||||
})
|
||||
|
||||
It("adds no parallel option when a large context already fills the node device", func() {
|
||||
// Regression guard for issue #10485: a 16 GiB node with a ~200k context
|
||||
// is a tight single-model fit — the slot scratch would tip it into OOM.
|
||||
opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, ContextSize: 204800}
|
||||
applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.0", TotalVRAM: 16 << 30})
|
||||
Expect(opts.Options).ToNot(ContainElement(ContainSubstring("parallel")))
|
||||
})
|
||||
|
||||
It("never overrides an explicit parallel option on the node path", func() {
|
||||
opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, Options: []string{"parallel:2"}}
|
||||
applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1", TotalVRAM: 119 << 30})
|
||||
|
||||
@@ -537,6 +537,16 @@ options:
|
||||
|
||||
**Note:** The `parallel` option can also be set via the `LLAMACPP_PARALLEL` environment variable, and `grpc_servers` can be set via the `LLAMACPP_GRPC_SERVERS` environment variable. Options specified in the YAML file take precedence over environment variables.
|
||||
|
||||
##### Hardware auto-tuning (and how to override it)
|
||||
|
||||
On a detected GPU, LocalAI fills a few performance-relevant defaults the model config leaves unset — a larger physical batch on NVIDIA Blackwell, and a VRAM-scaled `parallel` slot count for concurrent serving. Both are gated on **per-device** VRAM at the model's context: when a large context already fills a single card (e.g. a 27B model with a 200k context across 2×16 GiB), the batch boost and the extra parallel slots are suppressed so they can't tip the tighter GPU into CUDA out-of-memory.
|
||||
|
||||
Anything you set explicitly in the model YAML always wins, so to pin a value just set it (e.g. `batch: 512` or `options: ["parallel:1"]`). The effective values are logged at `INFO` when a model loads (`effective runtime tuning …`). To turn the hardware auto-tuning off entirely and run llama.cpp's stock behavior, set:
|
||||
|
||||
```
|
||||
LOCALAI_DISABLE_HARDWARE_DEFAULTS=true
|
||||
```
|
||||
|
||||
##### Server-side prompt cache (repeated system prompts)
|
||||
|
||||
Agents, coding assistants, and Anthropic/OpenAI-compatible CLIs typically resend the same large system prompt on every turn. The llama.cpp server can short-circuit prefill for the matching prefix by stashing idle slot KV states in host RAM and reloading them on a hit. Three settings interact:
|
||||
|
||||
@@ -169,11 +169,41 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
|
||||
}
|
||||
}
|
||||
|
||||
// parallelSlotsFromOptions extracts the effective n_parallel from the backend
// option strings. It returns the whitespace-trimmed value of the first entry
// whose key is exactly "parallel" or "n_parallel" (as in "parallel:N" /
// "n_parallel:N"), and "1" — the llama.cpp default — when no such entry exists.
// Entries without a ':' separator are ignored. Used only for the
// effective-tuning load log.
func parallelSlotsFromOptions(opts []string) string {
	for i := range opts {
		key, value, found := strings.Cut(opts[i], ":")
		if !found {
			// Not a key:value option, so it cannot carry a slot count.
			continue
		}
		switch key {
		case "parallel", "n_parallel":
			return strings.TrimSpace(value)
		}
	}
	return "1"
}
|
||||
|
||||
func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err error) {
|
||||
o := NewOptions(opts...)
|
||||
|
||||
xlog.Info("BackendLoader starting", "modelID", o.modelID, "backend", o.backendString, "model", o.model)
|
||||
|
||||
// Surface the effective performance-relevant runtime options at load (some of
|
||||
// these are auto-tuned for the detected hardware). Logged once per load so an
|
||||
// admin can see what will actually run and pin or override any value in the
|
||||
// model YAML — or set LOCALAI_DISABLE_HARDWARE_DEFAULTS=true to turn the
|
||||
// hardware auto-tuning off entirely. Gated on an LLM-ish load (context set) so
|
||||
// TTS/audio/other backends stay quiet.
|
||||
if opt := o.gRPCOptions; opt != nil && opt.ContextSize > 0 {
|
||||
xlog.Info("effective runtime tuning (override in the model YAML; LOCALAI_DISABLE_HARDWARE_DEFAULTS=true disables hardware auto-tuning)",
|
||||
"modelID", o.modelID,
|
||||
"context", opt.ContextSize,
|
||||
"n_batch", opt.NBatch,
|
||||
"n_gpu_layers", opt.NGPULayers,
|
||||
"parallel", parallelSlotsFromOptions(opt.Options),
|
||||
"flash_attention", opt.FlashAttention,
|
||||
"f16", opt.F16Memory)
|
||||
}
|
||||
|
||||
backend := strings.ToLower(o.backendString)
|
||||
if realBackend, exists := Aliases[backend]; exists {
|
||||
typeAlias, exists := TypeAlias[backend]
|
||||
|
||||
19
pkg/model/initializers_internal_test.go
Normal file
19
pkg/model/initializers_internal_test.go
Normal file
@@ -0,0 +1,19 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("parallelSlotsFromOptions", func() {
|
||||
It("reads the parallel slot count from the backend options", func() {
|
||||
Expect(parallelSlotsFromOptions([]string{"use_jinja:true", "parallel:4"})).To(Equal("4"))
|
||||
})
|
||||
It("accepts the n_parallel alias", func() {
|
||||
Expect(parallelSlotsFromOptions([]string{"n_parallel:8"})).To(Equal("8"))
|
||||
})
|
||||
It("defaults to a single slot when unset", func() {
|
||||
Expect(parallelSlotsFromOptions([]string{"use_jinja:true"})).To(Equal("1"))
|
||||
Expect(parallelSlotsFromOptions(nil)).To(Equal("1"))
|
||||
})
|
||||
})
|
||||
Reference in New Issue
Block a user