mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-21 07:08:50 -04:00
feat(config): prefix caching default + consolidate scattered defaults (#10415)
* feat(config): enable cross-request prefix caching for serving (Phase 2) The llama.cpp backend ships n_cache_reuse=0 (cross-request KV prefix reuse via shifting disabled). Enable it by default (256) so repeated prefixes - system prompts, RAG context, agent scaffolds, multi-turn chat - aren't recomputed. This is the universally-useful part of 'paged attention' (shared-prefix reuse, which the upstream maintainers themselves identify as where paged attn actually helps) and needs none of the block-KV machinery. Lives in a serving_defaults.go sibling to hardware_defaults.go (device-driven vs serving-policy defaults); both run from SetDefaults and only fill unset values. Explicit cache_reuse/n_cache_reuse always wins. Device-independent, so it propagates to distributed nodes via the model options with no router change. Shares the backendOptionSet helper with the Phase-1 parallel default. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * refactor(config): extract generic fallback defaults into ApplyGenericDefaults Behavior-preserving: move the inline sampling-param + runtime-flag fallbacks out of SetDefaults into ApplyGenericDefaults, completing the domain-grouped tiers (ApplyInferenceDefaults=family, ApplyHardwareDefaults=device, ApplyServingDefaults =serving, ApplyGenericDefaults=generic fallbacks). SetDefaults is now a clean orchestrator. Same order (runs after the family/hardware/serving tiers so those win) and same conditions (TopK gated on UsesLlamaSamplerDefaults, MMap on XPU). No behavior change; full config suite green. (NGPULayers stays in the GGUF-read path for now - it's device-driven but coupled to model-size detection; a separate follow-up.) Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
115
core/config/generic_defaults.go
Normal file
115
core/config/generic_defaults.go
Normal file
@@ -0,0 +1,115 @@
|
||||
package config
|
||||
|
||||
import "os"
|
||||
|
||||
// ApplyGenericDefaults fills the generic fallback values applied after the
|
||||
// higher-priority tiers (ApplyInferenceDefaults for the model family,
|
||||
// ApplyHardwareDefaults for the device, ApplyServingDefaults for serving
|
||||
// policy): sampling parameters and a few runtime flags. Like the other tiers it
|
||||
// only fills values still left unset, so model-family / explicit config wins.
|
||||
func ApplyGenericDefaults(cfg *ModelConfig) {
|
||||
if cfg == nil {
|
||||
return
|
||||
}
|
||||
|
||||
// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
|
||||
defaultTopP := 0.95
|
||||
defaultTopK := 40
|
||||
defaultMinP := 0.0
|
||||
defaultTemp := 0.9
|
||||
// https://github.com/mudler/LocalAI/issues/2780
|
||||
defaultMirostat := 0
|
||||
defaultMirostatTAU := 5.0
|
||||
defaultMirostatETA := 0.1
|
||||
defaultTypicalP := 1.0
|
||||
defaultTFZ := 1.0
|
||||
defaultZero := 0
|
||||
|
||||
trueV := true
|
||||
falseV := false
|
||||
|
||||
if cfg.Seed == nil {
|
||||
// random number generator seed
|
||||
defaultSeed := RAND_SEED
|
||||
cfg.Seed = &defaultSeed
|
||||
}
|
||||
|
||||
// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
|
||||
// native default differs (issue #6632). Only inject it for the llama.cpp
|
||||
// family and the empty/auto backend; leave TopK nil for known non-llama
|
||||
// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
|
||||
// is 0 rather than a silently-changed 40.
|
||||
if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
|
||||
cfg.TopK = &defaultTopK
|
||||
}
|
||||
|
||||
if cfg.MinP == nil {
|
||||
cfg.MinP = &defaultMinP
|
||||
}
|
||||
|
||||
if cfg.TypicalP == nil {
|
||||
cfg.TypicalP = &defaultTypicalP
|
||||
}
|
||||
|
||||
if cfg.TFZ == nil {
|
||||
cfg.TFZ = &defaultTFZ
|
||||
}
|
||||
|
||||
if cfg.MMap == nil {
|
||||
// MMap is enabled by default
|
||||
|
||||
// Only exception is for Intel GPUs
|
||||
if os.Getenv("XPU") != "" {
|
||||
cfg.MMap = &falseV
|
||||
} else {
|
||||
cfg.MMap = &trueV
|
||||
}
|
||||
}
|
||||
|
||||
if cfg.MMlock == nil {
|
||||
// MMlock is disabled by default
|
||||
cfg.MMlock = &falseV
|
||||
}
|
||||
|
||||
if cfg.TopP == nil {
|
||||
cfg.TopP = &defaultTopP
|
||||
}
|
||||
if cfg.Temperature == nil {
|
||||
cfg.Temperature = &defaultTemp
|
||||
}
|
||||
|
||||
if cfg.Maxtokens == nil {
|
||||
cfg.Maxtokens = &defaultZero
|
||||
}
|
||||
|
||||
if cfg.Mirostat == nil {
|
||||
cfg.Mirostat = &defaultMirostat
|
||||
}
|
||||
|
||||
if cfg.MirostatETA == nil {
|
||||
cfg.MirostatETA = &defaultMirostatETA
|
||||
}
|
||||
|
||||
if cfg.MirostatTAU == nil {
|
||||
cfg.MirostatTAU = &defaultMirostatTAU
|
||||
}
|
||||
|
||||
if cfg.LowVRAM == nil {
|
||||
cfg.LowVRAM = &falseV
|
||||
}
|
||||
|
||||
if cfg.Embeddings == nil {
|
||||
cfg.Embeddings = &falseV
|
||||
}
|
||||
|
||||
if cfg.Reranking == nil {
|
||||
cfg.Reranking = &falseV
|
||||
}
|
||||
|
||||
if cfg.PromptCacheAll == nil {
|
||||
// Match upstream llama.cpp's default (common/common.h: cache_prompt = true)
|
||||
// and let cache_idle_slots / kv_unified actually do useful work; users can
|
||||
// opt out with an explicit `prompt_cache_all: false` in the model YAML.
|
||||
cfg.PromptCacheAll = &trueV
|
||||
}
|
||||
}
|
||||
36
core/config/generic_defaults_test.go
Normal file
36
core/config/generic_defaults_test.go
Normal file
@@ -0,0 +1,36 @@
|
||||
package config_test
|
||||
|
||||
import (
|
||||
. "github.com/mudler/LocalAI/core/config"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("ApplyGenericDefaults (generic fallback tier)", func() {
|
||||
It("fills sampling + runtime fallbacks when unset", func() {
|
||||
cfg := &ModelConfig{} // empty backend uses the llama sampler defaults
|
||||
ApplyGenericDefaults(cfg)
|
||||
Expect(cfg.TopP).ToNot(BeNil())
|
||||
Expect(*cfg.TopP).To(Equal(0.95))
|
||||
Expect(*cfg.TopK).To(Equal(40))
|
||||
Expect(*cfg.Temperature).To(Equal(0.9))
|
||||
Expect(*cfg.MMap).To(BeTrue())
|
||||
Expect(*cfg.MMlock).To(BeFalse())
|
||||
Expect(*cfg.PromptCacheAll).To(BeTrue())
|
||||
})
|
||||
|
||||
It("never overrides explicit values", func() {
|
||||
tk := 7
|
||||
tp := 0.5
|
||||
cfg := &ModelConfig{}
|
||||
cfg.TopK = &tk
|
||||
cfg.TopP = &tp
|
||||
ApplyGenericDefaults(cfg)
|
||||
Expect(*cfg.TopK).To(Equal(7))
|
||||
Expect(*cfg.TopP).To(Equal(0.5))
|
||||
})
|
||||
|
||||
It("no-ops on nil", func() {
|
||||
Expect(func() { ApplyGenericDefaults(nil) }).ToNot(Panic())
|
||||
})
|
||||
})
|
||||
@@ -111,19 +111,9 @@ func EnsureParallelOption(opts []string, gpu GPU) []string {
|
||||
}
|
||||
|
||||
// hasParallelOption reports whether the model already sets parallel/n_parallel
|
||||
// (backend options are "name:value" strings) so we never override an explicit value.
|
||||
// so we never override an explicit value (helper shared with serving_defaults.go).
|
||||
func hasParallelOption(opts []string) bool {
|
||||
for _, o := range opts {
|
||||
name := o
|
||||
if i := strings.IndexByte(o, ':'); i >= 0 {
|
||||
name = o[:i]
|
||||
}
|
||||
switch strings.TrimSpace(strings.ToLower(name)) {
|
||||
case "parallel", "n_parallel":
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
return backendOptionSet(opts, "parallel", "n_parallel")
|
||||
}
|
||||
|
||||
// localGPU builds a GPU descriptor from local detection, used by SetDefaults on
|
||||
|
||||
@@ -1126,107 +1126,17 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
||||
// heuristics for the selected node's GPU before loading. Explicit config wins.
|
||||
ApplyHardwareDefaults(cfg, localGPU())
|
||||
|
||||
// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
|
||||
defaultTopP := 0.95
|
||||
defaultTopK := 40
|
||||
defaultMinP := 0.0
|
||||
defaultTemp := 0.9
|
||||
// https://github.com/mudler/LocalAI/issues/2780
|
||||
defaultMirostat := 0
|
||||
defaultMirostatTAU := 5.0
|
||||
defaultMirostatETA := 0.1
|
||||
defaultTypicalP := 1.0
|
||||
defaultTFZ := 1.0
|
||||
defaultZero := 0
|
||||
// Apply serving-policy defaults (device-independent): cross-request prefix
|
||||
// caching. Propagates to distributed nodes via the model options.
|
||||
ApplyServingDefaults(cfg)
|
||||
|
||||
// Generic fallback defaults (sampling params + runtime flags), applied after
|
||||
// the model-family / hardware / serving tiers above. Only fills unset values.
|
||||
ApplyGenericDefaults(cfg)
|
||||
|
||||
trueV := true
|
||||
falseV := false
|
||||
|
||||
if cfg.Seed == nil {
|
||||
// random number generator seed
|
||||
defaultSeed := RAND_SEED
|
||||
cfg.Seed = &defaultSeed
|
||||
}
|
||||
|
||||
// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
|
||||
// native default differs (issue #6632). Only inject it for the llama.cpp
|
||||
// family and the empty/auto backend; leave TopK nil for known non-llama
|
||||
// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
|
||||
// is 0 rather than a silently-changed 40.
|
||||
if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
|
||||
cfg.TopK = &defaultTopK
|
||||
}
|
||||
|
||||
if cfg.MinP == nil {
|
||||
cfg.MinP = &defaultMinP
|
||||
}
|
||||
|
||||
if cfg.TypicalP == nil {
|
||||
cfg.TypicalP = &defaultTypicalP
|
||||
}
|
||||
|
||||
if cfg.TFZ == nil {
|
||||
cfg.TFZ = &defaultTFZ
|
||||
}
|
||||
|
||||
if cfg.MMap == nil {
|
||||
// MMap is enabled by default
|
||||
|
||||
// Only exception is for Intel GPUs
|
||||
if os.Getenv("XPU") != "" {
|
||||
cfg.MMap = &falseV
|
||||
} else {
|
||||
cfg.MMap = &trueV
|
||||
}
|
||||
}
|
||||
|
||||
if cfg.MMlock == nil {
|
||||
// MMlock is disabled by default
|
||||
cfg.MMlock = &falseV
|
||||
}
|
||||
|
||||
if cfg.TopP == nil {
|
||||
cfg.TopP = &defaultTopP
|
||||
}
|
||||
if cfg.Temperature == nil {
|
||||
cfg.Temperature = &defaultTemp
|
||||
}
|
||||
|
||||
if cfg.Maxtokens == nil {
|
||||
cfg.Maxtokens = &defaultZero
|
||||
}
|
||||
|
||||
if cfg.Mirostat == nil {
|
||||
cfg.Mirostat = &defaultMirostat
|
||||
}
|
||||
|
||||
if cfg.MirostatETA == nil {
|
||||
cfg.MirostatETA = &defaultMirostatETA
|
||||
}
|
||||
|
||||
if cfg.MirostatTAU == nil {
|
||||
cfg.MirostatTAU = &defaultMirostatTAU
|
||||
}
|
||||
|
||||
if cfg.LowVRAM == nil {
|
||||
cfg.LowVRAM = &falseV
|
||||
}
|
||||
|
||||
if cfg.Embeddings == nil {
|
||||
cfg.Embeddings = &falseV
|
||||
}
|
||||
|
||||
if cfg.Reranking == nil {
|
||||
cfg.Reranking = &falseV
|
||||
}
|
||||
|
||||
if cfg.PromptCacheAll == nil {
|
||||
// Match upstream llama.cpp's default (common/common.h: cache_prompt = true)
|
||||
// and let cache_idle_slots / kv_unified actually do useful work; users can
|
||||
// opt out with an explicit `prompt_cache_all: false` in the model YAML.
|
||||
cfg.PromptCacheAll = &trueV
|
||||
}
|
||||
|
||||
if threads == 0 {
|
||||
// Threads can't be 0
|
||||
threads = 4
|
||||
|
||||
56
core/config/serving_defaults.go
Normal file
56
core/config/serving_defaults.go
Normal file
@@ -0,0 +1,56 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/mudler/xlog"
|
||||
)
|
||||
|
||||
// Serving-policy model-config defaults.
|
||||
//
|
||||
// Sibling to hardware_defaults.go: those fill values driven by the target
|
||||
// *device* (Blackwell batch, VRAM-scaled parallel slots); these fill values
|
||||
// that improve multi-request / multi-user *serving* regardless of the GPU. They
|
||||
// run together from SetDefaults and only ever fill values the user left unset.
|
||||
|
||||
// DefaultCacheReuse is the cache_reuse value applied when the model options do
// not set one: the minimum shared-prefix chunk, in tokens, that the backend
// reuses across requests via KV-cache shifting.
//
// The llama.cpp backend ships with this disabled (n_cache_reuse = 0). Enabling
// it means repeated prefixes (system prompts, RAG context, agent scaffolds,
// multi-turn chat) are not recomputed. This is the universally-useful part of
// "paged attention" (cross-request prefix sharing) and needs none of the
// block-KV machinery.
const DefaultCacheReuse = 256
|
||||
|
||||
// ApplyServingDefaults fills serving-policy ModelConfig values the user left
|
||||
// unset. Currently: enable cross-request prefix caching. Explicit
|
||||
// cache_reuse/n_cache_reuse in the model options always wins.
|
||||
func ApplyServingDefaults(cfg *ModelConfig) {
|
||||
if cfg == nil {
|
||||
return
|
||||
}
|
||||
if !backendOptionSet(cfg.Options, "cache_reuse", "n_cache_reuse") {
|
||||
cfg.Options = append(cfg.Options, fmt.Sprintf("cache_reuse:%d", DefaultCacheReuse))
|
||||
xlog.Debug("[serving_defaults] enabling cross-request prefix cache",
|
||||
"cache_reuse", DefaultCacheReuse)
|
||||
}
|
||||
}
|
||||
|
||||
// backendOptionSet reports whether the backend options already set any of names.
// Options are "name:value" strings (or bare "name"); used so we never override
// an explicit value. Shared with hardware_defaults.go.
//
// Only the part of each option before the first ':' is considered. Both the
// option names and the queried names are lowercased and stripped of surrounding
// whitespace before comparison, so the match is case-insensitive on both sides.
// It returns false when opts or names is empty.
func backendOptionSet(opts []string, names ...string) bool {
	if len(opts) == 0 || len(names) == 0 {
		return false
	}

	normalize := func(s string) string {
		return strings.TrimSpace(strings.ToLower(s))
	}

	// Normalize the queried names once instead of once per option.
	wanted := make([]string, len(names))
	for i, n := range names {
		wanted[i] = normalize(n)
	}

	for _, o := range opts {
		name := o
		if i := strings.IndexByte(o, ':'); i >= 0 {
			name = o[:i]
		}
		name = normalize(name)
		for _, w := range wanted {
			if name == w {
				return true
			}
		}
	}
	return false
}
|
||||
30
core/config/serving_defaults_test.go
Normal file
30
core/config/serving_defaults_test.go
Normal file
@@ -0,0 +1,30 @@
|
||||
package config_test
|
||||
|
||||
import (
|
||||
. "github.com/mudler/LocalAI/core/config"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("Serving-policy config defaults", func() {
|
||||
Describe("ApplyServingDefaults (cross-request prefix cache)", func() {
|
||||
It("enables cache_reuse when unset", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyServingDefaults(cfg)
|
||||
Expect(cfg.Options).To(ContainElement("cache_reuse:256"))
|
||||
})
|
||||
It("never overrides an explicit cache_reuse", func() {
|
||||
cfg := &ModelConfig{Options: []string{"cache_reuse:0"}}
|
||||
ApplyServingDefaults(cfg)
|
||||
Expect(cfg.Options).To(Equal([]string{"cache_reuse:0"}))
|
||||
})
|
||||
It("recognizes the n_cache_reuse alias", func() {
|
||||
cfg := &ModelConfig{Options: []string{"n_cache_reuse:512"}}
|
||||
ApplyServingDefaults(cfg)
|
||||
Expect(cfg.Options).To(Equal([]string{"n_cache_reuse:512"}))
|
||||
})
|
||||
It("no-ops on nil", func() {
|
||||
Expect(func() { ApplyServingDefaults(nil) }).ToNot(Panic())
|
||||
})
|
||||
})
|
||||
})
|
||||
Reference in New Issue
Block a user