mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-22 07:39:02 -04:00
* feat(config): enable cross-request prefix caching for serving (Phase 2) The llama.cpp backend ships n_cache_reuse=0 (cross-request KV prefix reuse via shifting disabled). Enable it by default (256) so repeated prefixes - system prompts, RAG context, agent scaffolds, multi-turn chat - aren't recomputed. This is the universally-useful part of 'paged attention' (shared-prefix reuse, which the upstream maintainers themselves identify as where paged attn actually helps) and needs none of the block-KV machinery. Lives in a serving_defaults.go sibling to hardware_defaults.go (device-driven vs serving-policy defaults); both run from SetDefaults and only fill unset values. Explicit cache_reuse/n_cache_reuse always wins. Device-independent, so it propagates to distributed nodes via the model options with no router change. Shares the backendOptionSet helper with the Phase-1 parallel default. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * refactor(config): extract generic fallback defaults into ApplyGenericDefaults Behavior-preserving: move the inline sampling-param + runtime-flag fallbacks out of SetDefaults into ApplyGenericDefaults, completing the domain-grouped tiers (ApplyInferenceDefaults=family, ApplyHardwareDefaults=device, ApplyServingDefaults =serving, ApplyGenericDefaults=generic fallbacks). SetDefaults is now a clean orchestrator. Same order (runs after the family/hardware/serving tiers so those win) and same conditions (TopK gated on UsesLlamaSamplerDefaults, MMap on XPU). No behavior change; full config suite green. (NGPULayers stays in the GGUF-read path for now - it's device-driven but coupled to model-size detection; a separate follow-up.) Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
116 lines
2.7 KiB
Go
116 lines
2.7 KiB
Go
package config
|
|
|
|
import "os"
|
|
|
|
// ApplyGenericDefaults fills the generic fallback values applied after the
|
|
// higher-priority tiers (ApplyInferenceDefaults for the model family,
|
|
// ApplyHardwareDefaults for the device, ApplyServingDefaults for serving
|
|
// policy): sampling parameters and a few runtime flags. Like the other tiers it
|
|
// only fills values still left unset, so model-family / explicit config wins.
|
|
func ApplyGenericDefaults(cfg *ModelConfig) {
|
|
if cfg == nil {
|
|
return
|
|
}
|
|
|
|
// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
|
|
defaultTopP := 0.95
|
|
defaultTopK := 40
|
|
defaultMinP := 0.0
|
|
defaultTemp := 0.9
|
|
// https://github.com/mudler/LocalAI/issues/2780
|
|
defaultMirostat := 0
|
|
defaultMirostatTAU := 5.0
|
|
defaultMirostatETA := 0.1
|
|
defaultTypicalP := 1.0
|
|
defaultTFZ := 1.0
|
|
defaultZero := 0
|
|
|
|
trueV := true
|
|
falseV := false
|
|
|
|
if cfg.Seed == nil {
|
|
// random number generator seed
|
|
defaultSeed := RAND_SEED
|
|
cfg.Seed = &defaultSeed
|
|
}
|
|
|
|
// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
|
|
// native default differs (issue #6632). Only inject it for the llama.cpp
|
|
// family and the empty/auto backend; leave TopK nil for known non-llama
|
|
// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
|
|
// is 0 rather than a silently-changed 40.
|
|
if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
|
|
cfg.TopK = &defaultTopK
|
|
}
|
|
|
|
if cfg.MinP == nil {
|
|
cfg.MinP = &defaultMinP
|
|
}
|
|
|
|
if cfg.TypicalP == nil {
|
|
cfg.TypicalP = &defaultTypicalP
|
|
}
|
|
|
|
if cfg.TFZ == nil {
|
|
cfg.TFZ = &defaultTFZ
|
|
}
|
|
|
|
if cfg.MMap == nil {
|
|
// MMap is enabled by default
|
|
|
|
// Only exception is for Intel GPUs
|
|
if os.Getenv("XPU") != "" {
|
|
cfg.MMap = &falseV
|
|
} else {
|
|
cfg.MMap = &trueV
|
|
}
|
|
}
|
|
|
|
if cfg.MMlock == nil {
|
|
// MMlock is disabled by default
|
|
cfg.MMlock = &falseV
|
|
}
|
|
|
|
if cfg.TopP == nil {
|
|
cfg.TopP = &defaultTopP
|
|
}
|
|
if cfg.Temperature == nil {
|
|
cfg.Temperature = &defaultTemp
|
|
}
|
|
|
|
if cfg.Maxtokens == nil {
|
|
cfg.Maxtokens = &defaultZero
|
|
}
|
|
|
|
if cfg.Mirostat == nil {
|
|
cfg.Mirostat = &defaultMirostat
|
|
}
|
|
|
|
if cfg.MirostatETA == nil {
|
|
cfg.MirostatETA = &defaultMirostatETA
|
|
}
|
|
|
|
if cfg.MirostatTAU == nil {
|
|
cfg.MirostatTAU = &defaultMirostatTAU
|
|
}
|
|
|
|
if cfg.LowVRAM == nil {
|
|
cfg.LowVRAM = &falseV
|
|
}
|
|
|
|
if cfg.Embeddings == nil {
|
|
cfg.Embeddings = &falseV
|
|
}
|
|
|
|
if cfg.Reranking == nil {
|
|
cfg.Reranking = &falseV
|
|
}
|
|
|
|
if cfg.PromptCacheAll == nil {
|
|
// Match upstream llama.cpp's default (common/common.h: cache_prompt = true)
|
|
// and let cache_idle_slots / kv_unified actually do useful work; users can
|
|
// opt out with an explicit `prompt_cache_all: false` in the model YAML.
|
|
cfg.PromptCacheAll = &trueV
|
|
}
|
|
}
|