mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-22 07:39:02 -04:00
* feat(config): enable cross-request prefix caching for serving (Phase 2) The llama.cpp backend ships n_cache_reuse=0 (cross-request KV prefix reuse via shifting disabled). Enable it by default (256) so repeated prefixes - system prompts, RAG context, agent scaffolds, multi-turn chat - aren't recomputed. This is the universally-useful part of 'paged attention' (shared-prefix reuse, which the upstream maintainers themselves identify as where paged attn actually helps) and needs none of the block-KV machinery. Lives in a serving_defaults.go sibling to hardware_defaults.go (device-driven vs serving-policy defaults); both run from SetDefaults and only fill unset values. Explicit cache_reuse/n_cache_reuse always wins. Device-independent, so it propagates to distributed nodes via the model options with no router change. Shares the backendOptionSet helper with the Phase-1 parallel default. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * refactor(config): extract generic fallback defaults into ApplyGenericDefaults Behavior-preserving: move the inline sampling-param + runtime-flag fallbacks out of SetDefaults into ApplyGenericDefaults, completing the domain-grouped tiers (ApplyInferenceDefaults=family, ApplyHardwareDefaults=device, ApplyServingDefaults=serving, ApplyGenericDefaults=generic fallbacks). SetDefaults is now a clean orchestrator. Same order (runs after the family/hardware/serving tiers so those win) and same conditions (TopK gated on UsesLlamaSamplerDefaults, MMap on XPU). No behavior change; full config suite green. (NGPULayers stays in the GGUF-read path for now - it's device-driven but coupled to model-size detection; a separate follow-up.) Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
31 lines
964 B
Go
31 lines
964 B
Go
package config_test
|
|
|
|
import (
|
|
. "github.com/mudler/LocalAI/core/config"
|
|
. "github.com/onsi/ginkgo/v2"
|
|
. "github.com/onsi/gomega"
|
|
)
|
|
|
|
var _ = Describe("Serving-policy config defaults", func() {
|
|
Describe("ApplyServingDefaults (cross-request prefix cache)", func() {
|
|
It("enables cache_reuse when unset", func() {
|
|
cfg := &ModelConfig{}
|
|
ApplyServingDefaults(cfg)
|
|
Expect(cfg.Options).To(ContainElement("cache_reuse:256"))
|
|
})
|
|
It("never overrides an explicit cache_reuse", func() {
|
|
cfg := &ModelConfig{Options: []string{"cache_reuse:0"}}
|
|
ApplyServingDefaults(cfg)
|
|
Expect(cfg.Options).To(Equal([]string{"cache_reuse:0"}))
|
|
})
|
|
It("recognizes the n_cache_reuse alias", func() {
|
|
cfg := &ModelConfig{Options: []string{"n_cache_reuse:512"}}
|
|
ApplyServingDefaults(cfg)
|
|
Expect(cfg.Options).To(Equal([]string{"n_cache_reuse:512"}))
|
|
})
|
|
It("no-ops on nil", func() {
|
|
Expect(func() { ApplyServingDefaults(nil) }).ToNot(Panic())
|
|
})
|
|
})
|
|
})
|