mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-23 16:20:01 -04:00
feat(config): default prompt_cache_all to true
Upstream llama.cpp defaults `cache_prompt = true` (common/common.h), but `parse_options` in the grpc-server backend unconditionally forwards the proto `PromptCacheAll` field, so any model that didn't set `prompt_cache_all: true` in its YAML was getting `cache_prompt=false` — silently overriding llama.cpp's own default. With `kv_unified` and `cache_idle_slots` already on by default, this was the last piece preventing the per-request prompt cache from being usable out of the box.

Make `PromptCacheAll` tristate (`*bool`), default it to `true` in `SetDefaults`, and dereference at the proto boundary. Users can still opt out with an explicit `prompt_cache_all: false`. Same pattern as `MMap`, `MMlock`, `Reranking`, etc.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -277,7 +277,7 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions
|
||||
MinP: float32(*c.MinP),
|
||||
Tokens: int32(*c.Maxtokens),
|
||||
Threads: int32(*c.Threads),
|
||||
PromptCacheAll: c.PromptCacheAll,
|
||||
PromptCacheAll: *c.PromptCacheAll,
|
||||
PromptCacheRO: c.PromptCacheRO,
|
||||
PromptCachePath: promptCachePath,
|
||||
F16KV: *c.F16,
|
||||
|
||||
@@ -136,4 +136,36 @@ var _ = Describe("Backend hooks and parser defaults", func() {
|
||||
Expect(cfg.EngineArgs["enable_chunked_prefill"]).To(Equal(true))
|
||||
})
|
||||
})
|
||||
|
||||
Context("PromptCacheAll default", func() {
|
||||
It("defaults to true when omitted from YAML", func() {
|
||||
cfg := &ModelConfig{}
|
||||
cfg.SetDefaults()
|
||||
|
||||
Expect(cfg.PromptCacheAll).NotTo(BeNil())
|
||||
Expect(*cfg.PromptCacheAll).To(BeTrue())
|
||||
})
|
||||
|
||||
It("preserves an explicit false from YAML", func() {
|
||||
falseV := false
|
||||
cfg := &ModelConfig{
|
||||
LLMConfig: LLMConfig{PromptCacheAll: &falseV},
|
||||
}
|
||||
cfg.SetDefaults()
|
||||
|
||||
Expect(cfg.PromptCacheAll).NotTo(BeNil())
|
||||
Expect(*cfg.PromptCacheAll).To(BeFalse())
|
||||
})
|
||||
|
||||
It("preserves an explicit true from YAML", func() {
|
||||
trueV := true
|
||||
cfg := &ModelConfig{
|
||||
LLMConfig: LLMConfig{PromptCacheAll: &trueV},
|
||||
}
|
||||
cfg.SetDefaults()
|
||||
|
||||
Expect(cfg.PromptCacheAll).NotTo(BeNil())
|
||||
Expect(*cfg.PromptCacheAll).To(BeTrue())
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
@@ -209,7 +209,7 @@ type LLMConfig struct {
|
||||
RMSNormEps float32 `yaml:"rms_norm_eps,omitempty" json:"rms_norm_eps,omitempty"`
|
||||
NGQA int32 `yaml:"ngqa,omitempty" json:"ngqa,omitempty"`
|
||||
PromptCachePath string `yaml:"prompt_cache_path,omitempty" json:"prompt_cache_path,omitempty"`
|
||||
PromptCacheAll bool `yaml:"prompt_cache_all,omitempty" json:"prompt_cache_all,omitempty"`
|
||||
PromptCacheAll *bool `yaml:"prompt_cache_all,omitempty" json:"prompt_cache_all,omitempty"`
|
||||
PromptCacheRO bool `yaml:"prompt_cache_ro,omitempty" json:"prompt_cache_ro,omitempty"`
|
||||
MirostatETA *float64 `yaml:"mirostat_eta,omitempty" json:"mirostat_eta,omitempty"`
|
||||
MirostatTAU *float64 `yaml:"mirostat_tau,omitempty" json:"mirostat_tau,omitempty"`
|
||||
@@ -494,6 +494,13 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
||||
cfg.Reranking = &falseV
|
||||
}
|
||||
|
||||
if cfg.PromptCacheAll == nil {
|
||||
// Match upstream llama.cpp's default (common/common.h: cache_prompt = true)
|
||||
// and let cache_idle_slots / kv_unified actually do useful work; users can
|
||||
// opt out with an explicit `prompt_cache_all: false` in the model YAML.
|
||||
cfg.PromptCacheAll = &trueV
|
||||
}
|
||||
|
||||
if threads == 0 {
|
||||
// Threads can't be 0
|
||||
threads = 4
|
||||
|
||||
Reference in New Issue
Block a user