From e0c22e308e8f5ba2b1b4270597f8d97f097b1e26 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 22 May 2026 19:43:03 +0000 Subject: [PATCH] feat(config): default prompt_cache_all to true MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upstream llama.cpp defaults `cache_prompt = true` (common/common.h), but `parse_options` in the grpc-server backend unconditionally forwards the proto `PromptCacheAll` field, so any model that didn't set `prompt_cache_all: true` in its YAML was getting `cache_prompt=false` — silently overriding llama.cpp's own default. With `kv_unified` and `cache_idle_slots` already on by default, this was the last piece preventing the per-request prompt cache from being usable out of the box. Make `PromptCacheAll` tristate (`*bool`), default it to `true` in `SetDefaults`, and dereference at the proto boundary. Users can still opt out with an explicit `prompt_cache_all: false`. Same pattern as `MMap`, `MMlock`, `Reranking`, etc. Co-Authored-By: Claude Opus 4.7 (1M context) --- core/backend/options.go | 2 +- core/config/hooks_test.go | 32 ++++++++++++++++++++++++++++++++ core/config/model_config.go | 9 ++++++++- 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/core/backend/options.go b/core/backend/options.go index ba8cab88b..73985d8fe 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -277,7 +277,7 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions MinP: float32(*c.MinP), Tokens: int32(*c.Maxtokens), Threads: int32(*c.Threads), - PromptCacheAll: c.PromptCacheAll, + PromptCacheAll: *c.PromptCacheAll, PromptCacheRO: c.PromptCacheRO, PromptCachePath: promptCachePath, F16KV: *c.F16, diff --git a/core/config/hooks_test.go b/core/config/hooks_test.go index 12aad2558..94f4ac2e2 100644 --- a/core/config/hooks_test.go +++ b/core/config/hooks_test.go @@ -136,4 +136,36 @@ var _ = Describe("Backend hooks and parser defaults", func() { Expect(cfg.EngineArgs["enable_chunked_prefill"]).To(Equal(true)) }) }) + + Context("PromptCacheAll default", func() { + It("defaults to true when omitted from YAML", func() { + cfg := &ModelConfig{} + cfg.SetDefaults() + + Expect(cfg.PromptCacheAll).NotTo(BeNil()) + Expect(*cfg.PromptCacheAll).To(BeTrue()) + }) + + It("preserves an explicit false from YAML", func() { + falseV := false + cfg := &ModelConfig{ + LLMConfig: LLMConfig{PromptCacheAll: &falseV}, + } + cfg.SetDefaults() + + Expect(cfg.PromptCacheAll).NotTo(BeNil()) + Expect(*cfg.PromptCacheAll).To(BeFalse()) + }) + + It("preserves an explicit true from YAML", func() { + trueV := true + cfg := &ModelConfig{ + LLMConfig: LLMConfig{PromptCacheAll: &trueV}, + } + cfg.SetDefaults() + + Expect(cfg.PromptCacheAll).NotTo(BeNil()) + Expect(*cfg.PromptCacheAll).To(BeTrue()) + }) + }) }) diff --git a/core/config/model_config.go b/core/config/model_config.go index f14bc4a4e..c49a87f4c 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -209,7 +209,7 @@ type LLMConfig struct { RMSNormEps float32 `yaml:"rms_norm_eps,omitempty" json:"rms_norm_eps,omitempty"` NGQA int32 `yaml:"ngqa,omitempty" json:"ngqa,omitempty"` PromptCachePath string `yaml:"prompt_cache_path,omitempty" json:"prompt_cache_path,omitempty"` - PromptCacheAll bool `yaml:"prompt_cache_all,omitempty" json:"prompt_cache_all,omitempty"` + PromptCacheAll *bool `yaml:"prompt_cache_all,omitempty" json:"prompt_cache_all,omitempty"` PromptCacheRO bool `yaml:"prompt_cache_ro,omitempty" json:"prompt_cache_ro,omitempty"` MirostatETA *float64 `yaml:"mirostat_eta,omitempty" json:"mirostat_eta,omitempty"` MirostatTAU *float64 `yaml:"mirostat_tau,omitempty" json:"mirostat_tau,omitempty"` @@ -494,6 +494,13 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) { cfg.Reranking = &falseV } + if cfg.PromptCacheAll == nil { + // Match upstream llama.cpp's default (common/common.h: cache_prompt = true) + // and let cache_idle_slots / kv_unified actually do useful work; users can + // opt out with an explicit `prompt_cache_all: false` in the model YAML. + cfg.PromptCacheAll = &trueV + } + if threads == 0 { // Threads can't be 0 threads = 4