feat(config): default prompt_cache_all to true

Upstream llama.cpp defaults `cache_prompt = true` (common/common.h), but `parse_options` in the grpc-server backend unconditionally forwards the proto `PromptCacheAll` field, so any model that didn't set `prompt_cache_all: true` in its YAML was getting `cache_prompt=false` — silently overriding llama.cpp's own default.

With `kv_unified` and `cache_idle_slots` already on by default, this was the last piece preventing the per-request prompt cache from being usable out of the box.

Make `PromptCacheAll` tristate (`*bool`), default it to `true` in `SetDefaults`, and dereference it at the proto boundary. Users can still opt out with an explicit `prompt_cache_all: false`. This follows the same pattern as `MMap`, `MMlock`, `Reranking`, etc.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-22 15:50:31 -04:00 · 2026-05-22 19:43:03 +00:00
3 changed files with 41 additions and 2 deletions
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -277,7 +277,7 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions
 		MinP:                float32(*c.MinP),
 		Tokens:              int32(*c.Maxtokens),
 		Threads:             int32(*c.Threads),
-		PromptCacheAll:      c.PromptCacheAll,
+		PromptCacheAll:      *c.PromptCacheAll,
 		PromptCacheRO:       c.PromptCacheRO,
 		PromptCachePath:     promptCachePath,
 		F16KV:               *c.F16,
--- a/core/config/hooks_test.go
+++ b/core/config/hooks_test.go
@@ -136,4 +136,36 @@ var _ = Describe("Backend hooks and parser defaults", func() {
 			Expect(cfg.EngineArgs["enable_chunked_prefill"]).To(Equal(true))
 		})
 	})
+
+	Context("PromptCacheAll default", func() {
+		It("defaults to true when omitted from YAML", func() {
+			cfg := &ModelConfig{}
+			cfg.SetDefaults()
+
+			Expect(cfg.PromptCacheAll).NotTo(BeNil())
+			Expect(*cfg.PromptCacheAll).To(BeTrue())
+		})
+
+		It("preserves an explicit false from YAML", func() {
+			falseV := false
+			cfg := &ModelConfig{
+				LLMConfig: LLMConfig{PromptCacheAll: &falseV},
+			}
+			cfg.SetDefaults()
+
+			Expect(cfg.PromptCacheAll).NotTo(BeNil())
+			Expect(*cfg.PromptCacheAll).To(BeFalse())
+		})
+
+		It("preserves an explicit true from YAML", func() {
+			trueV := true
+			cfg := &ModelConfig{
+				LLMConfig: LLMConfig{PromptCacheAll: &trueV},
+			}
+			cfg.SetDefaults()
+
+			Expect(cfg.PromptCacheAll).NotTo(BeNil())
+			Expect(*cfg.PromptCacheAll).To(BeTrue())
+		})
+	})
 })
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -209,7 +209,7 @@ type LLMConfig struct {
 	RMSNormEps      float32  `yaml:"rms_norm_eps,omitempty" json:"rms_norm_eps,omitempty"`
 	NGQA            int32    `yaml:"ngqa,omitempty" json:"ngqa,omitempty"`
 	PromptCachePath string   `yaml:"prompt_cache_path,omitempty" json:"prompt_cache_path,omitempty"`
-	PromptCacheAll  bool     `yaml:"prompt_cache_all,omitempty" json:"prompt_cache_all,omitempty"`
+	PromptCacheAll  *bool    `yaml:"prompt_cache_all,omitempty" json:"prompt_cache_all,omitempty"`
 	PromptCacheRO   bool     `yaml:"prompt_cache_ro,omitempty" json:"prompt_cache_ro,omitempty"`
 	MirostatETA     *float64 `yaml:"mirostat_eta,omitempty" json:"mirostat_eta,omitempty"`
 	MirostatTAU     *float64 `yaml:"mirostat_tau,omitempty" json:"mirostat_tau,omitempty"`
@@ -494,6 +494,13 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 		cfg.Reranking = &falseV
 	}

+	if cfg.PromptCacheAll == nil {
+		// Match upstream llama.cpp's default (common/common.h: cache_prompt = true)
+		// and let cache_idle_slots / kv_unified actually do useful work; users can
+		// opt out with an explicit `prompt_cache_all: false` in the model YAML.
+		cfg.PromptCacheAll = &trueV
+	}
+
 	if threads == 0 {
 		// Threads can't be 0
 		threads = 4