feat(config): enable cross-request prefix caching for serving (Phase 2)

The llama.cpp backend ships with n_cache_reuse=0, i.e. cross-request KV prefix reuse via shifting is disabled. Enable it by default (256) so that repeated prefixes (system prompts, RAG context, agent scaffolds, multi-turn chat) aren't recomputed. This is the universally useful part of 'paged attention' (shared-prefix reuse, which the upstream maintainers themselves identify as where paged attention actually helps) and it needs none of the block-KV machinery. The default lives in serving_defaults.go, a sibling to hardware_defaults.go (device-driven vs. serving-policy defaults); both run from SetDefaults and only fill unset values. An explicit cache_reuse/n_cache_reuse always wins. The default is device-independent, so it propagates to distributed nodes via the model options with no router change. It shares the backendOptionSet helper with the Phase-1 parallel default.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-20 14:49:09 -04:00 · 2026-06-20 13:01:23 +00:00
parent e19c43cf04
commit 94b6cd6355
4 changed files with 92 additions and 12 deletions
--- a/core/config/hardware_defaults.go
+++ b/core/config/hardware_defaults.go
@@ -111,19 +111,9 @@ func EnsureParallelOption(opts []string, gpu GPU) []string {
 }

 // hasParallelOption reports whether the model already sets parallel/n_parallel
-// (backend options are "name:value" strings) so we never override an explicit value.
+// so we never override an explicit value (delegates to backendOptionSet, serving_defaults.go).
 func hasParallelOption(opts []string) bool {
-	for _, o := range opts {
-		name := o
-		if i := strings.IndexByte(o, ':'); i >= 0 {
-			name = o[:i]
-		}
-		switch strings.TrimSpace(strings.ToLower(name)) {
-		case "parallel", "n_parallel":
-			return true
-		}
-	}
-	return false
+	return backendOptionSet(opts, "parallel", "n_parallel")
 }

 // localGPU builds a GPU descriptor from local detection, used by SetDefaults on
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -1116,6 +1116,10 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 	// heuristics for the selected node's GPU before loading. Explicit config wins.
 	ApplyHardwareDefaults(cfg, localGPU())

+	// Apply serving-policy defaults (device-independent): cross-request prefix
+	// caching. Propagates to distributed nodes via the model options.
+	ApplyServingDefaults(cfg)
+
 	// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
 	defaultTopP := 0.95
 	defaultTopK := 40
--- a/core/config/serving_defaults.go
+++ b/core/config/serving_defaults.go
@@ -0,0 +1,56 @@
+package config
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/mudler/xlog"
+)
+
+// Serving-policy model-config defaults.
+//
+// Sibling to hardware_defaults.go: those fill values driven by the target
+// *device* (Blackwell batch, VRAM-scaled parallel slots); these fill values
+// that improve multi-request / multi-user *serving* regardless of the GPU. They
+// run together from SetDefaults and only ever fill values the user left unset.
+
+// DefaultCacheReuse is the value ApplyServingDefaults appends as "cache_reuse:<n>":
+// the minimum shared-prefix chunk (in tokens) the backend reuses across requests
+// via KV-cache shifting. The llama.cpp backend ships with this disabled
+// (n_cache_reuse = 0); enabling it avoids recomputing repeated prefixes (system
+// prompts, RAG context, agent scaffolds, multi-turn chat). This is the broadly
+// useful part of "paged attention" and needs none of the block-KV machinery.
+const DefaultCacheReuse = 256
+
+// ApplyServingDefaults fills serving-policy ModelConfig values the user left
+// unset. Currently it only appends "cache_reuse:<DefaultCacheReuse>" to
+// cfg.Options, and only when neither cache_reuse nor n_cache_reuse is present.
+func ApplyServingDefaults(cfg *ModelConfig) {
+	if cfg == nil { // nothing to fill in; a nil config is a no-op
+		return
+	}
+	if !backendOptionSet(cfg.Options, "cache_reuse", "n_cache_reuse") { // any explicit value, even 0, counts as set
+		cfg.Options = append(cfg.Options, fmt.Sprintf("cache_reuse:%d", DefaultCacheReuse))
+		xlog.Debug("[serving_defaults] enabling cross-request prefix cache",
+			"cache_reuse", DefaultCacheReuse)
+	}
+}
+
+// backendOptionSet reports whether any entry of opts names one of names. Each
+// entry is "name:value" or a bare "name"; the part before the first ':' is
+// trimmed and lower-cased before comparison, so names must be given lower-case.
+func backendOptionSet(opts []string, names ...string) bool {
+	for _, o := range opts {
+		name := o
+		if i := strings.IndexByte(o, ':'); i >= 0 {
+			name = o[:i] // keep only the option name, drop ":value"
+		}
+		name = strings.TrimSpace(strings.ToLower(name))
+		for _, n := range names {
+			if name == n { // exact match: n is compared as passed in, not normalized
+				return true
+			}
+		}
+	}
+	return false
+}
--- a/core/config/serving_defaults_test.go
+++ b/core/config/serving_defaults_test.go
@@ -0,0 +1,30 @@
+package config_test
+
+import (
+	. "github.com/mudler/LocalAI/core/config"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("Serving-policy config defaults", func() {
+	Describe("ApplyServingDefaults (cross-request prefix cache)", func() {
+		It("enables cache_reuse when unset", func() {
+			cfg := &ModelConfig{} // no Options at all
+			ApplyServingDefaults(cfg)
+			Expect(cfg.Options).To(ContainElement("cache_reuse:256")) // 256 == DefaultCacheReuse
+		})
+		It("never overrides an explicit cache_reuse", func() {
+			cfg := &ModelConfig{Options: []string{"cache_reuse:0"}} // explicit 0 must survive untouched
+			ApplyServingDefaults(cfg)
+			Expect(cfg.Options).To(Equal([]string{"cache_reuse:0"}))
+		})
+		It("recognizes the n_cache_reuse alias", func() {
+			cfg := &ModelConfig{Options: []string{"n_cache_reuse:512"}}
+			ApplyServingDefaults(cfg)
+			Expect(cfg.Options).To(Equal([]string{"n_cache_reuse:512"})) // nothing appended
+		})
+		It("no-ops on nil", func() {
+			Expect(func() { ApplyServingDefaults(nil) }).ToNot(Panic())
+		})
+	})
+})