feat(config): default swa_full:true for sliding-window-attention models (#10611)

LocalAI enables a cross-request prompt-prefix cache (cache_reuse, see core/config/serving_defaults.go) so repeated prefixes — system prompts, RAG context, agent scaffolds, multi-turn chat — are not reprocessed every turn. For sliding-window-attention (SWA) models (Gemma 2/3, Cohere2, Llama 4, ...) this silently does nothing: llama.cpp defaults to a reduced SWA KV cache sized to the sliding window, and that reduced cache cannot preserve a prompt prefix across requests, so every turn reprocesses the whole prompt anyway. llama.cpp's --swa-full (params.swa_full, already wired through the LocalAI llama.cpp backend's `swa_full` option) keeps the full KV cache so the shared prefix is reused. Enable it automatically, but only for models that are actually SWA: detection reads the gguf-parser-normalized `<arch>.attention.sliding_window` metadata (which also applies llama.cpp's family rules, e.g. Phi-3 → not SWA), right where the GGUF is already parsed for defaults. It is never applied to dense models (pure memory waste) and never overrides an explicit user `swa_full`/`n_swa` choice. Tradeoff: the full SWA cache scales with context_size, so it costs more memory at large contexts — hence the SWA gating and the documented `swa_full:false` opt-out.

Assisted-by: Claude:claude-opus-4-8 [Claude Code] golangci-lint
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-30 19:37:00 -04:00 · 2026-06-30 17:58:17 +02:00
parent fd8cebd0b3
commit 02b007a31e
4 changed files with 187 additions and 1 deletions
--- a/core/config/gguf.go
+++ b/core/config/gguf.go
@@ -67,6 +67,16 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 		ApplyMTPDefaults(cfg, n)
 	}

+	// Sliding-window-attention models (Gemma 2/3, Cohere2, Llama 4, ...) ship
+	// with a reduced SWA KV cache by default, which cannot reuse a prompt
+	// prefix across requests and so defeats the cross-request prefix cache
+	// (cache_reuse) we enable in serving_defaults.go. Enable the full SWA cache
+	// for these models so the prefix survives; skipped for dense models and
+	// when the user already pinned an SWA cache option.
+	if w, ok := HasSlidingWindowAttention(f); ok {
+		ApplySWAFullDefault(cfg, w)
+	}
+
 	// Thinking support detection is done after model load via DetectThinkingSupportFromBackend

 	// template estimations
--- a/core/config/swa.go
+++ b/core/config/swa.go
@@ -0,0 +1,56 @@
+package config
+
+import (
+	gguf "github.com/gpustack/gguf-parser-go"
+	"github.com/mudler/xlog"
+)
+
+// swaCacheOptionNames lists the backend option keys that control the
+// sliding-window-attention KV cache. ApplySWAFullDefault treats any of them
+// already present in a model's Options as a user pin and adds nothing.
+var swaCacheOptionNames = []string{"swa_full", "n_swa"}
+
+// HasSlidingWindowAttention reports whether the parsed GGUF describes a
+// sliding-window-attention (SWA) model — Gemma 2/3, Cohere2, Llama 4 and the
+// like — and returns the window size alongside the verdict. It reads
+// GGUFArchitecture.AttentionSlidingWindow, into which gguf-parser normalizes
+// the per-architecture `<arch>.attention.sliding_window` key using llama.cpp's
+// family rules (e.g. Phi-3 carries the key but does not run SWA, so it is
+// normalized to 0). A nil file yields (0, false); otherwise the window is
+// returned and the bool is true only when that window is non-zero.
+func HasSlidingWindowAttention(f *gguf.GGUFFile) (uint64, bool) {
+	if f == nil {
+		return 0, false
+	}
+	w := f.Architecture().AttentionSlidingWindow
+	return w, w > 0
+}
+
+// ApplySWAFullDefault enables the full-size SWA KV cache (swa_full:true) for a
+// sliding-window model, unless the user already pinned an SWA cache option.
+//
+// Why: llama.cpp defaults to a reduced SWA KV cache sized to the sliding window
+// (memory-light), but that reduced cache cannot preserve a prompt prefix across
+// requests. So for SWA models the cross-request prefix cache we enable in
+// serving_defaults.go (cache_reuse) is silently defeated — every turn
+// reprocesses the entire prompt. Setting swa_full:true makes llama.cpp keep the
+// full KV cache so the shared prefix is actually reused.
+//
+// The tradeoff is memory: the full SWA cache scales with context_size, so this
+// is gated to models that are genuinely SWA and never overrides an explicit
+// user choice. It is a no-op when cfg is nil or slidingWindow is 0 (a dense
+// model, where the full cache would only waste memory); a non-zero
+// slidingWindow is otherwise used only in the debug log lines.
+func ApplySWAFullDefault(cfg *ModelConfig, slidingWindow uint64) {
+	if cfg == nil || slidingWindow == 0 {
+		return
+	}
+	if backendOptionSet(cfg.Options, swaCacheOptionNames...) {
+		xlog.Debug("[swa] sliding-window model but an SWA cache option is already set; leaving user choice intact",
+			"name", cfg.Name, "sliding_window", slidingWindow)
+		return
+	}
+	cfg.Options = append(cfg.Options, "swa_full:true")
+	xlog.Debug("[swa] enabling swa_full for sliding-window model so the cross-request prompt-prefix cache survives (reduced SWA cache cannot reuse a prefix across requests)",
+		"name", cfg.Name, "sliding_window", slidingWindow)
+}
--- a/core/config/swa_test.go
+++ b/core/config/swa_test.go
@@ -0,0 +1,120 @@
+package config_test
+
+import (
+	. "github.com/mudler/LocalAI/core/config"
+
+	gguf "github.com/gpustack/gguf-parser-go"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// ggufWithSlidingWindow fabricates a minimal in-memory GGUF carrying the given
+// `general.architecture` and, when window > 0, `<arch>.attention.sliding_window`
+// so SWA detection can be exercised without a real model file. A window of 0
+// omits the key, leaving the outcome to the parser's per-architecture rules.
+func ggufWithSlidingWindow(arch string, window uint32) *gguf.GGUFFile {
+	kvs := gguf.GGUFMetadataKVs{
+		{
+			Key:       "general.architecture",
+			ValueType: gguf.GGUFMetadataValueTypeString,
+			Value:     arch,
+		},
+	}
+	if window > 0 {
+		kvs = append(kvs, gguf.GGUFMetadataKV{
+			Key:       arch + ".attention.sliding_window",
+			ValueType: gguf.GGUFMetadataValueTypeUint32,
+			Value:     window,
+		})
+	}
+	return &gguf.GGUFFile{
+		Header: gguf.GGUFHeader{MetadataKV: kvs},
+	}
+}
+
+var _ = Describe("SWA full-cache auto-default", func() {
+	Context("HasSlidingWindowAttention", func() {
+		It("returns false on a nil GGUF file", func() {
+			w, ok := HasSlidingWindowAttention(nil)
+			Expect(ok).To(BeFalse())
+			Expect(w).To(BeZero())
+		})
+
+		It("detects a sliding-window model (Gemma 3 style)", func() {
+			w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("gemma3", 1024))
+			Expect(ok).To(BeTrue())
+			Expect(w).To(Equal(uint64(1024)))
+		})
+
+		It("detects Gemma 2 even without an explicit key (family default window)", func() {
+			// Window 0 makes the helper omit the key; gguf-parser then applies
+			// llama.cpp's family rule for gemma2 and defaults the window to 4096.
+			w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("gemma2", 0))
+			Expect(ok).To(BeTrue())
+			Expect(w).To(Equal(uint64(4096)))
+		})
+
+		It("reports a dense model as non-SWA", func() {
+			w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("llama", 0))
+			Expect(ok).To(BeFalse())
+			Expect(w).To(BeZero())
+		})
+
+		It("treats Phi-3 as non-SWA even when the key is present", func() {
+			// Phi-3 carries attention.sliding_window but does not actually run
+			// SWA; gguf-parser normalizes it to 0 to match llama.cpp.
+			w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("phi3", 2048))
+			Expect(ok).To(BeFalse())
+			Expect(w).To(BeZero())
+		})
+	})
+
+	Context("ApplySWAFullDefault", func() {
+		It("enables swa_full for a sliding-window model when unset", func() {
+			cfg := &ModelConfig{Name: "gemma3"}
+			ApplySWAFullDefault(cfg, 1024)
+			Expect(cfg.Options).To(ContainElement("swa_full:true"))
+		})
+
+		It("is a no-op for a dense model (window 0)", func() {
+			cfg := &ModelConfig{Name: "llama"}
+			ApplySWAFullDefault(cfg, 0)
+			Expect(cfg.Options).To(BeEmpty())
+		})
+
+		It("preserves an explicit swa_full:false", func() {
+			cfg := &ModelConfig{Name: "gemma3", Options: []string{"swa_full:false"}}
+			ApplySWAFullDefault(cfg, 1024)
+			Expect(cfg.Options).To(Equal([]string{"swa_full:false"}))
+		})
+
+		It("preserves an explicit swa_full:true without duplicating it", func() {
+			cfg := &ModelConfig{Name: "gemma3", Options: []string{"swa_full:true"}}
+			ApplySWAFullDefault(cfg, 1024)
+			Expect(cfg.Options).To(Equal([]string{"swa_full:true"}))
+		})
+
+		It("respects the n_swa alias", func() {
+			cfg := &ModelConfig{Name: "gemma3", Options: []string{"n_swa:512"}}
+			ApplySWAFullDefault(cfg, 1024)
+			Expect(cfg.Options).To(Equal([]string{"n_swa:512"}))
+		})
+
+		It("preserves unrelated options already on the config", func() {
+			cfg := &ModelConfig{
+				Name:    "gemma3",
+				Options: []string{"use_jinja:true", "cache_reuse:256"},
+			}
+			ApplySWAFullDefault(cfg, 1024)
+			Expect(cfg.Options).To(Equal([]string{
+				"use_jinja:true",
+				"cache_reuse:256",
+				"swa_full:true",
+			}))
+		})
+
+		It("tolerates a nil config", func() {
+			Expect(func() { ApplySWAFullDefault(nil, 1024) }).ToNot(Panic())
+		})
+	})
+})
--- a/docs/content/features/text-generation.md
+++ b/docs/content/features/text-generation.md
@@ -507,7 +507,7 @@ The `llama.cpp` backend supports additional configuration options that can be sp
 | `fit_params_min_ctx` or `fit_ctx` | integer | Minimum context size that can be set by fit_params. Default: `4096`. | `fit_ctx:2048` |
 | `n_cache_reuse` or `cache_reuse` | integer | Minimum chunk size to attempt reusing from the cache via KV shifting. Default: `0` (disabled). | `cache_reuse:256` |
 | `slot_prompt_similarity` or `sps` | float | How much the prompt of a request must match the prompt of a slot to use that slot. Default: `0.1`. Set to `0` to disable. | `sps:0.5` |
-| `swa_full` | boolean | Use full-size SWA (Sliding Window Attention) cache. Default: `false`. | `swa_full:true` |
+| `swa_full` | boolean | Use full-size SWA (Sliding Window Attention) cache. Upstream default is `false` (a memory-light reduced cache), but that reduced cache cannot reuse a prompt prefix across requests, which defeats `cache_reuse` for SWA models (Gemma 2/3, Cohere2, Llama 4, ...). LocalAI therefore **auto-enables `swa_full:true` for GGUF models detected as SWA** so the cross-request prefix cache works; it is left off for dense models and is not added when `swa_full` or `n_swa` is already set in the model's options. The tradeoff is memory: the full SWA cache scales with `context_size`. Set `swa_full:false` explicitly to opt back out (e.g. to save memory at a large context). | `swa_full:true` |
 | `cont_batching` or `continuous_batching` | boolean | Enable continuous batching for handling multiple sequences. Default: `true`. | `cont_batching:true` |
 | `check_tensors` | boolean | Validate tensor data for invalid values during model loading. Default: `false`. | `check_tensors:true` |
 | `warmup` | boolean | Enable warmup run after model loading. Default: `true`. | `warmup:false` |