diff --git a/core/config/gguf.go b/core/config/gguf.go index b2fba6308..177e68749 100644 --- a/core/config/gguf.go +++ b/core/config/gguf.go @@ -67,16 +67,6 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) { ApplyMTPDefaults(cfg, n) } - // Sliding-window-attention models (Gemma 2/3, Cohere2, Llama 4, ...) ship - // with a reduced SWA KV cache by default, which cannot reuse a prompt - // prefix across requests and so defeats the cross-request prefix cache - // (cache_reuse) we enable in serving_defaults.go. Enable the full SWA cache - // for these models so the prefix survives; skipped for dense models and - // when the user already pinned an SWA cache option. - if w, ok := HasSlidingWindowAttention(f); ok { - ApplySWAFullDefault(cfg, w) - } - // Thinking support detection is done after model load via DetectThinkingSupportFromBackend // template estimations diff --git a/core/config/swa.go b/core/config/swa.go deleted file mode 100644 index 107f71a21..000000000 --- a/core/config/swa.go +++ /dev/null @@ -1,56 +0,0 @@ -package config - -import ( - gguf "github.com/gpustack/gguf-parser-go" - "github.com/mudler/xlog" -) - -// swaCacheOptionNames lists the backend option keys that control the -// sliding-window-attention KV cache. If the user pinned any of these we leave -// the SWA cache alone instead of forcing swa_full. -var swaCacheOptionNames = []string{"swa_full", "n_swa"} - -// HasSlidingWindowAttention reports whether the parsed GGUF describes a -// sliding-window-attention (SWA) model — Gemma 2/3, Cohere2, Llama 4 and the -// like. The gguf-parser library normalizes the per-architecture -// `.attention.sliding_window` metadata key into -// GGUFArchitecture.AttentionSlidingWindow, applying the same family-specific -// rules llama.cpp uses (e.g. Phi-3 carries the key but does not actually run -// SWA, and is normalized to 0). A non-zero window means the model interleaves -// SWA layers, so the returned size is also the diagnostic value we log. -func HasSlidingWindowAttention(f *gguf.GGUFFile) (uint64, bool) { - if f == nil { - return 0, false - } - w := f.Architecture().AttentionSlidingWindow - return w, w > 0 -} - -// ApplySWAFullDefault enables the full-size SWA KV cache (swa_full:true) for a -// sliding-window model, unless the user already pinned an SWA cache option. -// -// Why: llama.cpp defaults to a reduced SWA KV cache sized to the sliding window -// (memory-light), but that reduced cache cannot preserve a prompt prefix across -// requests. So for SWA models the cross-request prefix cache we enable in -// serving_defaults.go (cache_reuse) is silently defeated — every turn -// reprocesses the entire prompt. Setting swa_full:true makes llama.cpp keep the -// full KV cache so the shared prefix is actually reused. -// -// The tradeoff is memory: the full SWA cache scales with context_size, so this -// is gated to models that are genuinely SWA (never applied to dense models, -// where it would only waste memory) and never overrides an explicit user -// choice. `slidingWindow` is the value read from the GGUF and is used only for -// the diagnostic log line. -func ApplySWAFullDefault(cfg *ModelConfig, slidingWindow uint64) { - if cfg == nil || slidingWindow == 0 { - return - } - if backendOptionSet(cfg.Options, swaCacheOptionNames...) { - xlog.Debug("[swa] sliding-window model but an SWA cache option is already set; leaving user choice intact", - "name", cfg.Name, "sliding_window", slidingWindow) - return - } - cfg.Options = append(cfg.Options, "swa_full:true") - xlog.Debug("[swa] enabling swa_full for sliding-window model so the cross-request prompt-prefix cache survives (reduced SWA cache cannot reuse a prefix across requests)", - "name", cfg.Name, "sliding_window", slidingWindow) -} diff --git a/core/config/swa_test.go b/core/config/swa_test.go deleted file mode 100644 index 260a7924c..000000000 --- a/core/config/swa_test.go +++ /dev/null @@ -1,120 +0,0 @@ -package config_test - -import ( - . "github.com/mudler/LocalAI/core/config" - - gguf "github.com/gpustack/gguf-parser-go" - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" -) - -// ggufWithSlidingWindow fabricates a minimal in-memory GGUF carrying the given -// `general.architecture` and `.attention.sliding_window` so the SWA -// detection can be exercised without a real model file. A window of 0 omits the -// key, modelling a dense (non-SWA) model. -func ggufWithSlidingWindow(arch string, window uint32) *gguf.GGUFFile { - kvs := gguf.GGUFMetadataKVs{ - { - Key: "general.architecture", - ValueType: gguf.GGUFMetadataValueTypeString, - Value: arch, - }, - } - if window > 0 { - kvs = append(kvs, gguf.GGUFMetadataKV{ - Key: arch + ".attention.sliding_window", - ValueType: gguf.GGUFMetadataValueTypeUint32, - Value: window, - }) - } - return &gguf.GGUFFile{ - Header: gguf.GGUFHeader{MetadataKV: kvs}, - } -} - -var _ = Describe("SWA full-cache auto-default", func() { - Context("HasSlidingWindowAttention", func() { - It("returns false on a nil GGUF file", func() { - w, ok := HasSlidingWindowAttention(nil) - Expect(ok).To(BeFalse()) - Expect(w).To(BeZero()) - }) - - It("detects a sliding-window model (Gemma 3 style)", func() { - w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("gemma3", 1024)) - Expect(ok).To(BeTrue()) - Expect(w).To(Equal(uint64(1024))) - }) - - It("detects Gemma 2 even without an explicit key (family default window)", func() { - // gguf-parser applies llama.cpp's family rules: gemma2 defaults the - // sliding window to 4096 when the metadata key is absent. - w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("gemma2", 0)) - Expect(ok).To(BeTrue()) - Expect(w).To(Equal(uint64(4096))) - }) - - It("reports a dense model as non-SWA", func() { - w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("llama", 0)) - Expect(ok).To(BeFalse()) - Expect(w).To(BeZero()) - }) - - It("treats Phi-3 as non-SWA even when the key is present", func() { - // Phi-3 carries attention.sliding_window but does not actually run - // SWA; gguf-parser normalizes it to 0 to match llama.cpp. - w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("phi3", 2048)) - Expect(ok).To(BeFalse()) - Expect(w).To(BeZero()) - }) - }) - - Context("ApplySWAFullDefault", func() { - It("enables swa_full for a sliding-window model when unset", func() { - cfg := &ModelConfig{Name: "gemma3"} - ApplySWAFullDefault(cfg, 1024) - Expect(cfg.Options).To(ContainElement("swa_full:true")) - }) - - It("is a no-op for a dense model (window 0)", func() { - cfg := &ModelConfig{Name: "llama"} - ApplySWAFullDefault(cfg, 0) - Expect(cfg.Options).To(BeEmpty()) - }) - - It("preserves an explicit swa_full:false", func() { - cfg := &ModelConfig{Name: "gemma3", Options: []string{"swa_full:false"}} - ApplySWAFullDefault(cfg, 1024) - Expect(cfg.Options).To(Equal([]string{"swa_full:false"})) - }) - - It("preserves an explicit swa_full:true without duplicating it", func() { - cfg := &ModelConfig{Name: "gemma3", Options: []string{"swa_full:true"}} - ApplySWAFullDefault(cfg, 1024) - Expect(cfg.Options).To(Equal([]string{"swa_full:true"})) - }) - - It("respects the n_swa alias", func() { - cfg := &ModelConfig{Name: "gemma3", Options: []string{"n_swa:512"}} - ApplySWAFullDefault(cfg, 1024) - Expect(cfg.Options).To(Equal([]string{"n_swa:512"})) - }) - - It("preserves unrelated options already on the config", func() { - cfg := &ModelConfig{ - Name: "gemma3", - Options: []string{"use_jinja:true", "cache_reuse:256"}, - } - ApplySWAFullDefault(cfg, 1024) - Expect(cfg.Options).To(Equal([]string{ - "use_jinja:true", - "cache_reuse:256", - "swa_full:true", - })) - }) - - It("tolerates a nil config", func() { - Expect(func() { ApplySWAFullDefault(nil, 1024) }).ToNot(Panic()) - }) - }) -}) diff --git a/docs/content/features/text-generation.md b/docs/content/features/text-generation.md index 870684882..cadc67808 100644 --- a/docs/content/features/text-generation.md +++ b/docs/content/features/text-generation.md @@ -507,7 +507,7 @@ The `llama.cpp` backend supports additional configuration options that can be sp | `fit_params_min_ctx` or `fit_ctx` | integer | Minimum context size that can be set by fit_params. Default: `4096`. | `fit_ctx:2048` | | `n_cache_reuse` or `cache_reuse` | integer | Minimum chunk size to attempt reusing from the cache via KV shifting. Default: `0` (disabled). | `cache_reuse:256` | | `slot_prompt_similarity` or `sps` | float | How much the prompt of a request must match the prompt of a slot to use that slot. Default: `0.1`. Set to `0` to disable. | `sps:0.5` | -| `swa_full` | boolean | Use full-size SWA (Sliding Window Attention) cache. Upstream default is `false` (a memory-light reduced cache), but that reduced cache cannot reuse a prompt prefix across requests, which defeats `cache_reuse` for SWA models (Gemma 2/3, Cohere2, Llama 4, ...). LocalAI therefore **auto-enables `swa_full:true` for GGUF models detected as SWA** so the cross-request prefix cache works; it is left off for dense models. The tradeoff is memory: the full SWA cache scales with `context_size`. Set `swa_full:false` explicitly to opt back out (e.g. to save memory at a large context). | `swa_full:true` | +| `swa_full` | boolean | Use full-size SWA (Sliding Window Attention) cache. Default: `false`. | `swa_full:true` | | `cont_batching` or `continuous_batching` | boolean | Enable continuous batching for handling multiple sequences. Default: `true`. | `cont_batching:true` | | `check_tensors` | boolean | Validate tensor data for invalid values during model loading. Default: `false`. | `check_tensors:true` | | `warmup` | boolean | Enable warmup run after model loading. Default: `true`. | `warmup:false` |