mirror of
https://github.com/mudler/LocalAI.git
synced 2026-07-01 03:46:41 -04:00
LocalAI enables a cross-request prompt-prefix cache (cache_reuse, see core/config/serving_defaults.go) so repeated prefixes — system prompts, RAG context, agent scaffolds, multi-turn chat — are not reprocessed every turn. For sliding-window-attention (SWA) models (Gemma 2/3, Cohere2, Llama 4, ...) this silently does nothing: llama.cpp defaults to a reduced SWA KV cache sized to the sliding window, and that reduced cache cannot preserve a prompt prefix across requests, so every turn reprocesses the whole prompt anyway. llama.cpp's --swa-full (params.swa_full, already wired through the LocalAI llama.cpp backend's `swa_full` option) keeps the full KV cache so the shared prefix is reused. Enable it automatically, but only for models that are actually SWA: detection reads the gguf-parser-normalized `<arch>.attention.sliding_window` metadata (which also applies llama.cpp's family rules, e.g. Phi-3 → not SWA), right where the GGUF is already parsed for defaults. It is never applied to dense models (pure memory waste) and never overrides an explicit user `swa_full`/`n_swa` choice. Tradeoff: the full SWA cache scales with context_size, so it costs more memory at large contexts — hence the SWA gating and the documented `swa_full:false` opt-out. Assisted-by: Claude:claude-opus-4-8 [Claude Code] golangci-lint Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
121 lines
3.8 KiB
Go
121 lines
3.8 KiB
Go
package config_test
|
|
|
|
import (
|
|
. "github.com/mudler/LocalAI/core/config"
|
|
|
|
gguf "github.com/gpustack/gguf-parser-go"
|
|
. "github.com/onsi/ginkgo/v2"
|
|
. "github.com/onsi/gomega"
|
|
)
|
|
|
|
// ggufWithSlidingWindow fabricates a minimal in-memory GGUF carrying the given
|
|
// `general.architecture` and `<arch>.attention.sliding_window` so the SWA
|
|
// detection can be exercised without a real model file. A window of 0 omits the
|
|
// key, modelling a dense (non-SWA) model.
|
|
func ggufWithSlidingWindow(arch string, window uint32) *gguf.GGUFFile {
|
|
kvs := gguf.GGUFMetadataKVs{
|
|
{
|
|
Key: "general.architecture",
|
|
ValueType: gguf.GGUFMetadataValueTypeString,
|
|
Value: arch,
|
|
},
|
|
}
|
|
if window > 0 {
|
|
kvs = append(kvs, gguf.GGUFMetadataKV{
|
|
Key: arch + ".attention.sliding_window",
|
|
ValueType: gguf.GGUFMetadataValueTypeUint32,
|
|
Value: window,
|
|
})
|
|
}
|
|
return &gguf.GGUFFile{
|
|
Header: gguf.GGUFHeader{MetadataKV: kvs},
|
|
}
|
|
}
|
|
|
|
var _ = Describe("SWA full-cache auto-default", func() {
|
|
Context("HasSlidingWindowAttention", func() {
|
|
It("returns false on a nil GGUF file", func() {
|
|
w, ok := HasSlidingWindowAttention(nil)
|
|
Expect(ok).To(BeFalse())
|
|
Expect(w).To(BeZero())
|
|
})
|
|
|
|
It("detects a sliding-window model (Gemma 3 style)", func() {
|
|
w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("gemma3", 1024))
|
|
Expect(ok).To(BeTrue())
|
|
Expect(w).To(Equal(uint64(1024)))
|
|
})
|
|
|
|
It("detects Gemma 2 even without an explicit key (family default window)", func() {
|
|
// gguf-parser applies llama.cpp's family rules: gemma2 defaults the
|
|
// sliding window to 4096 when the metadata key is absent.
|
|
w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("gemma2", 0))
|
|
Expect(ok).To(BeTrue())
|
|
Expect(w).To(Equal(uint64(4096)))
|
|
})
|
|
|
|
It("reports a dense model as non-SWA", func() {
|
|
w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("llama", 0))
|
|
Expect(ok).To(BeFalse())
|
|
Expect(w).To(BeZero())
|
|
})
|
|
|
|
It("treats Phi-3 as non-SWA even when the key is present", func() {
|
|
// Phi-3 carries attention.sliding_window but does not actually run
|
|
// SWA; gguf-parser normalizes it to 0 to match llama.cpp.
|
|
w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("phi3", 2048))
|
|
Expect(ok).To(BeFalse())
|
|
Expect(w).To(BeZero())
|
|
})
|
|
})
|
|
|
|
Context("ApplySWAFullDefault", func() {
|
|
It("enables swa_full for a sliding-window model when unset", func() {
|
|
cfg := &ModelConfig{Name: "gemma3"}
|
|
ApplySWAFullDefault(cfg, 1024)
|
|
Expect(cfg.Options).To(ContainElement("swa_full:true"))
|
|
})
|
|
|
|
It("is a no-op for a dense model (window 0)", func() {
|
|
cfg := &ModelConfig{Name: "llama"}
|
|
ApplySWAFullDefault(cfg, 0)
|
|
Expect(cfg.Options).To(BeEmpty())
|
|
})
|
|
|
|
It("preserves an explicit swa_full:false", func() {
|
|
cfg := &ModelConfig{Name: "gemma3", Options: []string{"swa_full:false"}}
|
|
ApplySWAFullDefault(cfg, 1024)
|
|
Expect(cfg.Options).To(Equal([]string{"swa_full:false"}))
|
|
})
|
|
|
|
It("preserves an explicit swa_full:true without duplicating it", func() {
|
|
cfg := &ModelConfig{Name: "gemma3", Options: []string{"swa_full:true"}}
|
|
ApplySWAFullDefault(cfg, 1024)
|
|
Expect(cfg.Options).To(Equal([]string{"swa_full:true"}))
|
|
})
|
|
|
|
It("respects the n_swa alias", func() {
|
|
cfg := &ModelConfig{Name: "gemma3", Options: []string{"n_swa:512"}}
|
|
ApplySWAFullDefault(cfg, 1024)
|
|
Expect(cfg.Options).To(Equal([]string{"n_swa:512"}))
|
|
})
|
|
|
|
It("preserves unrelated options already on the config", func() {
|
|
cfg := &ModelConfig{
|
|
Name: "gemma3",
|
|
Options: []string{"use_jinja:true", "cache_reuse:256"},
|
|
}
|
|
ApplySWAFullDefault(cfg, 1024)
|
|
Expect(cfg.Options).To(Equal([]string{
|
|
"use_jinja:true",
|
|
"cache_reuse:256",
|
|
"swa_full:true",
|
|
}))
|
|
})
|
|
|
|
It("tolerates a nil config", func() {
|
|
Expect(func() { ApplySWAFullDefault(nil, 1024) }).ToNot(Panic())
|
|
})
|
|
})
|
|
})
|