mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-20 14:49:09 -04:00
feat(config): enable cross-request prefix caching for serving (Phase 2)
The llama.cpp backend ships n_cache_reuse=0 (cross-request KV prefix reuse via shifting disabled). Enable it by default (256) so repeated prefixes - system prompts, RAG context, agent scaffolds, multi-turn chat - aren't recomputed. This is the universally-useful part of 'paged attention' (shared-prefix reuse, which the upstream maintainers themselves identify as where paged attn actually helps) and needs none of the block-KV machinery. Lives in a serving_defaults.go sibling to hardware_defaults.go (device-driven vs serving-policy defaults); both run from SetDefaults and only fill unset values. Explicit cache_reuse/n_cache_reuse always wins. Device-independent, so it propagates to distributed nodes via the model options with no router change. Shares the backendOptionSet helper with the Phase-1 parallel default. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
@@ -111,19 +111,9 @@ func EnsureParallelOption(opts []string, gpu GPU) []string {
|
||||
}
|
||||
|
||||
// hasParallelOption reports whether the model already sets parallel/n_parallel
|
||||
// (backend options are "name:value" strings) so we never override an explicit value.
|
||||
// so we never override an explicit value (helper shared with serving_defaults.go).
|
||||
func hasParallelOption(opts []string) bool {
|
||||
for _, o := range opts {
|
||||
name := o
|
||||
if i := strings.IndexByte(o, ':'); i >= 0 {
|
||||
name = o[:i]
|
||||
}
|
||||
switch strings.TrimSpace(strings.ToLower(name)) {
|
||||
case "parallel", "n_parallel":
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
return backendOptionSet(opts, "parallel", "n_parallel")
|
||||
}
|
||||
|
||||
// localGPU builds a GPU descriptor from local detection, used by SetDefaults on
|
||||
|
||||
@@ -1116,6 +1116,10 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
||||
// heuristics for the selected node's GPU before loading. Explicit config wins.
|
||||
ApplyHardwareDefaults(cfg, localGPU())
|
||||
|
||||
// Apply serving-policy defaults (device-independent): cross-request prefix
|
||||
// caching. Propagates to distributed nodes via the model options.
|
||||
ApplyServingDefaults(cfg)
|
||||
|
||||
// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
|
||||
defaultTopP := 0.95
|
||||
defaultTopK := 40
|
||||
|
||||
56
core/config/serving_defaults.go
Normal file
56
core/config/serving_defaults.go
Normal file
@@ -0,0 +1,56 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/mudler/xlog"
|
||||
)
|
||||
|
||||
// Serving-policy model-config defaults.
//
// Sibling to hardware_defaults.go: those fill values driven by the target
// *device* (Blackwell batch, VRAM-scaled parallel slots); these fill values
// that improve multi-request / multi-user *serving* regardless of the GPU. They
// run together from SetDefaults and only ever fill values the user left unset.
|
||||
|
||||
// DefaultCacheReuse is the minimum shared-prefix chunk (in tokens) the backend
// reuses across requests via KV-cache shifting. The llama.cpp backend ships this
// disabled (n_cache_reuse = 0); we enable it so repeated prefixes (system
// prompts, RAG context, agent scaffolds, multi-turn chat) are not recomputed.
// This is the universally-useful part of "paged attention" (cross-request prefix
// sharing) and needs none of the block-KV machinery.
const DefaultCacheReuse = 256
|
||||
|
||||
// ApplyServingDefaults fills serving-policy ModelConfig values the user left
|
||||
// unset. Currently: enable cross-request prefix caching. Explicit
|
||||
// cache_reuse/n_cache_reuse in the model options always wins.
|
||||
func ApplyServingDefaults(cfg *ModelConfig) {
|
||||
if cfg == nil {
|
||||
return
|
||||
}
|
||||
if !backendOptionSet(cfg.Options, "cache_reuse", "n_cache_reuse") {
|
||||
cfg.Options = append(cfg.Options, fmt.Sprintf("cache_reuse:%d", DefaultCacheReuse))
|
||||
xlog.Debug("[serving_defaults] enabling cross-request prefix cache",
|
||||
"cache_reuse", DefaultCacheReuse)
|
||||
}
|
||||
}
|
||||
|
||||
// backendOptionSet reports whether the backend options already set any of names.
// Options are "name:value" strings (or bare "name"); used so we never override
// an explicit value. Shared with hardware_defaults.go.
//
// Only the text before the first ':' is considered. It is lower-cased and
// stripped of surrounding whitespace before being compared for exact equality,
// so names must be passed in lower case. Returns false for empty opts or when
// no names are given.
func backendOptionSet(opts []string, names ...string) bool {
	for _, o := range opts {
		name := o
		if i := strings.IndexByte(o, ':'); i >= 0 {
			// Drop the ":value" part; a bare option has no colon and is all name.
			name = o[:i]
		}
		name = strings.TrimSpace(strings.ToLower(name))
		for _, n := range names {
			if name == n {
				return true
			}
		}
	}
	return false
}
|
||||
30
core/config/serving_defaults_test.go
Normal file
30
core/config/serving_defaults_test.go
Normal file
@@ -0,0 +1,30 @@
|
||||
package config_test
|
||||
|
||||
import (
|
||||
. "github.com/mudler/LocalAI/core/config"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("Serving-policy config defaults", func() {
|
||||
Describe("ApplyServingDefaults (cross-request prefix cache)", func() {
|
||||
It("enables cache_reuse when unset", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyServingDefaults(cfg)
|
||||
Expect(cfg.Options).To(ContainElement("cache_reuse:256"))
|
||||
})
|
||||
It("never overrides an explicit cache_reuse", func() {
|
||||
cfg := &ModelConfig{Options: []string{"cache_reuse:0"}}
|
||||
ApplyServingDefaults(cfg)
|
||||
Expect(cfg.Options).To(Equal([]string{"cache_reuse:0"}))
|
||||
})
|
||||
It("recognizes the n_cache_reuse alias", func() {
|
||||
cfg := &ModelConfig{Options: []string{"n_cache_reuse:512"}}
|
||||
ApplyServingDefaults(cfg)
|
||||
Expect(cfg.Options).To(Equal([]string{"n_cache_reuse:512"}))
|
||||
})
|
||||
It("no-ops on nil", func() {
|
||||
Expect(func() { ApplyServingDefaults(nil) }).ToNot(Panic())
|
||||
})
|
||||
})
|
||||
})
|
||||
Reference in New Issue
Block a user