diff --git a/core/config/hardware_defaults.go b/core/config/hardware_defaults.go
index 2ed54265f..114785ce4 100644
--- a/core/config/hardware_defaults.go
+++ b/core/config/hardware_defaults.go
@@ -111,19 +111,9 @@ func EnsureParallelOption(opts []string, gpu GPU) []string {
 }
 
 // hasParallelOption reports whether the model already sets parallel/n_parallel
-// (backend options are "name:value" strings) so we never override an explicit value.
+// so we never override an explicit value; the lookup is done by backendOptionSet.
 func hasParallelOption(opts []string) bool {
-	for _, o := range opts {
-		name := o
-		if i := strings.IndexByte(o, ':'); i >= 0 {
-			name = o[:i]
-		}
-		switch strings.TrimSpace(strings.ToLower(name)) {
-		case "parallel", "n_parallel":
-			return true
-		}
-	}
-	return false
+	return backendOptionSet(opts, "parallel", "n_parallel")
 }
 
 // localGPU builds a GPU descriptor from local detection, used by SetDefaults on
diff --git a/core/config/model_config.go b/core/config/model_config.go
index 75136ec6c..df6a24d5f 100644
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -1116,6 +1116,10 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 	// heuristics for the selected node's GPU before loading. Explicit config wins.
 	ApplyHardwareDefaults(cfg, localGPU())
 
+	// Apply serving-policy defaults (device-independent): cross-request prefix
+	// caching. Propagates to distributed nodes via the model options.
+	ApplyServingDefaults(cfg)
+
 	// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
 	defaultTopP := 0.95
 	defaultTopK := 40
diff --git a/core/config/serving_defaults.go b/core/config/serving_defaults.go
new file mode 100644
index 000000000..3b10e7000
--- /dev/null
+++ b/core/config/serving_defaults.go
@@ -0,0 +1,56 @@
+package config
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/mudler/xlog"
+)
+
+// Serving-policy model-config defaults.
+//
+// This file is the counterpart of hardware_defaults.go. The defaults there are
+// derived from the target *device* (Blackwell batch, VRAM-scaled parallel
+// slots); the defaults here help multi-request / multi-user *serving* on any
+// GPU. Both run from SetDefaults and only fill values the user left unset.
+
+// DefaultCacheReuse is the smallest shared-prefix chunk, in tokens, that the
+// backend reuses across requests via KV-cache shifting. The llama.cpp backend
+// ships with this off (n_cache_reuse = 0); turning it on means repeated
+// prefixes (system prompts, RAG context, agent scaffolds, multi-turn chat) are
+// not recomputed. This is the universally-useful part of "paged attention"
+// (cross-request prefix sharing) and needs none of the block-KV machinery.
+const DefaultCacheReuse = 256
+
+// ApplyServingDefaults fills in serving-policy options the user did not set on
+// cfg. For now that is a single option: cross-request prefix caching. An
+// explicit cache_reuse or n_cache_reuse option is kept; a nil cfg is a no-op.
+func ApplyServingDefaults(cfg *ModelConfig) {
+	// Nothing to do without a config, or when the user already chose a value.
+	if cfg == nil || backendOptionSet(cfg.Options, "cache_reuse", "n_cache_reuse") {
+		return
+	}
+	opt := fmt.Sprintf("cache_reuse:%d", DefaultCacheReuse)
+	cfg.Options = append(cfg.Options, opt)
+	xlog.Debug("[serving_defaults] enabling cross-request prefix cache",
+		"cache_reuse", DefaultCacheReuse)
+}
+
+// backendOptionSet reports whether opts already contains an entry for any of
+// names. Entries are "name:value" strings (or a bare "name"); it lets callers
+// avoid overriding an explicit value. hardware_defaults.go uses it too.
+func backendOptionSet(opts []string, names ...string) bool {
+	for _, opt := range opts {
+		// Keep only the part before the first ':'; a bare "name" is kept whole.
+		key, _, _ := strings.Cut(opt, ":")
+		key = strings.TrimSpace(strings.ToLower(key))
+
+		// Option names are lower-cased above, so names must be lower case too.
+		for _, want := range names {
+			if key == want {
+				return true
+			}
+		}
+	}
+	return false
+}
diff --git a/core/config/serving_defaults_test.go b/core/config/serving_defaults_test.go
new file mode 100644
index 000000000..2a5bba72a
--- /dev/null
+++ b/core/config/serving_defaults_test.go
@@ -0,0 +1,30 @@
+package config_test
+
+import (
+	. "github.com/mudler/LocalAI/core/config"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("Serving-policy config defaults", func() {
+	Describe("ApplyServingDefaults (cross-request prefix cache)", func() {
+		It("enables cache_reuse when unset", func() {
+			cfg := &ModelConfig{}
+			ApplyServingDefaults(cfg)
+			Expect(cfg.Options).To(ContainElement("cache_reuse:256"))
+		})
+		It("never overrides an explicit cache_reuse", func() {
+			cfg := &ModelConfig{Options: []string{"cache_reuse:0"}}
+			ApplyServingDefaults(cfg)
+			Expect(cfg.Options).To(Equal([]string{"cache_reuse:0"}))
+		})
+		It("recognizes the n_cache_reuse alias", func() {
+			cfg := &ModelConfig{Options: []string{"n_cache_reuse:512"}}
+			ApplyServingDefaults(cfg)
+			Expect(cfg.Options).To(Equal([]string{"n_cache_reuse:512"}))
+		})
+		It("no-ops on nil", func() {
+			Expect(func() { ApplyServingDefaults(nil) }).ToNot(Panic())
+		})
+	})
+})