diff --git a/core/config/generic_defaults.go b/core/config/generic_defaults.go new file mode 100644 index 000000000..57cfba514 --- /dev/null +++ b/core/config/generic_defaults.go @@ -0,0 +1,115 @@ +package config + +import "os" + +// ApplyGenericDefaults fills the generic fallback values applied after the +// higher-priority tiers (ApplyInferenceDefaults for the model family, +// ApplyHardwareDefaults for the device, ApplyServingDefaults for serving +// policy): sampling parameters and a few runtime flags. Like the other tiers it +// only fills values still left unset, so model-family / explicit config wins. +func ApplyGenericDefaults(cfg *ModelConfig) { + if cfg == nil { + return + } + + // https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22 + defaultTopP := 0.95 + defaultTopK := 40 + defaultMinP := 0.0 + defaultTemp := 0.9 + // https://github.com/mudler/LocalAI/issues/2780 + defaultMirostat := 0 + defaultMirostatTAU := 5.0 + defaultMirostatETA := 0.1 + defaultTypicalP := 1.0 + defaultTFZ := 1.0 + defaultZero := 0 + + trueV := true + falseV := false + + if cfg.Seed == nil { + // random number generator seed + defaultSeed := RAND_SEED + cfg.Seed = &defaultSeed + } + + // top_k=40 is llama.cpp's sampling default and is wrong for backends whose + // native default differs (issue #6632). Only inject it for the llama.cpp + // family and the empty/auto backend; leave TopK nil for known non-llama + // backends (e.g. mlx, whose intended default is top_k=0) so the wire value + // is 0 rather than a silently-changed 40. + if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) { + cfg.TopK = &defaultTopK + } + + if cfg.MinP == nil { + cfg.MinP = &defaultMinP + } + + if cfg.TypicalP == nil { + cfg.TypicalP = &defaultTypicalP + } + + if cfg.TFZ == nil { + cfg.TFZ = &defaultTFZ + } + + if cfg.MMap == nil { + // MMap is enabled by default + + // Only exception is for Intel GPUs + if os.Getenv("XPU") != "" { + cfg.MMap = &falseV + } else { + cfg.MMap = &trueV + } + } + + if cfg.MMlock == nil { + // MMlock is disabled by default + cfg.MMlock = &falseV + } + + if cfg.TopP == nil { + cfg.TopP = &defaultTopP + } + if cfg.Temperature == nil { + cfg.Temperature = &defaultTemp + } + + if cfg.Maxtokens == nil { + cfg.Maxtokens = &defaultZero + } + + if cfg.Mirostat == nil { + cfg.Mirostat = &defaultMirostat + } + + if cfg.MirostatETA == nil { + cfg.MirostatETA = &defaultMirostatETA + } + + if cfg.MirostatTAU == nil { + cfg.MirostatTAU = &defaultMirostatTAU + } + + if cfg.LowVRAM == nil { + cfg.LowVRAM = &falseV + } + + if cfg.Embeddings == nil { + cfg.Embeddings = &falseV + } + + if cfg.Reranking == nil { + cfg.Reranking = &falseV + } + + if cfg.PromptCacheAll == nil { + // Match upstream llama.cpp's default (common/common.h: cache_prompt = true) + // and let cache_idle_slots / kv_unified actually do useful work; users can + // opt out with an explicit `prompt_cache_all: false` in the model YAML. + cfg.PromptCacheAll = &trueV + } +} diff --git a/core/config/generic_defaults_test.go b/core/config/generic_defaults_test.go new file mode 100644 index 000000000..7cb080c0b --- /dev/null +++ b/core/config/generic_defaults_test.go @@ -0,0 +1,36 @@ +package config_test + +import ( + . "github.com/mudler/LocalAI/core/config" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("ApplyGenericDefaults (generic fallback tier)", func() { + It("fills sampling + runtime fallbacks when unset", func() { + cfg := &ModelConfig{} // empty backend uses the llama sampler defaults + ApplyGenericDefaults(cfg) + Expect(cfg.TopP).ToNot(BeNil()) + Expect(*cfg.TopP).To(Equal(0.95)) + Expect(*cfg.TopK).To(Equal(40)) + Expect(*cfg.Temperature).To(Equal(0.9)) + Expect(*cfg.MMap).To(BeTrue()) + Expect(*cfg.MMlock).To(BeFalse()) + Expect(*cfg.PromptCacheAll).To(BeTrue()) + }) + + It("never overrides explicit values", func() { + tk := 7 + tp := 0.5 + cfg := &ModelConfig{} + cfg.TopK = &tk + cfg.TopP = &tp + ApplyGenericDefaults(cfg) + Expect(*cfg.TopK).To(Equal(7)) + Expect(*cfg.TopP).To(Equal(0.5)) + }) + + It("no-ops on nil", func() { + Expect(func() { ApplyGenericDefaults(nil) }).ToNot(Panic()) + }) +}) diff --git a/core/config/hardware_defaults.go b/core/config/hardware_defaults.go index 2ed54265f..114785ce4 100644 --- a/core/config/hardware_defaults.go +++ b/core/config/hardware_defaults.go @@ -111,19 +111,9 @@ func EnsureParallelOption(opts []string, gpu GPU) []string { } // hasParallelOption reports whether the model already sets parallel/n_parallel -// (backend options are "name:value" strings) so we never override an explicit value. +// so we never override an explicit value (helper shared with serving_defaults.go). func hasParallelOption(opts []string) bool { - for _, o := range opts { - name := o - if i := strings.IndexByte(o, ':'); i >= 0 { - name = o[:i] - } - switch strings.TrimSpace(strings.ToLower(name)) { - case "parallel", "n_parallel": - return true - } - } - return false + return backendOptionSet(opts, "parallel", "n_parallel") } // localGPU builds a GPU descriptor from local detection, used by SetDefaults on diff --git a/core/config/model_config.go b/core/config/model_config.go index 50836b99e..9586beea3 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -1126,107 +1126,17 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) { // heuristics for the selected node's GPU before loading. Explicit config wins. ApplyHardwareDefaults(cfg, localGPU()) - // https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22 - defaultTopP := 0.95 - defaultTopK := 40 - defaultMinP := 0.0 - defaultTemp := 0.9 - // https://github.com/mudler/LocalAI/issues/2780 - defaultMirostat := 0 - defaultMirostatTAU := 5.0 - defaultMirostatETA := 0.1 - defaultTypicalP := 1.0 - defaultTFZ := 1.0 - defaultZero := 0 + // Apply serving-policy defaults (device-independent): cross-request prefix + // caching. Propagates to distributed nodes via the model options. + ApplyServingDefaults(cfg) + + // Generic fallback defaults (sampling params + runtime flags), applied after + // the model-family / hardware / serving tiers above. Only fills unset values. + ApplyGenericDefaults(cfg) trueV := true falseV := false - if cfg.Seed == nil { - // random number generator seed - defaultSeed := RAND_SEED - cfg.Seed = &defaultSeed - } - - // top_k=40 is llama.cpp's sampling default and is wrong for backends whose - // native default differs (issue #6632). Only inject it for the llama.cpp - // family and the empty/auto backend; leave TopK nil for known non-llama - // backends (e.g. mlx, whose intended default is top_k=0) so the wire value - // is 0 rather than a silently-changed 40. - if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) { - cfg.TopK = &defaultTopK - } - - if cfg.MinP == nil { - cfg.MinP = &defaultMinP - } - - if cfg.TypicalP == nil { - cfg.TypicalP = &defaultTypicalP - } - - if cfg.TFZ == nil { - cfg.TFZ = &defaultTFZ - } - - if cfg.MMap == nil { - // MMap is enabled by default - - // Only exception is for Intel GPUs - if os.Getenv("XPU") != "" { - cfg.MMap = &falseV - } else { - cfg.MMap = &trueV - } - } - - if cfg.MMlock == nil { - // MMlock is disabled by default - cfg.MMlock = &falseV - } - - if cfg.TopP == nil { - cfg.TopP = &defaultTopP - } - if cfg.Temperature == nil { - cfg.Temperature = &defaultTemp - } - - if cfg.Maxtokens == nil { - cfg.Maxtokens = &defaultZero - } - - if cfg.Mirostat == nil { - cfg.Mirostat = &defaultMirostat - } - - if cfg.MirostatETA == nil { - cfg.MirostatETA = &defaultMirostatETA - } - - if cfg.MirostatTAU == nil { - cfg.MirostatTAU = &defaultMirostatTAU - } - - if cfg.LowVRAM == nil { - cfg.LowVRAM = &falseV - } - - if cfg.Embeddings == nil { - cfg.Embeddings = &falseV - } - - if cfg.Reranking == nil { - cfg.Reranking = &falseV - } - - if cfg.PromptCacheAll == nil { - // Match upstream llama.cpp's default (common/common.h: cache_prompt = true) - // and let cache_idle_slots / kv_unified actually do useful work; users can - // opt out with an explicit `prompt_cache_all: false` in the model YAML. - cfg.PromptCacheAll = &trueV - } - if threads == 0 { // Threads can't be 0 threads = 4 diff --git a/core/config/serving_defaults.go b/core/config/serving_defaults.go new file mode 100644 index 000000000..3b10e7000 --- /dev/null +++ b/core/config/serving_defaults.go @@ -0,0 +1,56 @@ +package config + +import ( + "fmt" + "strings" + + "github.com/mudler/xlog" +) + +// Serving-policy model-config defaults. +// +// Sibling to hardware_defaults.go: those fill values driven by the target +// *device* (Blackwell batch, VRAM-scaled parallel slots); these fill values +// that improve multi-request / multi-user *serving* regardless of the GPU. They +// run together from SetDefaults and only ever fill values the user left unset. + +// DefaultCacheReuse is the minimum shared-prefix chunk (in tokens) the backend +// reuses across requests via KV-cache shifting. The llama.cpp backend ships this +// disabled (n_cache_reuse = 0); we enable it so repeated prefixes (system +// prompts, RAG context, agent scaffolds, multi-turn chat) are not recomputed. +// This is the universally-useful part of "paged attention" (cross-request prefix +// sharing) and needs none of the block-KV machinery. +const DefaultCacheReuse = 256 + +// ApplyServingDefaults fills serving-policy ModelConfig values the user left +// unset. Currently: enable cross-request prefix caching. Explicit +// cache_reuse/n_cache_reuse in the model options always wins. +func ApplyServingDefaults(cfg *ModelConfig) { + if cfg == nil { + return + } + if !backendOptionSet(cfg.Options, "cache_reuse", "n_cache_reuse") { + cfg.Options = append(cfg.Options, fmt.Sprintf("cache_reuse:%d", DefaultCacheReuse)) + xlog.Debug("[serving_defaults] enabling cross-request prefix cache", + "cache_reuse", DefaultCacheReuse) + } +} + +// backendOptionSet reports whether the backend options already set any of names. +// Options are "name:value" strings (or bare "name"); used so we never override +// an explicit value. Shared with hardware_defaults.go. +func backendOptionSet(opts []string, names ...string) bool { + for _, o := range opts { + name := o + if i := strings.IndexByte(o, ':'); i >= 0 { + name = o[:i] + } + name = strings.TrimSpace(strings.ToLower(name)) + for _, n := range names { + if name == n { + return true + } + } + } + return false +} diff --git a/core/config/serving_defaults_test.go b/core/config/serving_defaults_test.go new file mode 100644 index 000000000..2a5bba72a --- /dev/null +++ b/core/config/serving_defaults_test.go @@ -0,0 +1,30 @@ +package config_test + +import ( + . "github.com/mudler/LocalAI/core/config" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("Serving-policy config defaults", func() { + Describe("ApplyServingDefaults (cross-request prefix cache)", func() { + It("enables cache_reuse when unset", func() { + cfg := &ModelConfig{} + ApplyServingDefaults(cfg) + Expect(cfg.Options).To(ContainElement("cache_reuse:256")) + }) + It("never overrides an explicit cache_reuse", func() { + cfg := &ModelConfig{Options: []string{"cache_reuse:0"}} + ApplyServingDefaults(cfg) + Expect(cfg.Options).To(Equal([]string{"cache_reuse:0"})) + }) + It("recognizes the n_cache_reuse alias", func() { + cfg := &ModelConfig{Options: []string{"n_cache_reuse:512"}} + ApplyServingDefaults(cfg) + Expect(cfg.Options).To(Equal([]string{"n_cache_reuse:512"})) + }) + It("no-ops on nil", func() { + Expect(func() { ApplyServingDefaults(nil) }).ToNot(Panic()) + }) + }) +})