mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-22 07:39:02 -04:00
refactor(config): single source of truth for default values across config + backend Defaults were decided in two areas with duplicated/drifted literals: the config SetDefaults tiers vs core/backend/options.go's grpcModelOpts (which translates a ModelConfig to the backend wire format and supplied its own fallbacks). They had drifted - n_gpu_layers 9999999 (options.go) vs 99999999 (gguf.go), two 512 batch constants, context 1024 (gguf) vs 4096 (backend) scattered as bare literals. Introduce core/config/defaults.go as the canonical home (DefaultContextSize=4096, GGUFFallbackContextSize=1024, DefaultNGPULayers=99999999, DefaultFlashAttention= auto). gguf.go / hooks_llamacpp.go use them directly; core/backend references them (backend imports config, never the reverse) so DefaultContextSize/DefaultBatchSize and the flash-attn / n_gpu_layers fallbacks resolve to one place. The two context values (1024 GGUF-no-estimate vs 4096 general) are kept distinct but now named + documented, not blind literals. Behavior-preserving; config + backend suites green. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
61 lines
1.9 KiB
Go
61 lines
1.9 KiB
Go
package config
|
|
|
|
import (
|
|
"os"
|
|
"path/filepath"
|
|
|
|
gguf "github.com/gpustack/gguf-parser-go"
|
|
"github.com/mudler/xlog"
|
|
)
|
|
|
|
func init() {
|
|
// Register for both explicit llama-cpp and empty backend (auto-detect from GGUF file)
|
|
RegisterBackendHook("llama-cpp", llamaCppDefaults)
|
|
RegisterBackendHook("", llamaCppDefaults)
|
|
}
|
|
|
|
func llamaCppDefaults(cfg *ModelConfig, modelPath string) {
|
|
if os.Getenv("LOCALAI_DISABLE_GUESSING") == "true" {
|
|
xlog.Debug("llamaCppDefaults: guessing disabled")
|
|
return
|
|
}
|
|
if modelPath == "" {
|
|
return
|
|
}
|
|
|
|
guessPath := filepath.Join(modelPath, cfg.ModelFileName())
|
|
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
xlog.Error("llamaCppDefaults: panic while parsing gguf file")
|
|
}
|
|
}()
|
|
|
|
// Default context size if not set, regardless of whether GGUF parsing succeeds
|
|
defer func() {
|
|
if cfg.ContextSize == nil {
|
|
ctx := GGUFFallbackContextSize
|
|
cfg.ContextSize = &ctx
|
|
}
|
|
}()
|
|
|
|
// Startup parses every model's GGUF header to guess defaults. We only need
|
|
// scalar metadata (architecture, head/ff counts, chat_template, token IDs,
|
|
// MTP head) plus array *lengths* — never the array *contents*. Two options
|
|
// keep this cheap, which matters when many models live on slow storage such
|
|
// as a Docker volume (see https://github.com/mudler/LocalAI/issues/9790):
|
|
//
|
|
// - SkipLargeMetadata: seek past large array-valued metadata (the tokenizer
|
|
// vocab: tokenizer.ggml.tokens/scores/merges, often >100k entries) instead
|
|
// of reading and allocating every element. Lengths stay populated.
|
|
// - UseMMap: read the header via a memory map so faulting in a few pages
|
|
// replaces hundreds of thousands of tiny read() syscalls (measured ~524k
|
|
// -> 8 for a 256k-token vocab), the dominant cost on slow filesystems.
|
|
//
|
|
// The mapping is released when ParseGGUFFile returns.
|
|
f, err := gguf.ParseGGUFFile(guessPath, gguf.UseMMap(), gguf.SkipLargeMetadata())
|
|
if err == nil {
|
|
guessGGUFFromFile(cfg, f, 0)
|
|
}
|
|
}
|