mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-13 03:09:03 -04:00
When the models directory holds many GGUF files, startup parsed every model's full GGUF — including the tokenizer vocab arrays (tokenizer.ggml.tokens/scores/merges, often >100k entries) — once per model while guessing defaults. On slow storage (e.g. a models directory on a Docker volume) those hundreds of thousands of tiny reads dominate boot time before the HTTP server comes up. The default-guessing path and the VRAM metadata reader only consume scalar metadata and array lengths, never the array contents. Parse with SkipLargeMetadata (seek past large arrays) and UseMMap (fault in a few header pages instead of issuing per-element read() syscalls). For a 256k-token vocab this cuts the parse from ~524k read() syscalls to 8. The mapping is released when ParseGGUFFile returns. Fixes #9790 Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Adira Denis Muhando <dennisadira@gmail.com>
61 lines
1.9 KiB
Go
61 lines
1.9 KiB
Go
package config
|
|
|
|
import (
|
|
"os"
|
|
"path/filepath"
|
|
|
|
gguf "github.com/gpustack/gguf-parser-go"
|
|
"github.com/mudler/xlog"
|
|
)
|
|
|
|
func init() {
|
|
// Register for both explicit llama-cpp and empty backend (auto-detect from GGUF file)
|
|
RegisterBackendHook("llama-cpp", llamaCppDefaults)
|
|
RegisterBackendHook("", llamaCppDefaults)
|
|
}
|
|
|
|
func llamaCppDefaults(cfg *ModelConfig, modelPath string) {
|
|
if os.Getenv("LOCALAI_DISABLE_GUESSING") == "true" {
|
|
xlog.Debug("llamaCppDefaults: guessing disabled")
|
|
return
|
|
}
|
|
if modelPath == "" {
|
|
return
|
|
}
|
|
|
|
guessPath := filepath.Join(modelPath, cfg.ModelFileName())
|
|
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
xlog.Error("llamaCppDefaults: panic while parsing gguf file")
|
|
}
|
|
}()
|
|
|
|
// Default context size if not set, regardless of whether GGUF parsing succeeds
|
|
defer func() {
|
|
if cfg.ContextSize == nil {
|
|
ctx := defaultContextSize
|
|
cfg.ContextSize = &ctx
|
|
}
|
|
}()
|
|
|
|
// Startup parses every model's GGUF header to guess defaults. We only need
|
|
// scalar metadata (architecture, head/ff counts, chat_template, token IDs,
|
|
// MTP head) plus array *lengths* — never the array *contents*. Two options
|
|
// keep this cheap, which matters when many models live on slow storage such
|
|
// as a Docker volume (see https://github.com/mudler/LocalAI/issues/9790):
|
|
//
|
|
// - SkipLargeMetadata: seek past large array-valued metadata (the tokenizer
|
|
// vocab: tokenizer.ggml.tokens/scores/merges, often >100k entries) instead
|
|
// of reading and allocating every element. Lengths stay populated.
|
|
// - UseMMap: read the header via a memory map so faulting in a few pages
|
|
// replaces hundreds of thousands of tiny read() syscalls (measured ~524k
|
|
// -> 8 for a 256k-token vocab), the dominant cost on slow filesystems.
|
|
//
|
|
// The mapping is released when ParseGGUFFile returns.
|
|
f, err := gguf.ParseGGUFFile(guessPath, gguf.UseMMap(), gguf.SkipLargeMetadata())
|
|
if err == nil {
|
|
guessGGUFFromFile(cfg, f, 0)
|
|
}
|
|
}
|