Files
LocalAI/core/config/hooks_llamacpp.go
Adira 2c804bef5a fix(config): skip vocab arrays and mmap GGUF headers to speed up startup (#10213)
When the models directory holds many GGUF files, startup parsed every
model's full GGUF — including the tokenizer vocab arrays
(tokenizer.ggml.tokens/scores/merges, often >100k entries) — once per
model while guessing defaults. On slow storage (e.g. a models directory
on a Docker volume) those hundreds of thousands of tiny reads dominate
boot time before the HTTP server comes up.

The default-guessing path and the VRAM metadata reader only consume
scalar metadata and array lengths, never the array contents. Parse with
SkipLargeMetadata (seek past large arrays) and UseMMap (fault in a few
header pages instead of issuing per-element read() syscalls). For a
256k-token vocab this cuts the parse from ~524k read() syscalls to 8.
The mapping is released when ParseGGUFFile returns.

Fixes #9790

Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Adira Denis Muhando <dennisadira@gmail.com>
2026-06-07 23:33:52 +02:00

61 lines
1.9 KiB
Go

package config
import (
"os"
"path/filepath"
gguf "github.com/gpustack/gguf-parser-go"
"github.com/mudler/xlog"
)
func init() {
// Register for both explicit llama-cpp and empty backend (auto-detect from GGUF file)
RegisterBackendHook("llama-cpp", llamaCppDefaults)
RegisterBackendHook("", llamaCppDefaults)
}
func llamaCppDefaults(cfg *ModelConfig, modelPath string) {
if os.Getenv("LOCALAI_DISABLE_GUESSING") == "true" {
xlog.Debug("llamaCppDefaults: guessing disabled")
return
}
if modelPath == "" {
return
}
guessPath := filepath.Join(modelPath, cfg.ModelFileName())
defer func() {
if r := recover(); r != nil {
xlog.Error("llamaCppDefaults: panic while parsing gguf file")
}
}()
// Default context size if not set, regardless of whether GGUF parsing succeeds
defer func() {
if cfg.ContextSize == nil {
ctx := defaultContextSize
cfg.ContextSize = &ctx
}
}()
// Startup parses every model's GGUF header to guess defaults. We only need
// scalar metadata (architecture, head/ff counts, chat_template, token IDs,
// MTP head) plus array *lengths* — never the array *contents*. Two options
// keep this cheap, which matters when many models live on slow storage such
// as a Docker volume (see https://github.com/mudler/LocalAI/issues/9790):
//
// - SkipLargeMetadata: seek past large array-valued metadata (the tokenizer
// vocab: tokenizer.ggml.tokens/scores/merges, often >100k entries) instead
// of reading and allocating every element. Lengths stay populated.
// - UseMMap: read the header via a memory map so faulting in a few pages
// replaces hundreds of thousands of tiny read() syscalls (measured ~524k
// -> 8 for a 256k-token vocab), the dominant cost on slow filesystems.
//
// The mapping is released when ParseGGUFFile returns.
f, err := gguf.ParseGGUFFile(guessPath, gguf.UseMMap(), gguf.SkipLargeMetadata())
if err == nil {
guessGGUFFromFile(cfg, f, 0)
}
}