Files
LocalAI/pkg/vram/gguf_reader.go
Adira 2c804bef5a fix(config): skip vocab arrays and mmap GGUF headers to speed up startup (#10213)
When the models directory holds many GGUF files, startup parsed every
model's full GGUF — including the tokenizer vocab arrays
(tokenizer.ggml.tokens/scores/merges, often >100k entries) — once per
model while guessing defaults. On slow storage (e.g. a models directory
on a Docker volume) those hundreds of thousands of tiny reads dominate
boot time before the HTTP server comes up.

The default-guessing path and the VRAM metadata reader only consume
scalar metadata and array lengths, never the array contents. Parse with
SkipLargeMetadata (seek past large arrays) and UseMMap (fault in a few
header pages instead of issuing per-element read() syscalls). For a
256k-token vocab this cuts the parse from ~524k read() syscalls to 8.
The mapping is released when ParseGGUFFile returns.

Fixes #9790

Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Adira Denis Muhando <dennisadira@gmail.com>
2026-06-07 23:33:52 +02:00

52 lines
1.4 KiB
Go

package vram
import (
"context"
"strings"
gguf "github.com/gpustack/gguf-parser-go"
"github.com/mudler/LocalAI/pkg/downloader"
)
// defaultGGUFReader is a stateless type whose ReadMetadata method extracts
// GGUF architecture metadata from local files or HTTP(S) URLs.
type defaultGGUFReader struct{}
// ReadMetadata parses the GGUF file identified by uri and returns its
// architecture metadata.
//
// A uri starting with downloader.LocalPrefix is parsed from disk; a uri that
// looks like an HTTP URL is parsed remotely, and ctx is passed only to that
// remote parse. Any other uri yields (nil, nil). Parser errors are returned
// unchanged.
func (defaultGGUFReader) ReadMetadata(ctx context.Context, uri string) (*GGUFMeta, error) {
	src := downloader.URI(uri)
	resolved := src.ResolveURL()

	var (
		parsed *gguf.GGUFFile
		err    error
	)
	switch {
	case strings.HasPrefix(uri, downloader.LocalPrefix):
		// ggufFileToMeta consumes architecture scalars only, never the
		// tokenizer vocab arrays. Skipping those arrays and memory-mapping
		// the header avoids a syscall storm on slow storage. The startup
		// guessing path in core/config/hooks_llamacpp.go does the same for
		// the same reason (https://github.com/mudler/LocalAI/issues/9790).
		parsed, err = gguf.ParseGGUFFile(resolved, gguf.UseMMap(), gguf.SkipLargeMetadata())
	case src.LooksLikeHTTPURL():
		parsed, err = gguf.ParseGGUFFileRemote(ctx, resolved)
	default:
		// Neither a local file nor an HTTP URL: no metadata, no error.
		return nil, nil
	}
	if err != nil {
		return nil, err
	}
	return ggufFileToMeta(parsed), nil
}
// ggufFileToMeta copies the architecture fields of a parsed GGUF file into a
// GGUFMeta. When the file reports a KV head count of zero, the attention head
// count is used in its place.
func ggufFileToMeta(f *gguf.GGUFFile) *GGUFMeta {
	a := f.Architecture()

	heads := uint32(a.AttentionHeadCount)
	kvHeads := uint32(a.AttentionHeadCountKV)
	if kvHeads == 0 {
		// Fall back to the regular head count when no KV head count is set.
		kvHeads = heads
	}

	return &GGUFMeta{
		BlockCount:           uint32(a.BlockCount),
		EmbeddingLength:      uint32(a.EmbeddingLength),
		HeadCount:            heads,
		HeadCountKV:          kvHeads,
		MaximumContextLength: a.MaximumContextLength,
	}
}