mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-08 08:46:49 -04:00
When the models directory holds many GGUF files, startup parsed every model's full GGUF — including the tokenizer vocab arrays (tokenizer.ggml.tokens/scores/merges, often >100k entries) — once per model while guessing defaults. On slow storage (e.g. a models directory on a Docker volume) those hundreds of thousands of tiny reads dominate boot time before the HTTP server comes up.

The default-guessing path and the VRAM metadata reader only consume scalar metadata and array lengths, never the array contents. Parse with SkipLargeMetadata (seek past large arrays) and UseMMap (fault in a few header pages instead of issuing per-element read() syscalls). For a 256k-token vocab this cuts the parse from ~524k read() syscalls to 8. The mapping is released when ParseGGUFFile returns.

Fixes #9790
Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Adira Denis Muhando <dennisadira@gmail.com>
52 lines
1.4 KiB
Go
52 lines
1.4 KiB
Go
package vram
|
|
|
|
import (
|
|
"context"
|
|
"strings"
|
|
|
|
gguf "github.com/gpustack/gguf-parser-go"
|
|
"github.com/mudler/LocalAI/pkg/downloader"
|
|
)
|
|
|
|
// defaultGGUFReader is a stateless reader whose ReadMetadata method extracts
// GGUF architecture metadata from local files or HTTP(S) URLs using the
// gguf-parser-go library.
type defaultGGUFReader struct{}
|
|
|
|
func (defaultGGUFReader) ReadMetadata(ctx context.Context, uri string) (*GGUFMeta, error) {
|
|
u := downloader.URI(uri)
|
|
urlStr := u.ResolveURL()
|
|
|
|
if strings.HasPrefix(uri, downloader.LocalPrefix) {
|
|
// Only architecture scalars are read below, never the tokenizer vocab
|
|
// arrays, so skip them and memory-map the header to avoid a syscall
|
|
// storm on slow storage. Same rationale as the startup guessing path in
|
|
// core/config/hooks_llamacpp.go (https://github.com/mudler/LocalAI/issues/9790).
|
|
f, err := gguf.ParseGGUFFile(urlStr, gguf.UseMMap(), gguf.SkipLargeMetadata())
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return ggufFileToMeta(f), nil
|
|
}
|
|
if !u.LooksLikeHTTPURL() {
|
|
return nil, nil
|
|
}
|
|
f, err := gguf.ParseGGUFFileRemote(ctx, urlStr)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return ggufFileToMeta(f), nil
|
|
}
|
|
|
|
func ggufFileToMeta(f *gguf.GGUFFile) *GGUFMeta {
|
|
arch := f.Architecture()
|
|
meta := &GGUFMeta{
|
|
BlockCount: uint32(arch.BlockCount),
|
|
EmbeddingLength: uint32(arch.EmbeddingLength),
|
|
HeadCount: uint32(arch.AttentionHeadCount),
|
|
HeadCountKV: uint32(arch.AttentionHeadCountKV),
|
|
MaximumContextLength: arch.MaximumContextLength,
|
|
}
|
|
if meta.HeadCountKV == 0 {
|
|
meta.HeadCountKV = meta.HeadCount
|
|
}
|
|
return meta
|
|
}
|