fix(config): skip vocab arrays and mmap GGUF headers to speed up startup (#10213)

When the models directory holds many GGUF files, startup parsed every model's full GGUF — including the tokenizer vocab arrays (tokenizer.ggml.tokens/scores/merges, often >100k entries) — once per model while guessing defaults. On slow storage (e.g. a models directory on a Docker volume) those hundreds of thousands of tiny reads dominate boot time before the HTTP server comes up.

The default-guessing path and the VRAM metadata reader only consume scalar metadata and array lengths, never the array contents. Parse with SkipLargeMetadata (seek past large arrays) and UseMMap (fault in a few header pages instead of issuing per-element read() syscalls). For a 256k-token vocab this cuts the parse from ~524k read() syscalls to 8. The mapping is released when ParseGGUFFile returns.

Fixes #9790

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Adira Denis Muhando <dennisadira@gmail.com>
2026-07-30 09:57:57 -04:00 · 2026-06-08 00:33:52 +03:00
parent 6070402477
commit 2c804bef5a
3 changed files with 135 additions and 2 deletions
--- a/core/config/hooks_llamacpp.go
+++ b/core/config/hooks_llamacpp.go
@@ -39,7 +39,21 @@ func llamaCppDefaults(cfg *ModelConfig, modelPath string) {
 		}
 	}()

-	f, err := gguf.ParseGGUFFile(guessPath)
+	// Startup parses every model's GGUF header to guess defaults. We only need
+	// scalar metadata (architecture, head/ff counts, chat_template, token IDs,
+	// MTP head) plus array *lengths* — never the array *contents*. Two options
+	// keep this cheap, which matters when many models live on slow storage such
+	// as a Docker volume (see https://github.com/mudler/LocalAI/issues/9790):
+	//
+	//   - SkipLargeMetadata: seek past large array-valued metadata (the tokenizer
+	//     vocab: tokenizer.ggml.tokens/scores/merges, often >100k entries) instead
+	//     of reading and allocating every element. Lengths stay populated.
+	//   - UseMMap: read the header via a memory map so faulting in a few pages
+	//     replaces hundreds of thousands of tiny read() syscalls (measured ~524k
+	//     -> 8 for a 256k-token vocab), the dominant cost on slow filesystems.
+	//
+	// The mapping is released when ParseGGUFFile returns.
+	f, err := gguf.ParseGGUFFile(guessPath, gguf.UseMMap(), gguf.SkipLargeMetadata())
 	if err == nil {
 		guessGGUFFromFile(cfg, f, 0)
 	}
--- a/core/config/hooks_test.go
+++ b/core/config/hooks_test.go
@@ -1,13 +1,76 @@
 package config_test

 import (
+	"bytes"
+	"encoding/binary"
+	"os"
+	"path/filepath"
+
 	. "github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"

+	gguf "github.com/gpustack/gguf-parser-go"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )

+// GGUF metadata value type tags, emitted by writeTestGGUF as little-endian uint32 markers (see github.com/gpustack/gguf-parser-go).
+const (
+	ggufTypeUint32 uint32 = 4
+	ggufTypeString uint32 = 8
+	ggufTypeArray  uint32 = 9
+)
+
+// writeTestGGUF writes to path a minimal little-endian GGUF v3 header (zero
+// tensors) holding the scalar metadata the llama-cpp hook guesses from, with
+// chatTemplate stored as tokenizer.chat_template, followed by a string array
+// tokenizer.ggml.tokens of vocab entries: the large array SkipLargeMetadata and
+// UseMMap should avoid reading element by element. Returns os.WriteFile's error.
+func writeTestGGUF(path, chatTemplate string, vocab int) error {
+	wStr := func(b *bytes.Buffer, s string) { // GGUF string: uint64 byte length, then the bytes
+		binary.Write(b, binary.LittleEndian, uint64(len(s))) // binary.Write errors are discarded throughout this helper
+		b.WriteString(s)
+	}
+	kvStr := func(b *bytes.Buffer, k, v string) { // key, string type tag, string value
+		wStr(b, k)
+		binary.Write(b, binary.LittleEndian, ggufTypeString)
+		wStr(b, v)
+	}
+	kvU32 := func(b *bytes.Buffer, k string, v uint32) { // key, uint32 type tag, uint32 value
+		wStr(b, k)
+		binary.Write(b, binary.LittleEndian, ggufTypeUint32)
+		binary.Write(b, binary.LittleEndian, v)
+	}
+
+	var meta bytes.Buffer
+	kvStr(&meta, "general.architecture", "llama")
+	kvStr(&meta, "general.name", "ReproModel")
+	kvU32(&meta, "llama.context_length", 4096)
+	kvU32(&meta, "llama.attention.head_count", 32)
+	kvU32(&meta, "llama.feed_forward_length", 11008)
+	kvU32(&meta, "llama.block_count", 32)
+	kvU32(&meta, "tokenizer.ggml.bos_token_id", 1)
+	kvStr(&meta, "tokenizer.chat_template", chatTemplate)
+
+	// large array value — the one the optimization skips reading
+	wStr(&meta, "tokenizer.ggml.tokens")
+	binary.Write(&meta, binary.LittleEndian, ggufTypeArray)  // value type: array
+	binary.Write(&meta, binary.LittleEndian, ggufTypeString) // element type: string
+	binary.Write(&meta, binary.LittleEndian, uint64(vocab))  // element count
+	for i := 0; i < vocab; i++ {
+		wStr(&meta, "token")
+	}
+
+	var out bytes.Buffer
+	binary.Write(&out, binary.LittleEndian, gguf.GGUFMagicGGUFLe)
+	binary.Write(&out, binary.LittleEndian, uint32(3)) // version
+	binary.Write(&out, binary.LittleEndian, uint64(0)) // tensor count
+	binary.Write(&out, binary.LittleEndian, uint64(9)) // metadata kv count: the 8 scalar KVs above + the tokens array; keep in sync
+	out.Write(meta.Bytes())
+
+	return os.WriteFile(path, out.Bytes(), 0o644)
+}
+
 var _ = Describe("Backend hooks and parser defaults", func() {
 	Context("MatchParserDefaults", func() {
 		It("matches Qwen3 family", func() {
@@ -137,6 +200,58 @@ var _ = Describe("Backend hooks and parser defaults", func() {
 		})
 	})

+	Context("llamaCppDefaults GGUF guessing", func() {
+		// Regression coverage for https://github.com/mudler/LocalAI/issues/9790:
+		// the hook reads GGUF headers with SkipLargeMetadata + UseMMap to avoid
+		// pulling the whole tokenizer vocab off (slow) disk on every startup. This
+		// verifies that skipping the vocab array still yields the correct guessed
+		// defaults from the remaining scalar metadata.
+		const chatTemplate = "{{ bos_token }}{% for m in messages %}{{ m.content }}{% endfor %}"
+
+		It("guesses defaults from a GGUF whose large vocab is skipped", func() {
+			dir := GinkgoT().TempDir()
+			modelFile := "repro.gguf"
+			Expect(writeTestGGUF(filepath.Join(dir, modelFile), chatTemplate, 50000)).To(Succeed())
+
+			// A pre-set context size short-circuits the GGUF run-estimate, which
+			// needs full tensor info this header-only fixture deliberately omits;
+			// the metadata-reading path the optimization touches is unaffected.
+			ctxSize := 4096
+			cfg := &ModelConfig{
+				Backend: "llama-cpp",
+				LLMConfig: LLMConfig{ContextSize: &ctxSize},
+				PredictionOptions: schema.PredictionOptions{
+					BasicModelRequest: schema.BasicModelRequest{Model: modelFile},
+				},
+			}
+			cfg.SetDefaults(ModelPath(dir))
+
+			// chat_template is a scalar string, not part of the skipped array,
+			// so it must be captured verbatim.
+			Expect(cfg.GetModelTemplate()).To(Equal(chatTemplate))
+			// scalar-derived defaults are still applied (ContextSize was pre-set above, so its check only guards against it being cleared)
+			Expect(cfg.ContextSize).NotTo(BeNil())
+			Expect(cfg.NGPULayers).NotTo(BeNil())
+			Expect(cfg.TemplateConfig.UseTokenizerTemplate).To(BeTrue())
+			Expect(cfg.KnownUsecaseStrings).To(ContainElement("FLAG_CHAT"))
+		})
+
+		It("falls back to the default context size when the GGUF is unreadable", func() {
+			dir := GinkgoT().TempDir()
+			Expect(os.WriteFile(filepath.Join(dir, "bad.gguf"), []byte("not a gguf"), 0o644)).To(Succeed())
+
+			cfg := &ModelConfig{
+				Backend: "llama-cpp",
+				PredictionOptions: schema.PredictionOptions{
+					BasicModelRequest: schema.BasicModelRequest{Model: "bad.gguf"},
+				},
+			}
+			cfg.SetDefaults(ModelPath(dir))
+
+			Expect(cfg.ContextSize).NotTo(BeNil())
+		})
+	})
+
 	Context("PromptCacheAll default", func() {
 		It("defaults to true when omitted from YAML", func() {
 			cfg := &ModelConfig{}
--- a/pkg/vram/gguf_reader.go
+++ b/pkg/vram/gguf_reader.go
@@ -15,7 +15,11 @@ func (defaultGGUFReader) ReadMetadata(ctx context.Context, uri string) (*GGUFMet
 	urlStr := u.ResolveURL()

 	if strings.HasPrefix(uri, downloader.LocalPrefix) {
-		f, err := gguf.ParseGGUFFile(urlStr)
+		// Only architecture scalars are read below, never the tokenizer vocab
+		// arrays, so skip them and memory-map the header to avoid a syscall
+		// storm on slow storage. Same rationale as the startup guessing path in
+		// core/config/hooks_llamacpp.go (https://github.com/mudler/LocalAI/issues/9790).
+		f, err := gguf.ParseGGUFFile(urlStr, gguf.UseMMap(), gguf.SkipLargeMetadata())
 		if err != nil {
 			return nil, err
 		}