diff --git a/core/config/hooks_llamacpp.go b/core/config/hooks_llamacpp.go index 7c2640cee..4ced8a9b1 100644 --- a/core/config/hooks_llamacpp.go +++ b/core/config/hooks_llamacpp.go @@ -39,7 +39,21 @@ func llamaCppDefaults(cfg *ModelConfig, modelPath string) { } }() - f, err := gguf.ParseGGUFFile(guessPath) + // Startup parses every model's GGUF header to guess defaults. We only need + // scalar metadata (architecture, head/ff counts, chat_template, token IDs, + // MTP head) plus array *lengths* — never the array *contents*. Two options + // keep this cheap, which matters when many models live on slow storage such + // as a Docker volume (see https://github.com/mudler/LocalAI/issues/9790): + // + // - SkipLargeMetadata: seek past large array-valued metadata (the tokenizer + // vocab: tokenizer.ggml.tokens/scores/merges, often >100k entries) instead + // of reading and allocating every element. Lengths stay populated. + // - UseMMap: read the header via a memory map so faulting in a few pages + // replaces hundreds of thousands of tiny read() syscalls (measured ~524k + // -> 8 for a 256k-token vocab), the dominant cost on slow filesystems. + // + // The mapping is released when ParseGGUFFile returns. + f, err := gguf.ParseGGUFFile(guessPath, gguf.UseMMap(), gguf.SkipLargeMetadata()) if err == nil { guessGGUFFromFile(cfg, f, 0) } diff --git a/core/config/hooks_test.go b/core/config/hooks_test.go index 94f4ac2e2..6e18ad7cc 100644 --- a/core/config/hooks_test.go +++ b/core/config/hooks_test.go @@ -1,13 +1,76 @@ package config_test import ( + "bytes" + "encoding/binary" + "os" + "path/filepath" + . "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/schema" + gguf "github.com/gpustack/gguf-parser-go" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" ) +// GGUF metadata value type tags (see github.com/gpustack/gguf-parser-go). 
+const (
+	ggufTypeUint32 uint32 = 4
+	ggufTypeString uint32 = 8
+	ggufTypeArray  uint32 = 9
+)
+
+// writeTestGGUF writes to path a minimal little-endian GGUF v3 header (no tensors)
+// carrying the scalar metadata the llama-cpp hook guesses from, chatTemplate, and a
+// tokenizer.ggml.tokens string array of vocab entries: the large array that
+// SkipLargeMetadata + UseMMap are expected to skip. Returns the os.WriteFile error.
+func writeTestGGUF(path, chatTemplate string, vocab int) error {
+	var meta, out bytes.Buffer // in-memory sinks; binary.Write of fixed-size values into them cannot fail
+	var kvCount uint64         // KV entries started via kvKey; written out as the header's kv count
+	wStr := func(s string) { // GGUF string: uint64 length, then the raw bytes
+		binary.Write(&meta, binary.LittleEndian, uint64(len(s)))
+		meta.WriteString(s)
+	}
+	kvKey := func(k string, valueType uint32) { // opens one KV entry: key, then value type tag
+		wStr(k)
+		binary.Write(&meta, binary.LittleEndian, valueType)
+		kvCount++
+	}
+	kvStr := func(k, v string) {
+		kvKey(k, ggufTypeString)
+		wStr(v)
+	}
+	kvU32 := func(k string, v uint32) {
+		kvKey(k, ggufTypeUint32)
+		binary.Write(&meta, binary.LittleEndian, v)
+	}
+
+	kvStr("general.architecture", "llama")
+	kvStr("general.name", "ReproModel")
+	kvU32("llama.context_length", 4096)
+	kvU32("llama.attention.head_count", 32)
+	kvU32("llama.feed_forward_length", 11008)
+	kvU32("llama.block_count", 32)
+	kvU32("tokenizer.ggml.bos_token_id", 1)
+	kvStr("tokenizer.chat_template", chatTemplate)
+
+	// large array value — the one the optimization skips reading
+	kvKey("tokenizer.ggml.tokens", ggufTypeArray)
+	binary.Write(&meta, binary.LittleEndian, ggufTypeString) // element type
+	binary.Write(&meta, binary.LittleEndian, uint64(vocab))  // element count
+	for i := 0; i < vocab; i++ {
+		wStr("token")
+	}
+
+	binary.Write(&out, binary.LittleEndian, gguf.GGUFMagicGGUFLe)
+	binary.Write(&out, binary.LittleEndian, uint32(3)) // version
+	binary.Write(&out, binary.LittleEndian, uint64(0)) // tensor count
+	binary.Write(&out, binary.LittleEndian, kvCount)   // metadata kv count
+	out.Write(meta.Bytes())
+	return os.WriteFile(path, out.Bytes(), 0o644)
+}
+
 var _ = Describe("Backend hooks and parser defaults", func() {
 	Context("MatchParserDefaults", func() {
 		It("matches Qwen3 family", func() {
@@ -137,6 +200,58 @@ var _ = Describe("Backend hooks and parser defaults", func() {
 		})
 	})
 
+	Context("llamaCppDefaults GGUF guessing", func() {
+		// Regression coverage for https://github.com/mudler/LocalAI/issues/9790:
+		// the hook reads GGUF headers with SkipLargeMetadata + UseMMap to avoid
+		// pulling the whole tokenizer vocab off (slow) disk on every startup. This
+		// verifies that skipping the vocab array still yields the correct guessed
+		// defaults from the remaining scalar metadata.
+		const chatTemplate = "{{ bos_token }}{% for m in messages %}{{ m.content }}{% endfor %}"
+
+		It("guesses defaults from a GGUF whose large vocab is skipped", func() {
+			dir := GinkgoT().TempDir()
+			modelFile := "repro.gguf"
+			Expect(writeTestGGUF(filepath.Join(dir, modelFile), chatTemplate, 50000)).To(Succeed())
+
+			// A pre-set context size short-circuits the GGUF run-estimate, which
+			// needs full tensor info this header-only fixture deliberately omits;
+			// the metadata-reading path the optimization touches is unaffected.
+			ctxSize := 4096
+			cfg := &ModelConfig{
+				Backend:   "llama-cpp",
+				LLMConfig: LLMConfig{ContextSize: &ctxSize},
+				PredictionOptions: schema.PredictionOptions{
+					BasicModelRequest: schema.BasicModelRequest{Model: modelFile},
+				},
+			}
+			cfg.SetDefaults(ModelPath(dir))
+
+			// chat_template is a scalar string, not part of the skipped array,
+			// so it must be captured verbatim.
+ Expect(cfg.GetModelTemplate()).To(Equal(chatTemplate)) + // scalar-derived defaults are still applied + Expect(cfg.ContextSize).NotTo(BeNil()) + Expect(cfg.NGPULayers).NotTo(BeNil()) + Expect(cfg.TemplateConfig.UseTokenizerTemplate).To(BeTrue()) + Expect(cfg.KnownUsecaseStrings).To(ContainElement("FLAG_CHAT")) + }) + + It("falls back to the default context size when the GGUF is unreadable", func() { + dir := GinkgoT().TempDir() + Expect(os.WriteFile(filepath.Join(dir, "bad.gguf"), []byte("not a gguf"), 0o644)).To(Succeed()) + + cfg := &ModelConfig{ + Backend: "llama-cpp", + PredictionOptions: schema.PredictionOptions{ + BasicModelRequest: schema.BasicModelRequest{Model: "bad.gguf"}, + }, + } + cfg.SetDefaults(ModelPath(dir)) + + Expect(cfg.ContextSize).NotTo(BeNil()) + }) + }) + Context("PromptCacheAll default", func() { It("defaults to true when omitted from YAML", func() { cfg := &ModelConfig{} diff --git a/pkg/vram/gguf_reader.go b/pkg/vram/gguf_reader.go index 860ab39b7..f2842ba83 100644 --- a/pkg/vram/gguf_reader.go +++ b/pkg/vram/gguf_reader.go @@ -15,7 +15,11 @@ func (defaultGGUFReader) ReadMetadata(ctx context.Context, uri string) (*GGUFMet urlStr := u.ResolveURL() if strings.HasPrefix(uri, downloader.LocalPrefix) { - f, err := gguf.ParseGGUFFile(urlStr) + // Only architecture scalars are read below, never the tokenizer vocab + // arrays, so skip them and memory-map the header to avoid a syscall + // storm on slow storage. Same rationale as the startup guessing path in + // core/config/hooks_llamacpp.go (https://github.com/mudler/LocalAI/issues/9790). + f, err := gguf.ParseGGUFFile(urlStr, gguf.UseMMap(), gguf.SkipLargeMetadata()) if err != nil { return nil, err }