From 1154be5eea3d624be6e64d3f491885fabe8b6845 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 27 Jun 2026 23:34:52 +0200 Subject: [PATCH] fix(config): fall back to DefaultContextSize for unparseable GGUFs; pin NVFP4 gallery context_size (#10563) The GGUF metadata parser (gpustack/gguf-parser-go) cannot read NVFP4-quantized GGUFs at all: it errors with "read tensor info 0: This quantized type is currently unsupported" because NVFP4 is a ggml tensor type it does not know. When ParseGGUFFile errors, the llama-cpp defaults hook skips guessGGUFFromFile entirely and the deferred fallback sets the context window to the conservative GGUFFallbackContextSize (1024). The result: a model that trains to 262144 tokens runs with n_ctx=1024, and every prompt over ~1k tokens fails with "request (N tokens) exceeds the available context size (1024 tokens)". Two changes: - Drop GGUFFallbackContextSize (1024) and fall back to DefaultContextSize (4096) in both the GGUF run-estimate path (gguf.go) and the deferred hook fallback (hooks_llamacpp.go). 1024 is a sensible floor for a tiny CPU GGUF but a footgun for a large, long-context model whose header simply cannot be parsed. Strengthen the existing "GGUF unreadable" test to assert the value. - Set context_size explicitly on the four NVFP4 gallery entries (qwen3.6-35b-a3b-nvfp4-mtp, qwopus3.6-27b-v2-mtp-nvfp4, qwopus3.6-27b-coder-mtp-nvfp4, qwen3.6-27b-nvfp4-mtp) so the parser failure is irrelevant for them. 32768 matches sibling Qwen entries and is safe on memory; operators can raise it toward the 262144 train length. Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto Co-authored-by: Ettore Di Giacinto --- core/config/defaults.go | 10 ++++------ core/config/gguf.go | 2 +- core/config/hooks_llamacpp.go | 2 +- core/config/hooks_test.go | 4 ++++ gallery/index.yaml | 14 ++++++++++++++ 5 files changed, 24 insertions(+), 8 deletions(-) diff --git a/core/config/defaults.go b/core/config/defaults.go index 18625fab3..bb993d075 100644 --- a/core/config/defaults.go +++ b/core/config/defaults.go @@ -12,14 +12,12 @@ package config // these; config never imports backend. const ( // DefaultContextSize is the fallback context window when none is configured - // or estimable from the model. + // or estimable from the model. It is also the fallback for a GGUF whose + // metadata yields no usable estimate or that the parser cannot read at all + // (e.g. a quant type it does not know, such as NVFP4): a model-agnostic + // safe default beats a tiny, surprising window that truncates real prompts. DefaultContextSize = 4096 - // GGUFFallbackContextSize is the context window for a GGUF model whose - // metadata yields no usable estimate (see guessGGUFFromFile). Deliberately - // smaller than DefaultContextSize to stay conservative on memory there. - GGUFFallbackContextSize = 1024 - // DefaultNGPULayers means "offload all layers"; the backend (fit_params) // clamps to what actually fits in device memory. DefaultNGPULayers = 99999999 diff --git a/core/config/gguf.go b/core/config/gguf.go index 16e43c914..177e68749 100644 --- a/core/config/gguf.go +++ b/core/config/gguf.go @@ -33,7 +33,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) { cSize := int(ctxSize) cfg.ContextSize = &cSize } else { - defaultCtx = GGUFFallbackContextSize + defaultCtx = DefaultContextSize cfg.ContextSize = &defaultCtx } } diff --git a/core/config/hooks_llamacpp.go b/core/config/hooks_llamacpp.go index 09bdbe868..07ccdda7b 100644 --- a/core/config/hooks_llamacpp.go +++ b/core/config/hooks_llamacpp.go @@ -34,7 +34,7 @@ func llamaCppDefaults(cfg *ModelConfig, modelPath string) { // Default context size if not set, regardless of whether GGUF parsing succeeds defer func() { if cfg.ContextSize == nil { - ctx := GGUFFallbackContextSize + ctx := DefaultContextSize cfg.ContextSize = &ctx } }() diff --git a/core/config/hooks_test.go b/core/config/hooks_test.go index 6e18ad7cc..a1b30b8d9 100644 --- a/core/config/hooks_test.go +++ b/core/config/hooks_test.go @@ -248,7 +248,11 @@ var _ = Describe("Backend hooks and parser defaults", func() { } cfg.SetDefaults(ModelPath(dir)) + // An unreadable/unparseable GGUF (e.g. a quant type the parser does + // not know, such as NVFP4) yields no estimate, so the hook must fall + // back to DefaultContextSize rather than a tiny, surprising value. Expect(cfg.ContextSize).NotTo(BeNil()) + Expect(*cfg.ContextSize).To(Equal(DefaultContextSize)) }) }) diff --git a/gallery/index.yaml b/gallery/index.yaml index cc975a83a..f39993333 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -579,6 +579,10 @@ icon: https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3.6/Figures/qwen3.6_35b_a3b_score.png overrides: backend: llama-cpp + # NVFP4 GGUFs use a quant type the GGUF metadata parser cannot read, so + # context size cannot be auto-derived; set it explicitly (the model trains + # to 262144, 32768 is a safe default operators can raise). + context_size: 32768 function: automatic_tool_parsing_fallback: true grammar: @@ -611,6 +615,9 @@ - gguf overrides: backend: llama-cpp + # NVFP4 GGUFs use a quant type the GGUF metadata parser cannot read, so + # context size cannot be auto-derived; set it explicitly. + context_size: 32768 function: automatic_tool_parsing_fallback: true grammar: @@ -638,6 +645,9 @@ icon: https://cdn-uploads.huggingface.co/production/uploads/66309bd090589b7c65950665/sGQKmrMc6L6guMoaB5_Y2.png overrides: backend: llama-cpp + # NVFP4 GGUFs use a quant type the GGUF metadata parser cannot read, so + # context size cannot be auto-derived; set it explicitly. + context_size: 32768 function: automatic_tool_parsing_fallback: true grammar: @@ -688,6 +698,10 @@ icon: https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3.6/Figures/qwen3.6_27b_score.png overrides: backend: llama-cpp + # NVFP4 GGUFs use a quant type the GGUF metadata parser cannot read, so + # context size cannot be auto-derived; set it explicitly (the model trains + # to 262144, 32768 is a safe default operators can raise). + context_size: 32768 function: automatic_tool_parsing_fallback: true grammar: