fix(config): fall back to DefaultContextSize for unparseable GGUFs; pin NVFP4 gallery context_size (#10563)

The GGUF metadata parser (gpustack/gguf-parser-go) cannot read NVFP4-quantized
GGUFs at all: it errors with "read tensor info 0: This quantized type is
currently unsupported" because NVFP4 is a ggml tensor type it does not know.
When ParseGGUFFile errors, the llama-cpp defaults hook skips guessGGUFFromFile
entirely and the deferred fallback sets the context window to the conservative
GGUFFallbackContextSize (1024). The result: a model trained with a
262144-token context window runs with n_ctx=1024, and every prompt over ~1k
tokens fails with
"request (N tokens) exceeds the available context size (1024 tokens)".

Two changes:

- Drop GGUFFallbackContextSize (1024) and fall back to DefaultContextSize
  (4096) in both the GGUF run-estimate path (gguf.go) and the deferred hook
  fallback (hooks_llamacpp.go). 1024 is a sensible floor for a tiny CPU GGUF
  but a footgun for a large, long-context model whose header simply cannot be
  parsed. Strengthen the existing "GGUF unreadable" test to assert the value.

- Set context_size explicitly on the four NVFP4 gallery entries
  (qwen3.6-35b-a3b-nvfp4-mtp, qwopus3.6-27b-v2-mtp-nvfp4,
  qwopus3.6-27b-coder-mtp-nvfp4, qwen3.6-27b-nvfp4-mtp) so the parser failure
  is irrelevant for them. 32768 matches the sibling Qwen entries and is
  conservative on memory; operators can raise it toward the models'
  262144-token training context length.


Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
LocalAI [bot]
2026-06-27 23:34:52 +02:00
committed by GitHub
parent 8aba4fdba3
commit 1154be5eea
5 changed files with 24 additions and 8 deletions

View File

@@ -12,14 +12,12 @@ package config
// these; config never imports backend.
const (
// DefaultContextSize is the fallback context window when none is configured
// or estimable from the model.
// or estimable from the model. It is also the fallback for a GGUF whose
// metadata yields no usable estimate or that the parser cannot read at all
// (e.g. a quant type it does not know, such as NVFP4): a model-agnostic
// safe default beats a tiny, surprising window that truncates real prompts.
DefaultContextSize = 4096
// GGUFFallbackContextSize is the context window for a GGUF model whose
// metadata yields no usable estimate (see guessGGUFFromFile). Deliberately
// smaller than DefaultContextSize to stay conservative on memory there.
GGUFFallbackContextSize = 1024
// DefaultNGPULayers means "offload all layers"; the backend (fit_params)
// clamps to what actually fits in device memory.
DefaultNGPULayers = 99999999

View File

@@ -33,7 +33,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
cSize := int(ctxSize)
cfg.ContextSize = &cSize
} else {
defaultCtx = GGUFFallbackContextSize
defaultCtx = DefaultContextSize
cfg.ContextSize = &defaultCtx
}
}

View File

@@ -34,7 +34,7 @@ func llamaCppDefaults(cfg *ModelConfig, modelPath string) {
// Default context size if not set, regardless of whether GGUF parsing succeeds
defer func() {
if cfg.ContextSize == nil {
ctx := GGUFFallbackContextSize
ctx := DefaultContextSize
cfg.ContextSize = &ctx
}
}()

View File

@@ -248,7 +248,11 @@ var _ = Describe("Backend hooks and parser defaults", func() {
}
cfg.SetDefaults(ModelPath(dir))
// An unreadable/unparseable GGUF (e.g. a quant type the parser does
// not know, such as NVFP4) yields no estimate, so the hook must fall
// back to DefaultContextSize rather than a tiny, surprising value.
Expect(cfg.ContextSize).NotTo(BeNil())
Expect(*cfg.ContextSize).To(Equal(DefaultContextSize))
})
})

View File

@@ -579,6 +579,10 @@
icon: https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3.6/Figures/qwen3.6_35b_a3b_score.png
overrides:
backend: llama-cpp
# NVFP4 GGUFs use a quant type the GGUF metadata parser cannot read, so
# context size cannot be auto-derived; set it explicitly. The model was trained
# with a 262144-token context; 32768 is a safe default that operators can raise.
context_size: 32768
function:
automatic_tool_parsing_fallback: true
grammar:
@@ -611,6 +615,9 @@
- gguf
overrides:
backend: llama-cpp
# NVFP4 GGUFs use a quant type the GGUF metadata parser cannot read, so
# context size cannot be auto-derived; set it explicitly.
context_size: 32768
function:
automatic_tool_parsing_fallback: true
grammar:
@@ -638,6 +645,9 @@
icon: https://cdn-uploads.huggingface.co/production/uploads/66309bd090589b7c65950665/sGQKmrMc6L6guMoaB5_Y2.png
overrides:
backend: llama-cpp
# NVFP4 GGUFs use a quant type the GGUF metadata parser cannot read, so
# context size cannot be auto-derived; set it explicitly.
context_size: 32768
function:
automatic_tool_parsing_fallback: true
grammar:
@@ -688,6 +698,10 @@
icon: https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3.6/Figures/qwen3.6_27b_score.png
overrides:
backend: llama-cpp
# NVFP4 GGUFs use a quant type the GGUF metadata parser cannot read, so
# context size cannot be auto-derived; set it explicitly. The model was trained
# with a 262144-token context; 32768 is a safe default that operators can raise.
context_size: 32768
function:
automatic_tool_parsing_fallback: true
grammar: