mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-28 02:17:00 -04:00
fix(config): fall back to DefaultContextSize for unparseable GGUFs; pin NVFP4 gallery context_size (#10563)
The GGUF metadata parser (gpustack/gguf-parser-go) cannot read NVFP4-quantized GGUFs at all: it fails with "read tensor info 0: This quantized type is currently unsupported" because NVFP4 is a ggml tensor type it does not know. When ParseGGUFFile returns an error, the llama-cpp defaults hook skips guessGGUFFromFile entirely and the deferred fallback sets the context window to the conservative GGUFFallbackContextSize (1024).

The result: a model that trains to 262144 tokens runs with n_ctx=1024, and every prompt over ~1k tokens fails with "request (N tokens) exceeds the available context size (1024 tokens)".

Two changes:

- Drop GGUFFallbackContextSize (1024) and fall back to DefaultContextSize (4096) in both the GGUF run-estimate path (gguf.go) and the deferred hook fallback (hooks_llamacpp.go). 1024 is a sensible floor for a tiny CPU GGUF but a footgun for a large, long-context model whose header simply cannot be parsed. Strengthen the existing "GGUF unreadable" test to assert the value.
- Set context_size explicitly on the four NVFP4 gallery entries (qwen3.6-35b-a3b-nvfp4-mtp, qwopus3.6-27b-v2-mtp-nvfp4, qwopus3.6-27b-coder-mtp-nvfp4, qwen3.6-27b-nvfp4-mtp) so the parser failure is irrelevant for them. 32768 matches sibling Qwen entries and is safe on memory; operators can raise it toward the 262144 train length.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
@@ -12,14 +12,12 @@ package config
|
||||
// these; config never imports backend.
|
||||
const (
|
||||
// DefaultContextSize is the fallback context window when none is configured
|
||||
// or estimable from the model.
|
||||
// or estimable from the model. It is also the fallback for a GGUF whose
|
||||
// metadata yields no usable estimate or that the parser cannot read at all
|
||||
// (e.g. a quant type it does not know, such as NVFP4): a model-agnostic
|
||||
// safe default beats a tiny, surprising window that truncates real prompts.
|
||||
DefaultContextSize = 4096
|
||||
|
||||
// GGUFFallbackContextSize is the context window for a GGUF model whose
|
||||
// metadata yields no usable estimate (see guessGGUFFromFile). Deliberately
|
||||
// smaller than DefaultContextSize to stay conservative on memory there.
|
||||
GGUFFallbackContextSize = 1024
|
||||
|
||||
// DefaultNGPULayers means "offload all layers"; the backend (fit_params)
|
||||
// clamps to what actually fits in device memory.
|
||||
DefaultNGPULayers = 99999999
|
||||
|
||||
@@ -33,7 +33,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
|
||||
cSize := int(ctxSize)
|
||||
cfg.ContextSize = &cSize
|
||||
} else {
|
||||
defaultCtx = GGUFFallbackContextSize
|
||||
defaultCtx = DefaultContextSize
|
||||
cfg.ContextSize = &defaultCtx
|
||||
}
|
||||
}
|
||||
|
||||
@@ -34,7 +34,7 @@ func llamaCppDefaults(cfg *ModelConfig, modelPath string) {
|
||||
// Default context size if not set, regardless of whether GGUF parsing succeeds
|
||||
defer func() {
|
||||
if cfg.ContextSize == nil {
|
||||
ctx := GGUFFallbackContextSize
|
||||
ctx := DefaultContextSize
|
||||
cfg.ContextSize = &ctx
|
||||
}
|
||||
}()
|
||||
|
||||
@@ -248,7 +248,11 @@ var _ = Describe("Backend hooks and parser defaults", func() {
|
||||
}
|
||||
cfg.SetDefaults(ModelPath(dir))
|
||||
|
||||
// An unreadable/unparseable GGUF (e.g. a quant type the parser does
|
||||
// not know, such as NVFP4) yields no estimate, so the hook must fall
|
||||
// back to DefaultContextSize rather than a tiny, surprising value.
|
||||
Expect(cfg.ContextSize).NotTo(BeNil())
|
||||
Expect(*cfg.ContextSize).To(Equal(DefaultContextSize))
|
||||
})
|
||||
})
|
||||
|
||||
|
||||
@@ -579,6 +579,10 @@
|
||||
icon: https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3.6/Figures/qwen3.6_35b_a3b_score.png
|
||||
overrides:
|
||||
backend: llama-cpp
|
||||
# NVFP4 GGUFs use a quant type the GGUF metadata parser cannot read, so
|
||||
# context size cannot be auto-derived; set it explicitly (the model trains
|
||||
# to 262144, 32768 is a safe default operators can raise).
|
||||
context_size: 32768
|
||||
function:
|
||||
automatic_tool_parsing_fallback: true
|
||||
grammar:
|
||||
@@ -611,6 +615,9 @@
|
||||
- gguf
|
||||
overrides:
|
||||
backend: llama-cpp
|
||||
# NVFP4 GGUFs use a quant type the GGUF metadata parser cannot read, so
|
||||
# context size cannot be auto-derived; set it explicitly.
|
||||
context_size: 32768
|
||||
function:
|
||||
automatic_tool_parsing_fallback: true
|
||||
grammar:
|
||||
@@ -638,6 +645,9 @@
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/66309bd090589b7c65950665/sGQKmrMc6L6guMoaB5_Y2.png
|
||||
overrides:
|
||||
backend: llama-cpp
|
||||
# NVFP4 GGUFs use a quant type the GGUF metadata parser cannot read, so
|
||||
# context size cannot be auto-derived; set it explicitly.
|
||||
context_size: 32768
|
||||
function:
|
||||
automatic_tool_parsing_fallback: true
|
||||
grammar:
|
||||
@@ -688,6 +698,10 @@
|
||||
icon: https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3.6/Figures/qwen3.6_27b_score.png
|
||||
overrides:
|
||||
backend: llama-cpp
|
||||
# NVFP4 GGUFs use a quant type the GGUF metadata parser cannot read, so
|
||||
# context size cannot be auto-derived; set it explicitly (the model trains
|
||||
# to 262144, 32768 is a safe default operators can raise).
|
||||
context_size: 32768
|
||||
function:
|
||||
automatic_tool_parsing_fallback: true
|
||||
grammar:
|
||||
|
||||
Reference in New Issue
Block a user