mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-08 08:46:49 -04:00
fix(mtp): don't auto-enable self-spec MTP for draft-only assistant GGUFs (#10208)
Gemma4 MTP (ggml-org/llama.cpp#23398) registers the prediction head as a separate `gemma4-assistant` architecture. That assistant GGUF still carries `<arch>.nextn_predict_layers`, so the architecture-agnostic detection in HasEmbeddedMTPHead matched it and appended the `spec_type:draft-mtp` defaults. Unlike the DeepSeek/Qwen embedded-head models, an assistant checkpoint cannot self-speculate: it is a draft model that requires a paired target context (`ctx_other`) and throws if loaded alone. Auto-applying the self-spec defaults to a standalone assistant import therefore produces a broken config. Guard the detection against draft-only assistant architectures (the `-assistant` suffix is upstream's naming convention) so importing one no longer yields a self-speculation config. Two-model target+draft pairing remains expressible manually via `draft_model:` and is left to a follow-up. Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
@@ -30,11 +30,26 @@ func MTPSpecOptions() []string {
|
||||
return out
|
||||
}
|
||||
|
||||
// HasEmbeddedMTPHead reports whether the parsed GGUF declares a Multi-Token
|
||||
// Prediction head. Detection reads `<arch>.nextn_predict_layers`, which is
|
||||
// what `gguf_writer.add_nextn_predict_layers(n)` emits in upstream's
|
||||
// isDraftOnlyAssistantArch reports whether arch identifies a draft-only MTP
// assistant checkpoint, as opposed to a trunk model that can self-speculate.
//
// Upstream's Gemma4 MTP support (ggml-org/llama.cpp#23398) ships the
// prediction head under its own `gemma4-assistant` architecture. That GGUF
// still carries `nextn_predict_layers`, yet it cannot be loaded on its own
// because it needs a paired target context (`ctx_other`). Such architectures
// must therefore never trigger the embedded-head self-speculation defaults.
//
// Matching is done purely on the `-assistant` suffix, which is upstream's
// naming convention for these checkpoints; the comparison is case-sensitive.
func isDraftOnlyAssistantArch(arch string) bool {
	const draftOnlySuffix = "-assistant"
	return strings.HasSuffix(arch, draftOnlySuffix)
}
|
||||
|
||||
// HasEmbeddedMTPHead reports whether the parsed GGUF declares a self-speculating
|
||||
// Multi-Token Prediction head. Detection reads `<arch>.nextn_predict_layers`,
|
||||
// which is what `gguf_writer.add_nextn_predict_layers(n)` emits in upstream's
|
||||
// `conversion/qwen.py` MTP mixin. A positive layer count means the head is
|
||||
// present in the same GGUF as the trunk.
|
||||
//
|
||||
// Draft-only assistant architectures (e.g. Gemma4's `gemma4-assistant`) carry
|
||||
// the same key but are separate draft checkpoints meant to be paired with a
|
||||
// target model, so they are deliberately excluded here.
|
||||
func HasEmbeddedMTPHead(f *gguf.GGUFFile) (uint32, bool) {
|
||||
if f == nil {
|
||||
return 0, false
|
||||
@@ -43,6 +58,9 @@ func HasEmbeddedMTPHead(f *gguf.GGUFFile) (uint32, bool) {
|
||||
if arch == "" {
|
||||
return 0, false
|
||||
}
|
||||
if isDraftOnlyAssistantArch(arch) {
|
||||
return 0, false
|
||||
}
|
||||
v, ok := f.Header.MetadataKV.Get(arch + ".nextn_predict_layers")
|
||||
if !ok {
|
||||
return 0, false
|
||||
|
||||
@@ -3,10 +3,33 @@ package config_test
|
||||
import (
|
||||
. "github.com/mudler/LocalAI/core/config"
|
||||
|
||||
gguf "github.com/gpustack/gguf-parser-go"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
// ggufWithArch fabricates a minimal in-memory GGUF carrying the given
|
||||
// `general.architecture` and a positive `<arch>.nextn_predict_layers` count,
|
||||
// so HasEmbeddedMTPHead can be exercised without a real model file.
|
||||
func ggufWithArch(arch string, nextn uint32) *gguf.GGUFFile {
|
||||
return &gguf.GGUFFile{
|
||||
Header: gguf.GGUFHeader{
|
||||
MetadataKV: gguf.GGUFMetadataKVs{
|
||||
{
|
||||
Key: "general.architecture",
|
||||
ValueType: gguf.GGUFMetadataValueTypeString,
|
||||
Value: arch,
|
||||
},
|
||||
{
|
||||
Key: arch + ".nextn_predict_layers",
|
||||
ValueType: gguf.GGUFMetadataValueTypeUint32,
|
||||
Value: nextn,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
var _ = Describe("MTP auto-defaults", func() {
|
||||
Context("MTPSpecOptions", func() {
|
||||
It("returns the upstream-recommended speculative tuple", func() {
|
||||
@@ -82,5 +105,20 @@ var _ = Describe("MTP auto-defaults", func() {
|
||||
Expect(ok).To(BeFalse())
|
||||
Expect(n).To(BeZero())
|
||||
})
|
||||
|
||||
It("detects a same-GGUF embedded head (DeepSeek/Qwen style)", func() {
|
||||
n, ok := HasEmbeddedMTPHead(ggufWithArch("qwen3moe", 1))
|
||||
Expect(ok).To(BeTrue())
|
||||
Expect(n).To(Equal(uint32(1)))
|
||||
})
|
||||
|
||||
It("ignores a gemma4-assistant draft-only model", func() {
|
||||
// The assistant GGUF carries nextn_predict_layers but is a separate
|
||||
// draft model that requires a paired target (ctx_other); it cannot
|
||||
// self-speculate, so it must not trigger the embedded-head defaults.
|
||||
n, ok := HasEmbeddedMTPHead(ggufWithArch("gemma4-assistant", 48))
|
||||
Expect(ok).To(BeFalse())
|
||||
Expect(n).To(BeZero())
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user