From 67f80a152b072a5e2b37928a421b65a6bcf85305 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sun, 7 Jun 2026 22:09:02 +0200 Subject: [PATCH] fix(mtp): don't auto-enable self-spec MTP for draft-only assistant GGUFs (#10208) Gemma4 MTP (ggml-org/llama.cpp#23398) registers the prediction head as a separate `gemma4-assistant` architecture. That assistant GGUF still carries `<arch>.nextn_predict_layers`, so the architecture-agnostic detection in HasEmbeddedMTPHead matched it and appended the `spec_type:draft-mtp` defaults. Unlike the DeepSeek/Qwen embedded-head models, an assistant checkpoint cannot self-speculate: it is a draft model that requires a paired target context (`ctx_other`) and throws if loaded alone. Auto-applying the self-spec defaults to a standalone assistant import therefore produces a broken config. Guard the detection against draft-only assistant architectures (the `-assistant` suffix is upstream's naming convention) so importing one no longer yields a self-speculation config. Two-model target+draft pairing remains expressible manually via `draft_model:` and is left to a follow-up. Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto Co-authored-by: Ettore Di Giacinto --- core/config/mtp.go | 24 +++++++++++++++++++++--- core/config/mtp_test.go | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/core/config/mtp.go b/core/config/mtp.go index 8f9b5fffb..36b77146e 100644 --- a/core/config/mtp.go +++ b/core/config/mtp.go @@ -30,11 +30,26 @@ func MTPSpecOptions() []string { return out } -// HasEmbeddedMTPHead reports whether the parsed GGUF declares a Multi-Token -// Prediction head. 
Detection reads `<arch>.nextn_predict_layers`, which is -// what `gguf_writer.add_nextn_predict_layers(n)` emits in upstream's +// isDraftOnlyAssistantArch reports whether an architecture names a standalone +// MTP *draft* model rather than a self-speculating trunk. Upstream's Gemma4 MTP +// (ggml-org/llama.cpp#23398) registers the head as a separate `gemma4-assistant` +// architecture whose GGUF still carries `nextn_predict_layers`, but which cannot +// run alone: it requires a paired target context (`ctx_other`). Such archs must +// not trigger the embedded-head self-speculation defaults. The `-assistant` +// suffix is upstream's naming convention for these draft-only checkpoints. +func isDraftOnlyAssistantArch(arch string) bool { + return strings.HasSuffix(arch, "-assistant") +} + +// HasEmbeddedMTPHead reports whether the parsed GGUF declares a self-speculating +// Multi-Token Prediction head. Detection reads `<arch>.nextn_predict_layers`, +// which is what `gguf_writer.add_nextn_predict_layers(n)` emits in upstream's // `conversion/qwen.py` MTP mixin. A positive layer count means the head is // present in the same GGUF as the trunk. +// +// Draft-only assistant architectures (e.g. Gemma4's `gemma4-assistant`) carry +// the same key but are separate draft checkpoints meant to be paired with a +// target model, so they are deliberately excluded here. func HasEmbeddedMTPHead(f *gguf.GGUFFile) (uint32, bool) { if f == nil { return 0, false @@ -43,6 +58,9 @@ func HasEmbeddedMTPHead(f *gguf.GGUFFile) (uint32, bool) { if arch == "" { return 0, false } + if isDraftOnlyAssistantArch(arch) { + return 0, false + } v, ok := f.Header.MetadataKV.Get(arch + ".nextn_predict_layers") if !ok { return 0, false diff --git a/core/config/mtp_test.go b/core/config/mtp_test.go index 283ae550b..c21d9f733 100644 --- a/core/config/mtp_test.go +++ b/core/config/mtp_test.go @@ -3,10 +3,33 @@ package config_test import ( . 
"github.com/mudler/LocalAI/core/config" + gguf "github.com/gpustack/gguf-parser-go" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" ) +// ggufWithArch fabricates a minimal in-memory GGUF carrying the given +// `general.architecture` and a positive `<arch>.nextn_predict_layers` count, +// so HasEmbeddedMTPHead can be exercised without a real model file. +func ggufWithArch(arch string, nextn uint32) *gguf.GGUFFile { + return &gguf.GGUFFile{ + Header: gguf.GGUFHeader{ + MetadataKV: gguf.GGUFMetadataKVs{ + { + Key: "general.architecture", + ValueType: gguf.GGUFMetadataValueTypeString, + Value: arch, + }, + { + Key: arch + ".nextn_predict_layers", + ValueType: gguf.GGUFMetadataValueTypeUint32, + Value: nextn, + }, + }, + }, + } +} + var _ = Describe("MTP auto-defaults", func() { Context("MTPSpecOptions", func() { It("returns the upstream-recommended speculative tuple", func() { @@ -82,5 +105,20 @@ var _ = Describe("MTP auto-defaults", func() { Expect(ok).To(BeFalse()) Expect(n).To(BeZero()) }) + + It("detects a same-GGUF embedded head (DeepSeek/Qwen style)", func() { + n, ok := HasEmbeddedMTPHead(ggufWithArch("qwen3moe", 1)) + Expect(ok).To(BeTrue()) + Expect(n).To(Equal(uint32(1))) + }) + + It("ignores a gemma4-assistant draft-only model", func() { + // The assistant GGUF carries nextn_predict_layers but is a separate + // draft model that requires a paired target (ctx_other); it cannot + // self-speculate, so it must not trigger the embedded-head defaults. + n, ok := HasEmbeddedMTPHead(ggufWithArch("gemma4-assistant", 48)) + Expect(ok).To(BeFalse()) + Expect(n).To(BeZero()) + }) }) })