Files
LocalAI/core/config/mtp.go
LocalAI [bot] 67f80a152b fix(mtp): don't auto-enable self-spec MTP for draft-only assistant GGUFs (#10208)
Gemma4 MTP (ggml-org/llama.cpp#23398) registers the prediction head as a
separate `gemma4-assistant` architecture. That assistant GGUF still carries
`<arch>.nextn_predict_layers`, so the architecture-agnostic detection in
HasEmbeddedMTPHead matched it and appended the `spec_type:draft-mtp` defaults.

Unlike the DeepSeek/Qwen embedded-head models, an assistant checkpoint cannot
self-speculate: it is a draft model that requires a paired target context
(`ctx_other`) and throws if loaded alone. Auto-applying the self-spec defaults
to a standalone assistant import therefore produces a broken config.

Guard the detection against draft-only assistant architectures (the `-assistant`
suffix is upstream's naming convention) so importing one no longer yields a
self-speculation config. Two-model target+draft pairing remains expressible
manually via `draft_model:` and is left to a follow-up.


Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-07 22:09:02 +02:00

103 lines
3.7 KiB
Go

package config
import (
"strings"
gguf "github.com/gpustack/gguf-parser-go"
"github.com/mudler/xlog"
)
// mtpSpecOptions lists the speculative-decoding option keys auto-applied when
// an MTP head is detected on a llama-cpp GGUF. Defaults track the upstream
// MTP PR (ggml-org/llama.cpp#22673):
//
// - spec_type:draft-mtp activates Multi-Token Prediction
// - spec_n_max:6 draft window
// - spec_p_min:0.75 pinned because upstream marked the 0.75 default
// with a "change to 0.0f" TODO; locking it here keeps acceptance
// thresholds stable across future bumps
var mtpSpecOptions = []string{
"spec_type:draft-mtp",
"spec_n_max:6",
"spec_p_min:0.75",
}
// MTPSpecOptions returns a copy of the option keys auto-applied when an MTP
// head is detected. Exported for testing and for the importer.
func MTPSpecOptions() []string {
out := make([]string, len(mtpSpecOptions))
copy(out, mtpSpecOptions)
return out
}
// isDraftOnlyAssistantArch reports whether an architecture names a standalone
// MTP *draft* model rather than a self-speculating trunk. Upstream's Gemma4 MTP
// (ggml-org/llama.cpp#23398) registers the head as a separate `gemma4-assistant`
// architecture whose GGUF still carries `nextn_predict_layers`, but which cannot
// run alone: it requires a paired target context (`ctx_other`). Such archs must
// not trigger the embedded-head self-speculation defaults. The `-assistant`
// suffix is upstream's naming convention for these draft-only checkpoints.
func isDraftOnlyAssistantArch(arch string) bool {
return strings.HasSuffix(arch, "-assistant")
}
// HasEmbeddedMTPHead reports whether the parsed GGUF declares a self-speculating
// Multi-Token Prediction head. Detection reads `<arch>.nextn_predict_layers`,
// which is what `gguf_writer.add_nextn_predict_layers(n)` emits in upstream's
// `conversion/qwen.py` MTP mixin. A positive layer count means the head is
// present in the same GGUF as the trunk.
//
// Draft-only assistant architectures (e.g. Gemma4's `gemma4-assistant`) carry
// the same key but are separate draft checkpoints meant to be paired with a
// target model, so they are deliberately excluded here.
func HasEmbeddedMTPHead(f *gguf.GGUFFile) (uint32, bool) {
if f == nil {
return 0, false
}
arch := f.Architecture().Architecture
if arch == "" {
return 0, false
}
if isDraftOnlyAssistantArch(arch) {
return 0, false
}
v, ok := f.Header.MetadataKV.Get(arch + ".nextn_predict_layers")
if !ok {
return 0, false
}
n := gguf.ValueNumeric[uint32](v)
return n, n > 0
}
// hasSpecTypeOption returns true when the slice already contains a
// user-configured `spec_type:` / `speculative_type:` entry. Used to avoid
// clobbering an explicit choice with the MTP auto-defaults.
func hasSpecTypeOption(opts []string) bool {
for _, o := range opts {
if strings.HasPrefix(o, "spec_type:") || strings.HasPrefix(o, "speculative_type:") {
return true
}
}
return false
}
// ApplyMTPDefaults appends the auto-MTP option keys to cfg.Options when none
// is already configured. It is a no-op when the user already picked a
// `spec_type` (either via YAML or via the importer's preferences flow).
//
// `layers` is the value read from `<arch>.nextn_predict_layers` and is only
// used for the diagnostic log line.
func ApplyMTPDefaults(cfg *ModelConfig, layers uint32) {
if cfg == nil {
return
}
if hasSpecTypeOption(cfg.Options) {
xlog.Debug("[mtp] embedded MTP head detected but spec_type already configured; leaving user choice intact",
"name", cfg.Name, "nextn_layers", layers)
return
}
cfg.Options = append(cfg.Options, mtpSpecOptions...)
xlog.Info("[mtp] embedded MTP head detected; enabling draft-mtp speculative decoding",
"name", cfg.Name, "nextn_layers", layers, "spec_n_max", 6, "spec_p_min", 0.75)
}