mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-09 01:07:09 -04:00
Gemma4 MTP (ggml-org/llama.cpp#23398) registers the prediction head as a separate `gemma4-assistant` architecture. That assistant GGUF still carries `<arch>.nextn_predict_layers`, so the architecture-agnostic detection in HasEmbeddedMTPHead matched it and appended the `spec_type:draft-mtp` defaults. Unlike the DeepSeek/Qwen embedded-head models, an assistant checkpoint cannot self-speculate: it is a draft model that requires a paired target context (`ctx_other`) and throws if loaded alone. Auto-applying the self-spec defaults to a standalone assistant import therefore produces a broken config. Guard the detection against draft-only assistant architectures (the `-assistant` suffix is upstream's naming convention) so importing one no longer yields a self-speculation config. Two-model target+draft pairing remains expressible manually via `draft_model:` and is left to a follow-up. Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
125 lines
3.4 KiB
Go
125 lines
3.4 KiB
Go
package config_test
|
|
|
|
import (
|
|
. "github.com/mudler/LocalAI/core/config"
|
|
|
|
gguf "github.com/gpustack/gguf-parser-go"
|
|
. "github.com/onsi/ginkgo/v2"
|
|
. "github.com/onsi/gomega"
|
|
)
|
|
|
|
// ggufWithArch fabricates a minimal in-memory GGUF carrying the given
|
|
// `general.architecture` and a positive `<arch>.nextn_predict_layers` count,
|
|
// so HasEmbeddedMTPHead can be exercised without a real model file.
|
|
func ggufWithArch(arch string, nextn uint32) *gguf.GGUFFile {
|
|
return &gguf.GGUFFile{
|
|
Header: gguf.GGUFHeader{
|
|
MetadataKV: gguf.GGUFMetadataKVs{
|
|
{
|
|
Key: "general.architecture",
|
|
ValueType: gguf.GGUFMetadataValueTypeString,
|
|
Value: arch,
|
|
},
|
|
{
|
|
Key: arch + ".nextn_predict_layers",
|
|
ValueType: gguf.GGUFMetadataValueTypeUint32,
|
|
Value: nextn,
|
|
},
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
var _ = Describe("MTP auto-defaults", func() {
|
|
Context("MTPSpecOptions", func() {
|
|
It("returns the upstream-recommended speculative tuple", func() {
|
|
Expect(MTPSpecOptions()).To(Equal([]string{
|
|
"spec_type:draft-mtp",
|
|
"spec_n_max:6",
|
|
"spec_p_min:0.75",
|
|
}))
|
|
})
|
|
|
|
It("returns a defensive copy so callers cannot mutate the package default", func() {
|
|
opts := MTPSpecOptions()
|
|
opts[0] = "spec_type:none"
|
|
Expect(MTPSpecOptions()[0]).To(Equal("spec_type:draft-mtp"))
|
|
})
|
|
})
|
|
|
|
Context("ApplyMTPDefaults", func() {
|
|
It("appends MTP options when nothing is configured", func() {
|
|
cfg := &ModelConfig{Name: "qwen-mtp"}
|
|
ApplyMTPDefaults(cfg, 1)
|
|
Expect(cfg.Options).To(Equal([]string{
|
|
"spec_type:draft-mtp",
|
|
"spec_n_max:6",
|
|
"spec_p_min:0.75",
|
|
}))
|
|
})
|
|
|
|
It("preserves unrelated options already on the config", func() {
|
|
cfg := &ModelConfig{
|
|
Name: "qwen-mtp",
|
|
Options: []string{"use_jinja:true", "cache_reuse:256"},
|
|
}
|
|
ApplyMTPDefaults(cfg, 1)
|
|
Expect(cfg.Options).To(Equal([]string{
|
|
"use_jinja:true",
|
|
"cache_reuse:256",
|
|
"spec_type:draft-mtp",
|
|
"spec_n_max:6",
|
|
"spec_p_min:0.75",
|
|
}))
|
|
})
|
|
|
|
It("is a no-op when the user already configured spec_type", func() {
|
|
cfg := &ModelConfig{
|
|
Name: "qwen-mtp",
|
|
Options: []string{"spec_type:ngram-simple", "use_jinja:true"},
|
|
}
|
|
ApplyMTPDefaults(cfg, 1)
|
|
Expect(cfg.Options).To(Equal([]string{
|
|
"spec_type:ngram-simple",
|
|
"use_jinja:true",
|
|
}))
|
|
})
|
|
|
|
It("also respects the legacy speculative_type alias", func() {
|
|
cfg := &ModelConfig{
|
|
Name: "qwen-mtp",
|
|
Options: []string{"speculative_type:ngram-mod"},
|
|
}
|
|
ApplyMTPDefaults(cfg, 1)
|
|
Expect(cfg.Options).To(Equal([]string{"speculative_type:ngram-mod"}))
|
|
})
|
|
|
|
It("tolerates a nil config", func() {
|
|
Expect(func() { ApplyMTPDefaults(nil, 1) }).ToNot(Panic())
|
|
})
|
|
})
|
|
|
|
Context("HasEmbeddedMTPHead", func() {
|
|
It("returns false on a nil GGUF file", func() {
|
|
n, ok := HasEmbeddedMTPHead(nil)
|
|
Expect(ok).To(BeFalse())
|
|
Expect(n).To(BeZero())
|
|
})
|
|
|
|
It("detects a same-GGUF embedded head (DeepSeek/Qwen style)", func() {
|
|
n, ok := HasEmbeddedMTPHead(ggufWithArch("qwen3moe", 1))
|
|
Expect(ok).To(BeTrue())
|
|
Expect(n).To(Equal(uint32(1)))
|
|
})
|
|
|
|
It("ignores a gemma4-assistant draft-only model", func() {
|
|
// The assistant GGUF carries nextn_predict_layers but is a separate
|
|
// draft model that requires a paired target (ctx_other); it cannot
|
|
// self-speculate, so it must not trigger the embedded-head defaults.
|
|
n, ok := HasEmbeddedMTPHead(ggufWithArch("gemma4-assistant", 48))
|
|
Expect(ok).To(BeFalse())
|
|
Expect(n).To(BeZero())
|
|
})
|
|
})
|
|
})
|