diff --git a/.agents/llama-cpp-backend.md b/.agents/llama-cpp-backend.md
index 1fc5765c9..67d09ced4 100644
--- a/.agents/llama-cpp-backend.md
+++ b/.agents/llama-cpp-backend.md
@@ -61,6 +61,12 @@ Always check `llama.cpp` for new model configuration options that should be supp
 - `reasoning_format` - Reasoning format options
 - Any new flags or parameters
 
+### Speculative Decoding Types
+
+The `spec_type` option in `grpc-server.cpp` delegates to upstream's `common_speculative_types_from_names()`, so new speculative types added to the `common_speculative_type_from_name` map in `common/speculative.cpp` are picked up automatically with no code changes - only docs need an entry in `docs/content/advanced/model-configuration.md`. Current values: `none`, `draft-simple`, `draft-eagle3`, `draft-mtp`, `ngram-simple`, `ngram-map-k`, `ngram-map-k4v`, `ngram-mod`, `ngram-cache`.
+
+`draft-mtp` (Multi-Token Prediction, [ggml-org/llama.cpp#22673](https://github.com/ggml-org/llama.cpp/pull/22673)) does not need a separate draft GGUF: when `spec_type` includes `draft-mtp` and `draftmodel` is empty, the upstream server creates an MTP context off the target model itself. LocalAI's gRPC layer needs no changes for this — it works through the existing `params.speculative.types` plumbing and the derived `cparams.n_rs_seq = params.speculative.need_n_rs_seq()` in `common_context_params_to_llama`.
+
 ### Implementation Guidelines
 
 1. **Feature Parity**: Always aim for feature parity with llama.cpp's implementation
diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile
index 8b6a36d72..826f8851d 100644
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@
-LLAMA_VERSION?=1348f67c58f561808136e8a152a9eddec168f221
+LLAMA_VERSION?=0253fb21f595246f54c192fe8332f34173be251b
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 
 CMAKE_ARGS?=
 
diff --git a/core/config/gguf.go b/core/config/gguf.go
index 2d5d3f7c9..c373561b6 100644
--- a/core/config/gguf.go
+++ b/core/config/gguf.go
@@ -54,6 +54,13 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 		cfg.modelTemplate = chatTemplate.ValueString()
 	}
 
+	// Auto-enable Multi-Token Prediction (ggml-org/llama.cpp#22673) when the
+	// GGUF carries an embedded MTP head. Skipped silently for non-MTP models
+	// and when the user already configured a spec_type.
+	if n, ok := HasEmbeddedMTPHead(f); ok {
+		ApplyMTPDefaults(cfg, n)
+	}
+
 	// Thinking support detection is done after model load via DetectThinkingSupportFromBackend
 
 	// template estimations
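For reference, a minimal sketch of what the hook above produces: once `HasEmbeddedMTPHead` fires, the model's effective options carry the three entries from `mtpSpecOptions` (defined in `core/config/mtp.go` below). The model name and GGUF filename here are placeholders, not values taken from this change:

```yaml
# Hypothetical effective configuration after MTP auto-detection; only the
# three options are guaranteed by the hook, the rest is illustrative.
name: my-mtp-model
backend: llama-cpp
parameters:
  model: my-mtp-model.gguf
options:
  - spec_type:draft-mtp
  - spec_n_max:6
  - spec_p_min:0.75
```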
diff --git a/core/config/mtp.go b/core/config/mtp.go
new file mode 100644
index 000000000..8f9b5fffb
--- /dev/null
+++ b/core/config/mtp.go
@@ -0,0 +1,84 @@
+package config
+
+import (
+	"strings"
+
+	gguf "github.com/gpustack/gguf-parser-go"
+	"github.com/mudler/xlog"
+)
+
+// mtpSpecOptions lists the speculative-decoding option keys auto-applied when
+// an MTP head is detected on a llama-cpp GGUF. Defaults track the upstream
+// MTP PR (ggml-org/llama.cpp#22673):
+//
+//   - spec_type:draft-mtp activates Multi-Token Prediction
+//   - spec_n_max:6 draft window
+//   - spec_p_min:0.75 pinned because upstream marked the 0.75 default
+//     with a "change to 0.0f" TODO; locking it here keeps acceptance
+//     thresholds stable across future bumps
+var mtpSpecOptions = []string{
+	"spec_type:draft-mtp",
+	"spec_n_max:6",
+	"spec_p_min:0.75",
+}
+
+// MTPSpecOptions returns a copy of the option keys auto-applied when an MTP
+// head is detected. Exported for testing and for the importer.
+func MTPSpecOptions() []string {
+	out := make([]string, len(mtpSpecOptions))
+	copy(out, mtpSpecOptions)
+	return out
+}
+
+// HasEmbeddedMTPHead reports whether the parsed GGUF declares a Multi-Token
+// Prediction head. Detection reads `<arch>.nextn_predict_layers`, which is
+// what `gguf_writer.add_nextn_predict_layers(n)` emits in upstream's
+// `conversion/qwen.py` MTP mixin. A positive layer count means the head is
+// present in the same GGUF as the trunk.
+func HasEmbeddedMTPHead(f *gguf.GGUFFile) (uint32, bool) {
+	if f == nil {
+		return 0, false
+	}
+	arch := f.Architecture().Architecture
+	if arch == "" {
+		return 0, false
+	}
+	v, ok := f.Header.MetadataKV.Get(arch + ".nextn_predict_layers")
+	if !ok {
+		return 0, false
+	}
+	n := gguf.ValueNumeric[uint32](v)
+	return n, n > 0
+}
+
+// hasSpecTypeOption returns true when the slice already contains a
+// user-configured `spec_type:` / `speculative_type:` entry. Used to avoid
+// clobbering an explicit choice with the MTP auto-defaults.
+func hasSpecTypeOption(opts []string) bool {
+	for _, o := range opts {
+		if strings.HasPrefix(o, "spec_type:") || strings.HasPrefix(o, "speculative_type:") {
+			return true
+		}
+	}
+	return false
+}
+
+// ApplyMTPDefaults appends the auto-MTP option keys to cfg.Options when none
+// is already configured. It is a no-op when the user already picked a
+// `spec_type` (either via YAML or via the importer's preferences flow).
+//
+// `layers` is the value read from `<arch>.nextn_predict_layers` and is only
+// used for the diagnostic log line.
+func ApplyMTPDefaults(cfg *ModelConfig, layers uint32) {
+	if cfg == nil {
+		return
+	}
+	if hasSpecTypeOption(cfg.Options) {
+		xlog.Debug("[mtp] embedded MTP head detected but spec_type already configured; leaving user choice intact",
+			"name", cfg.Name, "nextn_layers", layers)
+		return
+	}
+	cfg.Options = append(cfg.Options, mtpSpecOptions...)
+	xlog.Info("[mtp] embedded MTP head detected; enabling draft-mtp speculative decoding",
+		"name", cfg.Name, "nextn_layers", layers, "spec_n_max", 6, "spec_p_min", 0.75)
+}
"github.com/onsi/gomega" +) + +var _ = Describe("MTP auto-defaults", func() { + Context("MTPSpecOptions", func() { + It("returns the upstream-recommended speculative tuple", func() { + Expect(MTPSpecOptions()).To(Equal([]string{ + "spec_type:draft-mtp", + "spec_n_max:6", + "spec_p_min:0.75", + })) + }) + + It("returns a defensive copy so callers cannot mutate the package default", func() { + opts := MTPSpecOptions() + opts[0] = "spec_type:none" + Expect(MTPSpecOptions()[0]).To(Equal("spec_type:draft-mtp")) + }) + }) + + Context("ApplyMTPDefaults", func() { + It("appends MTP options when nothing is configured", func() { + cfg := &ModelConfig{Name: "qwen-mtp"} + ApplyMTPDefaults(cfg, 1) + Expect(cfg.Options).To(Equal([]string{ + "spec_type:draft-mtp", + "spec_n_max:6", + "spec_p_min:0.75", + })) + }) + + It("preserves unrelated options already on the config", func() { + cfg := &ModelConfig{ + Name: "qwen-mtp", + Options: []string{"use_jinja:true", "cache_reuse:256"}, + } + ApplyMTPDefaults(cfg, 1) + Expect(cfg.Options).To(Equal([]string{ + "use_jinja:true", + "cache_reuse:256", + "spec_type:draft-mtp", + "spec_n_max:6", + "spec_p_min:0.75", + })) + }) + + It("is a no-op when the user already configured spec_type", func() { + cfg := &ModelConfig{ + Name: "qwen-mtp", + Options: []string{"spec_type:ngram-simple", "use_jinja:true"}, + } + ApplyMTPDefaults(cfg, 1) + Expect(cfg.Options).To(Equal([]string{ + "spec_type:ngram-simple", + "use_jinja:true", + })) + }) + + It("also respects the legacy speculative_type alias", func() { + cfg := &ModelConfig{ + Name: "qwen-mtp", + Options: []string{"speculative_type:ngram-mod"}, + } + ApplyMTPDefaults(cfg, 1) + Expect(cfg.Options).To(Equal([]string{"speculative_type:ngram-mod"})) + }) + + It("tolerates a nil config", func() { + Expect(func() { ApplyMTPDefaults(nil, 1) }).ToNot(Panic()) + }) + }) + + Context("HasEmbeddedMTPHead", func() { + It("returns false on a nil GGUF file", func() { + n, ok := HasEmbeddedMTPHead(nil) + Expect(ok).To(BeFalse()) + Expect(n).To(BeZero()) + }) + }) +}) diff --git a/core/gallery/importers/llama-cpp.go b/core/gallery/importers/llama-cpp.go index 8771b106d..39a732560 100644 --- a/core/gallery/importers/llama-cpp.go +++ b/core/gallery/importers/llama-cpp.go @@ -1,10 +1,13 @@ package importers import ( + "context" "encoding/json" "path/filepath" "strings" + "time" + gguf "github.com/gpustack/gguf-parser-go" "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/gallery" "github.com/mudler/LocalAI/core/schema" @@ -261,6 +264,13 @@ func (i *LlamaCPPImporter) Import(details Details) (gallery.ModelConfig, error) // Apply per-model-family inference parameter defaults config.ApplyInferenceDefaults(&modelConfig, details.URI) + // Auto-detect Multi-Token Prediction heads (ggml-org/llama.cpp#22673) and + // enable speculative decoding. Mirrors the load-time hook so freshly + // imported configs already carry spec_type:draft-mtp before the model is + // ever loaded - users see it in the YAML preview rather than discovering + // it after the first start. 
diff --git a/core/gallery/importers/llama-cpp.go b/core/gallery/importers/llama-cpp.go
index 8771b106d..39a732560 100644
--- a/core/gallery/importers/llama-cpp.go
+++ b/core/gallery/importers/llama-cpp.go
@@ -1,10 +1,13 @@
 package importers
 
 import (
+	"context"
 	"encoding/json"
 	"path/filepath"
 	"strings"
+	"time"
 
+	gguf "github.com/gpustack/gguf-parser-go"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/gallery"
 	"github.com/mudler/LocalAI/core/schema"
@@ -261,6 +264,13 @@ func (i *LlamaCPPImporter) Import(details Details) (gallery.ModelConfig, error)
 	// Apply per-model-family inference parameter defaults
 	config.ApplyInferenceDefaults(&modelConfig, details.URI)
 
+	// Auto-detect Multi-Token Prediction heads (ggml-org/llama.cpp#22673) and
+	// enable speculative decoding. Mirrors the load-time hook so freshly
+	// imported configs already carry spec_type:draft-mtp before the model is
+	// ever loaded - users see it in the YAML preview rather than discovering
+	// it after the first start.
+	maybeApplyMTPDefaults(&modelConfig, details, &cfg)
+
 	data, err := yaml.Marshal(modelConfig)
 	if err != nil {
 		return gallery.ModelConfig{}, err
@@ -291,6 +301,85 @@ func pickPreferredGroup(groups []hfapi.ShardGroup, prefs []string) *hfapi.ShardG
 	return &groups[len(groups)-1]
 }
 
+// maybeApplyMTPDefaults parses the picked GGUF header (range-fetched over
+// HTTP for HF/URL imports) and, if the file declares a Multi-Token Prediction
+// head, appends the auto-MTP option keys to modelConfig.Options. Failures
+// during the probe are non-fatal: the importer keeps the config without MTP
+// so an unrelated network blip or weird header doesn't break the import.
+//
+// OCI/Ollama URIs are skipped because the artifact isn't directly fetchable
+// as a GGUF byte stream - the load-time hook (core/config/gguf.go) covers
+// those once the model is materialised on disk.
+func maybeApplyMTPDefaults(modelConfig *config.ModelConfig, details Details, cfg *gallery.ModelConfig) {
+	probeURL := pickMTPProbeURL(details, cfg)
+	if probeURL == "" {
+		return
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+
+	defer func() {
+		if r := recover(); r != nil {
+			xlog.Debug("[mtp-importer] panic while probing GGUF header", "uri", probeURL, "recover", r)
+		}
+	}()
+
+	f, err := gguf.ParseGGUFFileRemote(ctx, probeURL)
+	if err != nil {
+		xlog.Debug("[mtp-importer] failed to read remote GGUF header for MTP detection", "uri", probeURL, "error", err)
+		return
+	}
+
+	n, ok := config.HasEmbeddedMTPHead(f)
+	if !ok {
+		return
+	}
+	config.ApplyMTPDefaults(modelConfig, n)
+}
+
+// pickMTPProbeURL returns an HTTP(S) URL pointing at the main (non-mmproj)
+// GGUF shard that should be inspected for an MTP head, or "" when no
+// suitable URL is available. Custom URI schemes (`huggingface://`,
+// `ollama://`, etc.) are run through `downloader.URI.ResolveURL` so the
+// resulting URL is something `gguf.ParseGGUFFileRemote` can actually open.
+// OCI/Ollama URIs are skipped because the artifact is not directly
+// streamable as a GGUF byte range.
+func pickMTPProbeURL(details Details, cfg *gallery.ModelConfig) string {
+	uri := downloader.URI(details.URI)
+
+	if uri.LooksLikeOCI() {
+		return ""
+	}
+
+	if strings.HasSuffix(strings.ToLower(details.URI), ".gguf") {
+		return resolveHTTPProbe(details.URI)
+	}
+
+	for _, f := range cfg.Files {
+		lower := strings.ToLower(f.Filename)
+		if strings.Contains(lower, "mmproj") {
+			continue
+		}
+		if !strings.HasSuffix(lower, ".gguf") {
+			continue
+		}
+		return resolveHTTPProbe(f.URI)
+	}
+	return ""
+}
+
+// resolveHTTPProbe resolves an importer-side URI to the HTTP(S) URL that
+// `gguf.ParseGGUFFileRemote` can range-fetch. Returns "" if the URI can't
+// be reduced to an HTTP(S) endpoint (e.g. local path, unsupported scheme).
+func resolveHTTPProbe(uri string) string {
+	resolved := downloader.URI(uri).ResolveURL()
+	if downloader.URI(resolved).LooksLikeHTTPURL() {
+		return resolved
+	}
+	return ""
+}
+
 // appendShardGroup copies every shard of group into cfg.Files under dest,
 // skipping any entry whose target filename is already present so repeated
 // calls (e.g. the rare case of mmproj + model picking the same group)
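Both hooks (importer and load-time) back off as soon as any `spec_type:` / `speculative_type:` entry is already present, so opting out is just a matter of pinning one explicitly. A minimal sketch; model name and file are placeholders:

```yaml
# An explicit spec_type always wins over auto-detection; spec_type:none
# disables speculative decoding even for MTP-capable GGUFs.
name: my-mtp-model
backend: llama-cpp
parameters:
  model: my-mtp-model.gguf
options:
  - spec_type:none
```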
diff --git a/docs/content/advanced/model-configuration.md b/docs/content/advanced/model-configuration.md
index 876c9e062..b53c33858 100644
--- a/docs/content/advanced/model-configuration.md
+++ b/docs/content/advanced/model-configuration.md
@@ -323,6 +323,7 @@ The canonical names match upstream llama.cpp (dash-separated). For backward comp
 | `none` | | No speculative decoding (default) |
 | `draft-simple` | `draft`, `draft_simple` | Draft model-based speculation (auto-set when `draft_model` is configured) |
 | `draft-eagle3` | `eagle3`, `draft_eagle3` | EAGLE3 draft model architecture |
+| `draft-mtp` | `draft_mtp` | Multi-Token Prediction. Reuses the target model's embedded MTP head; no separate draft GGUF required (`draft_model` can be omitted). |
 | `ngram-simple` | `ngram_simple` | Simple self-speculative using token history |
 | `ngram-map-k` | `ngram_map_k` | N-gram with key-only map |
 | `ngram-map-k4v` | `ngram_map_k4v` | N-gram with keys and 4 m-gram values |
@@ -335,6 +336,71 @@ Multiple types can be chained by passing a comma-separated list to `spec_type` (
 Speculative decoding is automatically disabled when multimodal models (with `mmproj`) are active. The `n_draft` parameter can also be overridden per-request.
 {{% /notice %}}
 
+##### Multi-Token Prediction (MTP)
+
+`draft-mtp` enables [Multi-Token Prediction](https://github.com/ggml-org/llama.cpp/pull/22673) (ggml-org/llama.cpp#22673). MTP uses a small prediction head trained into the target model: the head runs alongside the main forward pass and proposes the next few tokens, which the target then verifies in a single batched step. Upstream reports ~1.85x-2.1x token throughput at ~72-82% draft acceptance on Qwen3.6 27B / 35B A3B.
+
+**Auto-detection (default).** When a GGUF declares an MTP head (the upstream `<arch>.nextn_predict_layers` metadata key, set by `convert_hf_to_gguf.py` for Qwen3.5/3.6 family models and similar), LocalAI auto-enables MTP with the following defaults:
+
+```yaml
+options:
+  - spec_type:draft-mtp
+  - spec_n_max:6
+  - spec_p_min:0.75
+```
+
+Detection runs both at **import time** (the `/import-model` UI / `POST /models/import-uri` flow range-fetches the GGUF header and writes the options into the generated YAML before you save it) and at **load time** (every llama-cpp model start re-checks the local header and appends the options if `spec_type` isn't already set). To opt out, set an explicit `spec_type:` / `speculative_type:` in your YAML - auto-detection always preserves the user value, including `spec_type:none`.
+
+**Two ways to load the MTP head:**
+
+1. **Embedded in the target GGUF** (the recommended path for LocalAI, and what auto-detection assumes). When `spec_type` includes `draft-mtp` and `draft_model` is empty, the backend builds the MTP draft context directly from the target model's weights. The GGUF must have been converted with the MTP tensors included.
+2. **Separate `mtp-*.gguf` sibling file.** If you point `draft_model` at the separate MTP-head GGUF that ships next to the main weights on HuggingFace, the backend will load it as a draft model. Note: upstream's `-hf` auto-discovery of `mtp-*.gguf` siblings is **not** wired into LocalAI's gRPC layer - you need to download the sibling file and configure `draft_model` explicitly.
+
+**Manual override knobs** (overlap with the auto-detect defaults above):
+
+| Option | Recommended | Notes |
+|--------|------------|-------|
+| `spec_type` | `draft-mtp` | Activates MTP. Can be chained with other types (see below). |
+| `spec_n_max` / `draft_max` | `2`-`6` | Number of draft tokens per step. Upstream's PR suggests 2-3 for the tightest acceptance window; LocalAI's auto-default is 6 to favour throughput on models with high acceptance. |
+| `spec_p_min` | `0.75` | Pinned because upstream marks the current default with a "change to 0.0f" TODO; locking it here keeps acceptance thresholds stable across future llama.cpp bumps. |
+| `mmproj_use_gpu` | `false` (or unset `mmproj`) | MTP has a prompt-processing overhead; if the model is non-vision, drop the mmproj entirely to save VRAM. |
+
+**Minimal config** (override-only, since auto-detection already covers this for MTP-capable GGUFs):
+
+```yaml
+name: qwen3-mtp
+backend: llama-cpp
+parameters:
+  model: qwen3-27b-with-mtp.gguf
+options:
+  - spec_type:draft-mtp
+  - spec_n_max:3
+```
+
+**With a separate MTP head file:**
+
+```yaml
+name: qwen3-mtp
+backend: llama-cpp
+parameters:
+  model: qwen3-27b.gguf
+  draft_model: qwen3-27b-mtp-head.gguf
+options:
+  - spec_type:draft-mtp
+  - spec_n_max:3
+```
+
+**Chaining MTP with n-gram fallback** (experimental, from the PR's usage notes - useful when MTP acceptance drops on highly repetitive output):
+
+```yaml
+options:
+  - spec_type:draft-mtp,ngram-mod
+  - spec_n_max:3
+  - spec_ngram_mod_n_match:24
+```
+
+Pre-converted GGUFs with MTP heads are published on the [ggml-org HuggingFace org](https://huggingface.co/ggml-org) (initially Qwen3.6 27B and Qwen3.6 35B A3B).
+
 ### Reasoning Models (DeepSeek-R1, Qwen3, etc.)
 
 These load-time options control how the backend parses `<think>` reasoning blocks and how much budget the model is allowed for thinking. They are set per model via the `options:` array.