Files
LocalAI/core/config/gguf.go
LocalAI [bot] d77a9137d8 feat(llama-cpp): bump to MTP-merge SHA and automatically set MTP defaults (#9852)
* feat(llama-cpp): bump to MTP-merge SHA and document draft-mtp spec type

Update LLAMA_VERSION to 0253fb21 (post ggml-org/llama.cpp#22673 merge,
2026-05-16) to pick up Multi-Token Prediction support.

No grpc-server.cpp changes are required: the existing `spec_type` option
delegates to upstream's `common_speculative_types_from_names()`, which
already accepts the new `draft-mtp` name. The `n_rs_seq` cparam needed
by MTP is auto-derived inside `common_context_params_to_llama` from
`params.speculative.need_n_rs_seq()`, and when no `draft_model` is set
the upstream server builds the MTP context off the target model itself.

Docs: extend the speculative-decoding section of the model-configuration
guide with the new type, both load paths (MTP head embedded in the main
GGUF vs. separate `mtp-*.gguf` sibling), the PR's recommended
`spec_n_max:2-3`, and the chained `draft-mtp,ngram-mod` recipe. Also
notes that the upstream `-hf` auto-discovery of `mtp-*.gguf` siblings is
not wired through LocalAI's gRPC layer.

Agent guide: short note explaining that new upstream spec types are
picked up automatically and that MTP needs no gRPC plumbing.

Assisted-by: Claude:claude-opus-4-7 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(llama-cpp): auto-detect MTP heads and enable draft-mtp on import + load

Detect upstream's `<arch>.nextn_predict_layers` GGUF metadata key (set by
`convert_hf_to_gguf.py` for Qwen3.5/3.6 family models and similar) and,
when present and the user has not configured a `spec_type` explicitly,
auto-append the upstream-recommended speculative-decoding tuple:

  - spec_type:draft-mtp
  - spec_n_max:6
  - spec_p_min:0.75

The 0.75 p_min is pinned defensively because upstream marks the current
default with a "change to 0.0f" TODO; locking it here keeps acceptance
thresholds stable across future llama.cpp bumps.

Detection runs in two places:

  - The model importer (`POST /models/import-uri`, the `/import-model`
    UI) range-fetches the GGUF header for HuggingFace / direct-URL
    imports via `gguf.ParseGGUFFileRemote`, with a 30s timeout and
    non-fatal error handling. OCI/Ollama URIs are skipped because the
    artifact is not directly streamable; the load-time hook covers them
    once the file is on disk.
  - The llama-cpp load-time hook (`guessGGUFFromFile`) reads the local
    header on every model start and appends the same options if
    `spec_type` is not already set.

Both paths share `ApplyMTPDefaults` and respect an explicit user-set
`spec_type:` / `speculative_type:` so YAML overrides win. Ginkgo
specs cover the append, preserve-user-choice, legacy alias, and nil
safety paths.

Assisted-by: Claude:claude-opus-4-7 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(importer): resolve huggingface:// URIs before MTP header probe

`gguf.ParseGGUFFileRemote` only speaks HTTP(S), but the importer was
handing it the raw `huggingface://...` URI directly (and similarly for
any other custom downloader scheme). Live-test against
`huggingface://ggml-org/Qwen3.6-27B-MTP-GGUF/Qwen3.6-27B-MTP-Q8_0.gguf`
exposed this: the probe failed with `unsupported protocol scheme
"huggingface"`, was caught by the non-fatal error path, and the MTP
options were silently never applied to the generated YAML.

Route every candidate URI through `downloader.URI.ResolveURL()` and
require the resolved form to be HTTP(S). After the fix the probe
successfully reads `<arch>.nextn_predict_layers=1` from the real HF
GGUF and the emitted ConfigFile carries spec_type:draft-mtp,
spec_n_max:6, spec_p_min:0.75 as intended.

Assisted-by: Claude:claude-opus-4-7 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2026-05-16 22:42:48 +02:00

209 lines
7.9 KiB
Go

package config
import (
"context"
"github.com/mudler/LocalAI/pkg/functions"
"github.com/mudler/LocalAI/pkg/grpc"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/LocalAI/pkg/reasoning"
"github.com/mudler/LocalAI/pkg/xsysinfo"
"github.com/mudler/xlog"
gguf "github.com/gpustack/gguf-parser-go"
"github.com/gpustack/gguf-parser-go/util/ptr"
)
const (
defaultContextSize = 1024
defaultNGPULayers = 99999999
)
func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
if defaultCtx == 0 && cfg.ContextSize == nil {
ctxSize := f.EstimateLLaMACppRun().ContextSize
if ctxSize > 0 {
cSize := int(ctxSize)
cfg.ContextSize = &cSize
} else {
defaultCtx = defaultContextSize
cfg.ContextSize = &defaultCtx
}
}
// GPU options
if cfg.Options == nil {
if xsysinfo.HasGPU("nvidia") || xsysinfo.HasGPU("amd") {
cfg.Options = []string{"gpu"}
}
}
if cfg.NGPULayers == nil {
// we assume we want to offload all layers
defaultHigh := defaultNGPULayers
cfg.NGPULayers = &defaultHigh
}
xlog.Debug("[gguf] guessDefaultsFromFile: NGPULayers set", "NGPULayers", cfg.NGPULayers, "modelName", f.Metadata().Name)
// identify from well known templates first, otherwise use the raw jinja template
chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
if found {
// fill jinja template
cfg.modelTemplate = chatTemplate.ValueString()
}
// Auto-enable Multi-Token Prediction (ggml-org/llama.cpp#22673) when the
// GGUF carries an embedded MTP head. Skipped silently for non-MTP models
// and when the user already configured a spec_type.
if n, ok := HasEmbeddedMTPHead(f); ok {
ApplyMTPDefaults(cfg, n)
}
// Thinking support detection is done after model load via DetectThinkingSupportFromBackend
// template estimations
if cfg.HasTemplate() {
// nothing to guess here
xlog.Debug("[gguf] guessDefaultsFromFile: template already set", "name", cfg.Name, "modelName", f.Metadata().Name)
return
}
xlog.Debug("[gguf] Model file loaded", "file", cfg.ModelFileName(), "eosTokenID", f.Tokenizer().EOSTokenID, "bosTokenID", f.Tokenizer().BOSTokenID, "modelName", f.Metadata().Name, "architecture", f.Architecture().Architecture)
// guess the name
if cfg.Name == "" {
cfg.Name = f.Metadata().Name
}
// Instruct to use template from llama.cpp
cfg.TemplateConfig.UseTokenizerTemplate = true
cfg.FunctionsConfig.GrammarConfig.NoGrammar = true
cfg.Options = append(cfg.Options, "use_jinja:true")
cfg.KnownUsecaseStrings = append(cfg.KnownUsecaseStrings, "FLAG_CHAT")
// Apply per-model-family inference parameter defaults (temperature, top_p, etc.)
ApplyInferenceDefaults(cfg, f.Metadata().Name)
}
// DetectThinkingSupportFromBackend calls the ModelMetadata gRPC method to detect
// if the model supports thinking mode and if the template ends with a thinking start token.
// This should be called after the model is loaded.
// The results are stored in cfg.SupportsThinking and cfg.ThinkingForcedOpen.
// The backend-reported multimodal marker is also captured into cfg.MediaMarker.
func DetectThinkingSupportFromBackend(ctx context.Context, cfg *ModelConfig, backendClient grpc.Backend, modelOptions *pb.ModelOptions) {
if backendClient == nil {
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: backend client is nil, skipping detection")
return
}
if modelOptions == nil {
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: model options is nil, skipping detection")
return
}
// Only llama-cpp exposes ModelMetadata today. Other backends will either error
// or return an empty response — both are fine, we just bail before calling.
if cfg.Backend != "llama-cpp" {
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: skipping detection", "backend", cfg.Backend)
return
}
metadata, err := backendClient.ModelMetadata(ctx, modelOptions)
if err != nil {
xlog.Warn("[gguf] DetectThinkingSupportFromBackend: failed to get model metadata", "error", err)
return
}
if metadata != nil {
// The multimodal media marker is backend-controlled (llama.cpp may pick a
// random per-server string). Empty means "no mtmd context" — Go falls back
// to templates.DefaultMultiMediaMarker at render time.
if metadata.MediaMarker != "" {
cfg.MediaMarker = metadata.MediaMarker
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: media marker captured", "marker", metadata.MediaMarker)
}
// Thinking / tool-format detection only applies when we rely on the
// backend-side tokenizer template — otherwise the rendered-template based
// heuristics below aren't meaningful.
if !cfg.TemplateConfig.UseTokenizerTemplate {
return
}
applyDetectedThinkingConfig(cfg, metadata)
// Extract tool format markers from autoparser analysis
if tf := metadata.GetToolFormat(); tf != nil && tf.FormatType != "" {
cfg.FunctionsConfig.ToolFormatMarkers = &functions.ToolFormatMarkers{
FormatType: tf.FormatType,
SectionStart: tf.SectionStart,
SectionEnd: tf.SectionEnd,
PerCallStart: tf.PerCallStart,
PerCallEnd: tf.PerCallEnd,
FuncNamePrefix: tf.FuncNamePrefix,
FuncNameSuffix: tf.FuncNameSuffix,
FuncClose: tf.FuncClose,
ArgNamePrefix: tf.ArgNamePrefix,
ArgNameSuffix: tf.ArgNameSuffix,
ArgValuePrefix: tf.ArgValuePrefix,
ArgValueSuffix: tf.ArgValueSuffix,
ArgSeparator: tf.ArgSeparator,
ArgsStart: tf.ArgsStart,
ArgsEnd: tf.ArgsEnd,
NameField: tf.NameField,
ArgsField: tf.ArgsField,
IDField: tf.IdField,
FunNameIsKey: tf.FunNameIsKey,
ToolsArrayWrapped: tf.ToolsArrayWrapped,
FunctionField: tf.FunctionField,
ParameterOrder: tf.ParameterOrder,
GenIDField: tf.GenIdField,
CallIDPosition: tf.CallIdPosition,
CallIDPrefix: tf.CallIdPrefix,
CallIDSuffix: tf.CallIdSuffix,
ReasoningStart: tf.ReasoningStart,
ReasoningEnd: tf.ReasoningEnd,
ContentStart: tf.ContentStart,
ContentEnd: tf.ContentEnd,
}
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: tool format markers detected",
"format_type", tf.FormatType,
"section_start", tf.SectionStart,
"func_name_prefix", tf.FuncNamePrefix)
}
}
}
func applyDetectedThinkingConfig(cfg *ModelConfig, metadata *pb.ModelMetadataResponse) {
if cfg == nil || metadata == nil {
return
}
// Respect explicit YAML/user config. Backend probing should only fill defaults
// when the reasoning mode has not already been set.
if cfg.ReasoningConfig.DisableReasoning == nil {
cfg.ReasoningConfig.DisableReasoning = ptr.To(!metadata.SupportsThinking)
}
// Respect explicit prefill config for the same reason. Only infer the
// default prefill behavior when the user did not set it.
if cfg.ReasoningConfig.DisableReasoningTagPrefill == nil {
// Use the rendered template to detect if thinking token is at the end.
// This reuses the existing DetectThinkingStartToken function.
if metadata.RenderedTemplate != "" {
thinkingStartToken := reasoning.DetectThinkingStartToken(metadata.RenderedTemplate, &cfg.ReasoningConfig)
thinkingForcedOpen := thinkingStartToken != ""
cfg.ReasoningConfig.DisableReasoningTagPrefill = ptr.To(!thinkingForcedOpen)
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: thinking support detected", "supports_thinking", metadata.SupportsThinking, "thinking_forced_open", thinkingForcedOpen, "thinking_start_token", thinkingStartToken)
} else {
cfg.ReasoningConfig.DisableReasoningTagPrefill = ptr.To(true)
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: thinking support detected", "supports_thinking", metadata.SupportsThinking, "thinking_forced_open", false)
}
return
}
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: preserving explicit reasoning config", "supports_thinking", metadata.SupportsThinking, "disable_reasoning", *cfg.ReasoningConfig.DisableReasoning, "disable_reasoning_tag_prefill", *cfg.ReasoningConfig.DisableReasoningTagPrefill)
}