mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-18 05:33:09 -04:00
* feat(llama-cpp): bump to MTP-merge SHA and document draft-mtp spec type Update LLAMA_VERSION to 0253fb21 (post ggml-org/llama.cpp#22673 merge, 2026-05-16) to pick up Multi-Token Prediction support. No grpc-server.cpp changes are required: the existing `spec_type` option delegates to upstream's `common_speculative_types_from_names()`, which already accepts the new `draft-mtp` name. The `n_rs_seq` cparam needed by MTP is auto-derived inside `common_context_params_to_llama` from `params.speculative.need_n_rs_seq()`, and when no `draft_model` is set the upstream server builds the MTP context off the target model itself. Docs: extend the speculative-decoding section of the model-configuration guide with the new type, both load paths (MTP head embedded in the main GGUF vs. separate `mtp-*.gguf` sibling), the PR's recommended `spec_n_max:2-3`, and the chained `draft-mtp,ngram-mod` recipe. Also notes that the upstream `-hf` auto-discovery of `mtp-*.gguf` siblings is not wired through LocalAI's gRPC layer. Agent guide: short note explaining that new upstream spec types are picked up automatically and that MTP needs no gRPC plumbing. Assisted-by: Claude:claude-opus-4-7 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(llama-cpp): auto-detect MTP heads and enable draft-mtp on import + load Detect upstream's `<arch>.nextn_predict_layers` GGUF metadata key (set by `convert_hf_to_gguf.py` for Qwen3.5/3.6 family models and similar) and, when present and the user has not configured a `spec_type` explicitly, auto-append the upstream-recommended speculative-decoding tuple: - spec_type:draft-mtp - spec_n_max:6 - spec_p_min:0.75 The 0.75 p_min is pinned defensively because upstream marks the current default with a "change to 0.0f" TODO; locking it here keeps acceptance thresholds stable across future llama.cpp bumps. 
Detection runs in two places: - The model importer (`POST /models/import-uri`, the `/import-model` UI) range-fetches the GGUF header for HuggingFace / direct-URL imports via `gguf.ParseGGUFFileRemote`, with a 30s timeout and non-fatal error handling. OCI/Ollama URIs are skipped because the artifact is not directly streamable; the load-time hook covers them once the file is on disk. - The llama-cpp load-time hook (`guessGGUFFromFile`) reads the local header on every model start and appends the same options if `spec_type` is not already set. Both paths share `ApplyMTPDefaults` and respect an explicit user-set `spec_type:` / `speculative_type:` so YAML overrides win. Ginkgo specs cover the append, preserve-user-choice, legacy alias, and nil safety paths. Assisted-by: Claude:claude-opus-4-7 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(importer): resolve huggingface:// URIs before MTP header probe `gguf.ParseGGUFFileRemote` only speaks HTTP(S), but the importer was handing it the raw `huggingface://...` URI directly (and similarly for any other custom downloader scheme). Live-test against `huggingface://ggml-org/Qwen3.6-27B-MTP-GGUF/Qwen3.6-27B-MTP-Q8_0.gguf` exposed this: the probe failed with `unsupported protocol scheme "huggingface"`, was caught by the non-fatal error path, and the MTP options were silently never applied to the generated YAML. Route every candidate URI through `downloader.URI.ResolveURL()` and require the resolved form to be HTTP(S). After the fix the probe successfully reads `<arch>.nextn_predict_layers=1` from the real HF GGUF and the emitted ConfigFile carries spec_type:draft-mtp, spec_n_max:6, spec_p_min:0.75 as intended. Assisted-by: Claude:claude-opus-4-7 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
407 lines
12 KiB
Go
407 lines
12 KiB
Go
package importers
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
|
|
gguf "github.com/gpustack/gguf-parser-go"
|
|
"github.com/mudler/LocalAI/core/config"
|
|
"github.com/mudler/LocalAI/core/gallery"
|
|
"github.com/mudler/LocalAI/core/schema"
|
|
"github.com/mudler/LocalAI/pkg/downloader"
|
|
"github.com/mudler/LocalAI/pkg/functions"
|
|
hfapi "github.com/mudler/LocalAI/pkg/huggingface-api"
|
|
"github.com/mudler/xlog"
|
|
"go.yaml.in/yaml/v2"
|
|
)
|
|
|
|
var (
|
|
_ Importer = &LlamaCPPImporter{}
|
|
_ AdditionalBackendsProvider = &LlamaCPPImporter{}
|
|
)
|
|
|
|
// LlamaCPPImporter is the importer for the llama-cpp backend. It holds no
// state; everything it does is implemented by its methods.
type LlamaCPPImporter struct{}

// Name returns the backend identifier, "llama-cpp".
func (i *LlamaCPPImporter) Name() string {
	return "llama-cpp"
}

// Modality returns the modality label for this importer, "text".
func (i *LlamaCPPImporter) Modality() string {
	return "text"
}

// AutoDetects always reports true.
// NOTE(review): presumably this opts the importer into Match-based
// auto-detection - confirm against the Importer interface contract.
func (i *LlamaCPPImporter) AutoDetects() bool {
	return true
}
|
|
|
|
// AdditionalBackends advertises drop-in replacements that share the
|
|
// llama-cpp detection logic. They are preference-only: selecting one
|
|
// from the import form swaps the emitted YAML backend field but reuses
|
|
// the llama-cpp Match/Import pipeline.
|
|
func (i *LlamaCPPImporter) AdditionalBackends() []KnownBackendEntry {
|
|
return []KnownBackendEntry{
|
|
{Name: "ik-llama-cpp", Modality: "text", Description: "GGUF drop-in replacement for llama-cpp with ik-quants"},
|
|
{Name: "turboquant", Modality: "text", Description: "GGUF drop-in replacement for llama-cpp with TurboQuant optimizations"},
|
|
}
|
|
}
|
|
|
|
func (i *LlamaCPPImporter) Match(details Details) bool {
|
|
preferences, err := details.Preferences.MarshalJSON()
|
|
if err != nil {
|
|
xlog.Error("failed to marshal preferences", "error", err)
|
|
return false
|
|
}
|
|
|
|
preferencesMap := make(map[string]any)
|
|
|
|
if len(preferences) > 0 {
|
|
err = json.Unmarshal(preferences, &preferencesMap)
|
|
if err != nil {
|
|
xlog.Error("failed to unmarshal preferences", "error", err)
|
|
return false
|
|
}
|
|
}
|
|
|
|
uri := downloader.URI(details.URI)
|
|
|
|
if preferencesMap["backend"] == "llama-cpp" {
|
|
return true
|
|
}
|
|
|
|
if strings.HasSuffix(details.URI, ".gguf") {
|
|
return true
|
|
}
|
|
|
|
if uri.LooksLikeOCI() {
|
|
return true
|
|
}
|
|
|
|
if details.HuggingFace != nil {
|
|
for _, file := range details.HuggingFace.Files {
|
|
if strings.HasSuffix(file.Path, ".gguf") {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
func (i *LlamaCPPImporter) Import(details Details) (gallery.ModelConfig, error) {
|
|
|
|
xlog.Debug("llama.cpp importer matched", "uri", details.URI)
|
|
|
|
preferences, err := details.Preferences.MarshalJSON()
|
|
if err != nil {
|
|
return gallery.ModelConfig{}, err
|
|
}
|
|
preferencesMap := make(map[string]any)
|
|
if len(preferences) > 0 {
|
|
err = json.Unmarshal(preferences, &preferencesMap)
|
|
if err != nil {
|
|
return gallery.ModelConfig{}, err
|
|
}
|
|
}
|
|
|
|
name, ok := preferencesMap["name"].(string)
|
|
if !ok {
|
|
name = filepath.Base(details.URI)
|
|
}
|
|
|
|
description, ok := preferencesMap["description"].(string)
|
|
if !ok {
|
|
description = "Imported from " + details.URI
|
|
}
|
|
|
|
preferedQuantizations, _ := preferencesMap["quantizations"].(string)
|
|
quants := []string{"q4_k_m"}
|
|
if preferedQuantizations != "" {
|
|
quants = strings.Split(preferedQuantizations, ",")
|
|
}
|
|
|
|
mmprojQuants, _ := preferencesMap["mmproj_quantizations"].(string)
|
|
mmprojQuantsList := []string{"fp16"}
|
|
if mmprojQuants != "" {
|
|
mmprojQuantsList = strings.Split(mmprojQuants, ",")
|
|
}
|
|
|
|
embeddings, _ := preferencesMap["embeddings"].(string)
|
|
|
|
// Honour drop-in replacement preferences. Only the curated names
|
|
// advertised via AdditionalBackends() are accepted; anything else
|
|
// (including "llama-cpp" itself, or an unknown value) keeps the
|
|
// default backend field so arbitrary input can't leak through. See
|
|
// the AdditionalBackends method for the canonical list.
|
|
backend := "llama-cpp"
|
|
if b, ok := preferencesMap["backend"].(string); ok {
|
|
switch b {
|
|
case "ik-llama-cpp", "turboquant":
|
|
backend = b
|
|
}
|
|
}
|
|
|
|
modelConfig := config.ModelConfig{
|
|
Name: name,
|
|
Description: description,
|
|
KnownUsecaseStrings: []string{config.UsecaseChat},
|
|
Options: []string{"use_jinja:true"},
|
|
Backend: backend,
|
|
TemplateConfig: config.TemplateConfig{
|
|
UseTokenizerTemplate: true,
|
|
},
|
|
FunctionsConfig: functions.FunctionsConfig{
|
|
GrammarConfig: functions.GrammarConfig{
|
|
NoGrammar: true,
|
|
},
|
|
AutomaticToolParsingFallback: true,
|
|
},
|
|
}
|
|
|
|
if embeddings != "" && strings.ToLower(embeddings) == "true" || strings.ToLower(embeddings) == "yes" {
|
|
trueV := true
|
|
modelConfig.Embeddings = &trueV
|
|
}
|
|
|
|
cfg := gallery.ModelConfig{
|
|
Name: name,
|
|
Description: description,
|
|
}
|
|
|
|
uri := downloader.URI(details.URI)
|
|
|
|
switch {
|
|
case uri.LooksLikeOCI():
|
|
ociName := strings.TrimPrefix(string(uri), downloader.OCIPrefix)
|
|
ociName = strings.TrimPrefix(ociName, downloader.OllamaPrefix)
|
|
ociName = strings.ReplaceAll(ociName, "/", "__")
|
|
ociName = strings.ReplaceAll(ociName, ":", "__")
|
|
cfg.Files = append(cfg.Files, gallery.File{
|
|
URI: details.URI,
|
|
Filename: ociName,
|
|
})
|
|
modelConfig.PredictionOptions = schema.PredictionOptions{
|
|
BasicModelRequest: schema.BasicModelRequest{
|
|
Model: ociName,
|
|
},
|
|
}
|
|
case uri.LooksLikeURL() && strings.HasSuffix(details.URI, ".gguf"):
|
|
// Extract filename from URL
|
|
fileName, e := uri.FilenameFromUrl()
|
|
if e != nil {
|
|
return gallery.ModelConfig{}, e
|
|
}
|
|
|
|
cfg.Files = append(cfg.Files, gallery.File{
|
|
URI: details.URI,
|
|
Filename: fileName,
|
|
})
|
|
modelConfig.PredictionOptions = schema.PredictionOptions{
|
|
BasicModelRequest: schema.BasicModelRequest{
|
|
Model: fileName,
|
|
},
|
|
}
|
|
case strings.HasSuffix(details.URI, ".gguf"):
|
|
cfg.Files = append(cfg.Files, gallery.File{
|
|
URI: details.URI,
|
|
Filename: filepath.Base(details.URI),
|
|
})
|
|
modelConfig.PredictionOptions = schema.PredictionOptions{
|
|
BasicModelRequest: schema.BasicModelRequest{
|
|
Model: filepath.Base(details.URI),
|
|
},
|
|
}
|
|
case details.HuggingFace != nil:
|
|
// Split the repo listing into mmproj vs plain GGUF files, then group
|
|
// shards so every multi-part GGUF (llama.cpp `-NNNNN-of-MMMMM.gguf`
|
|
// pattern) is treated as one logical selection candidate. The
|
|
// previous implementation picked files one at a time, so sharded
|
|
// models ended up with only the last part referenced in the gallery
|
|
// entry — useless to llama.cpp, which needs shard 1 and the whole
|
|
// set to load a split model.
|
|
var mmprojFiles, ggufFiles []hfapi.ModelFile
|
|
for _, f := range details.HuggingFace.Files {
|
|
lowerPath := strings.ToLower(f.Path)
|
|
switch {
|
|
case strings.Contains(lowerPath, "mmproj"):
|
|
mmprojFiles = append(mmprojFiles, f)
|
|
case strings.HasSuffix(lowerPath, ".gguf"):
|
|
ggufFiles = append(ggufFiles, f)
|
|
}
|
|
}
|
|
|
|
mmprojGroups := hfapi.GroupShards(mmprojFiles)
|
|
ggufGroups := hfapi.GroupShards(ggufFiles)
|
|
|
|
// Emit the model group first so cfg.Files[0] is the model — callers
|
|
// and tests rely on the model file preceding any mmproj companion.
|
|
if group := pickPreferredGroup(ggufGroups, quants); group != nil {
|
|
appendShardGroup(&cfg, *group, filepath.Join("llama-cpp", "models", name))
|
|
}
|
|
if group := pickPreferredGroup(mmprojGroups, mmprojQuantsList); group != nil {
|
|
appendShardGroup(&cfg, *group, filepath.Join("llama-cpp", "mmproj", name))
|
|
}
|
|
|
|
// Find first mmproj file and configure it in the config file
|
|
for _, file := range cfg.Files {
|
|
if !strings.Contains(strings.ToLower(file.Filename), "mmproj") {
|
|
continue
|
|
}
|
|
modelConfig.MMProj = file.Filename
|
|
break
|
|
}
|
|
|
|
// Find first non-mmproj file and configure it in the config file.
|
|
// For sharded models this is shard 1 — llama.cpp's split loader
|
|
// discovers the remaining shards by filename pattern from there.
|
|
for _, file := range cfg.Files {
|
|
if strings.Contains(strings.ToLower(file.Filename), "mmproj") {
|
|
continue
|
|
}
|
|
modelConfig.PredictionOptions = schema.PredictionOptions{
|
|
BasicModelRequest: schema.BasicModelRequest{
|
|
Model: file.Filename,
|
|
},
|
|
}
|
|
break
|
|
}
|
|
}
|
|
|
|
// Apply per-model-family inference parameter defaults
|
|
config.ApplyInferenceDefaults(&modelConfig, details.URI)
|
|
|
|
// Auto-detect Multi-Token Prediction heads (ggml-org/llama.cpp#22673) and
|
|
// enable speculative decoding. Mirrors the load-time hook so freshly
|
|
// imported configs already carry spec_type:draft-mtp before the model is
|
|
// ever loaded - users see it in the YAML preview rather than discovering
|
|
// it after the first start.
|
|
maybeApplyMTPDefaults(&modelConfig, details, &cfg)
|
|
|
|
data, err := yaml.Marshal(modelConfig)
|
|
if err != nil {
|
|
return gallery.ModelConfig{}, err
|
|
}
|
|
|
|
cfg.ConfigFile = string(data)
|
|
|
|
return cfg, nil
|
|
}
|
|
|
|
// pickPreferredGroup walks the preference list in priority order and returns
|
|
// the first group whose base filename contains any preference. When nothing
|
|
// matches, the last group wins — this preserves the historical "if the user
|
|
// asked for a quant we don't have, fall back to whatever's available"
|
|
// behaviour, lifted to whole shard sets.
|
|
func pickPreferredGroup(groups []hfapi.ShardGroup, prefs []string) *hfapi.ShardGroup {
|
|
if len(groups) == 0 {
|
|
return nil
|
|
}
|
|
for _, pref := range prefs {
|
|
lower := strings.ToLower(pref)
|
|
for i := range groups {
|
|
if strings.Contains(strings.ToLower(groups[i].Base), lower) {
|
|
return &groups[i]
|
|
}
|
|
}
|
|
}
|
|
return &groups[len(groups)-1]
|
|
}
|
|
|
|
// maybeApplyMTPDefaults parses the picked GGUF header (range-fetched over
|
|
// HTTP for HF/URL imports) and, if the file declares a Multi-Token Prediction
|
|
// head, appends the auto-MTP option keys to modelConfig.Options. Failures
|
|
// during the probe are non-fatal: the importer keeps the config without MTP
|
|
// so an unrelated network blip or weird header doesn't break the import.
|
|
//
|
|
// OCI/Ollama URIs are skipped because the artifact isn't directly fetchable
|
|
// as a GGUF byte stream - the load-time hook (core/config/gguf.go) covers
|
|
// those once the model is materialised on disk.
|
|
func maybeApplyMTPDefaults(modelConfig *config.ModelConfig, details Details, cfg *gallery.ModelConfig) {
|
|
probeURL := pickMTPProbeURL(details, cfg)
|
|
if probeURL == "" {
|
|
return
|
|
}
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
|
defer cancel()
|
|
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
xlog.Debug("[mtp-importer] panic while probing GGUF header", "uri", probeURL, "recover", r)
|
|
}
|
|
}()
|
|
|
|
f, err := gguf.ParseGGUFFileRemote(ctx, probeURL)
|
|
if err != nil {
|
|
xlog.Debug("[mtp-importer] failed to read remote GGUF header for MTP detection", "uri", probeURL, "error", err)
|
|
return
|
|
}
|
|
|
|
n, ok := config.HasEmbeddedMTPHead(f)
|
|
if !ok {
|
|
return
|
|
}
|
|
config.ApplyMTPDefaults(modelConfig, n)
|
|
}
|
|
|
|
// pickMTPProbeURL returns an HTTP(S) URL pointing at the main (non-mmproj)
|
|
// GGUF shard that should be inspected for an MTP head, or "" when no
|
|
// suitable URL is available. Custom URI schemes (`huggingface://`,
|
|
// `ollama://`, etc.) are run through `downloader.URI.ResolveURL` so the
|
|
// resulting URL is something `gguf.ParseGGUFFileRemote` can actually open.
|
|
// OCI/Ollama URIs are skipped because the artifact is not directly
|
|
// streamable as a GGUF byte range.
|
|
func pickMTPProbeURL(details Details, cfg *gallery.ModelConfig) string {
|
|
uri := downloader.URI(details.URI)
|
|
|
|
if uri.LooksLikeOCI() {
|
|
return ""
|
|
}
|
|
|
|
if strings.HasSuffix(strings.ToLower(details.URI), ".gguf") {
|
|
return resolveHTTPProbe(details.URI)
|
|
}
|
|
|
|
for _, f := range cfg.Files {
|
|
lower := strings.ToLower(f.Filename)
|
|
if strings.Contains(lower, "mmproj") {
|
|
continue
|
|
}
|
|
if !strings.HasSuffix(lower, ".gguf") {
|
|
continue
|
|
}
|
|
return resolveHTTPProbe(f.URI)
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// resolveHTTPProbe resolves an importer-side URI to the HTTP(S) URL that
|
|
// `gguf.ParseGGUFFileRemote` can range-fetch. Returns "" if the URI can't
|
|
// be reduced to an HTTP(S) endpoint (e.g. local path, unsupported scheme).
|
|
func resolveHTTPProbe(uri string) string {
|
|
resolved := downloader.URI(uri).ResolveURL()
|
|
if downloader.URI(resolved).LooksLikeHTTPURL() {
|
|
return resolved
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// appendShardGroup copies every shard of group into cfg.Files under dest,
|
|
// skipping any entry whose target filename is already present so repeated
|
|
// calls (e.g. the rare case of mmproj + model picking the same group)
|
|
// don't produce duplicates.
|
|
func appendShardGroup(cfg *gallery.ModelConfig, group hfapi.ShardGroup, dest string) {
|
|
for _, f := range group.Files {
|
|
target := filepath.Join(dest, filepath.Base(f.Path))
|
|
duplicate := false
|
|
for _, existing := range cfg.Files {
|
|
if existing.Filename == target {
|
|
duplicate = true
|
|
break
|
|
}
|
|
}
|
|
if duplicate {
|
|
continue
|
|
}
|
|
cfg.Files = append(cfg.Files, gallery.File{
|
|
URI: f.URL,
|
|
Filename: target,
|
|
SHA256: f.SHA256,
|
|
})
|
|
}
|
|
}
|