feat(importers): add vibevoice-cpp importer for GGUF bundles (#9685)

Routes mudler/vibevoice.cpp-models and similar repos to the vibevoice-cpp backend. Detects via repo name ("vibevoice.cpp"/"vibevoice-cpp"), file listing (vibevoice-*.gguf + tokenizer.gguf), or preferences.backend override. Defaults to the realtime TTS model; preferences.usecase=asr selects the ASR/diarization variant. Bundles the required tokenizer.gguf and (for TTS) a voice prompt, emitting the Options[] entries the backend expects. Registered ahead of VibeVoiceImporter so the C++ bundles aren't swallowed by the older Python-backend substring match. Assisted-by: claude-code:claude-opus-4-7 [Read] [Edit] [Write] [Bash] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2026-05-16 20:52:08 -04:00 · 2026-05-06 13:33:10 +02:00
parent a8d7d37a3c
commit 6d56bf98fe
3 changed files with 620 additions and 0 deletions
--- a/core/gallery/importers/importers.go
+++ b/core/gallery/importers/importers.go
@@ -125,6 +125,10 @@ var defaultImporters = []Importer{
 	&KittenTTSImporter{},
 	&NeuTTSImporter{},
 	&ChatterboxImporter{},
+	// VibeVoiceCppImporter must precede VibeVoiceImporter — the older
+	// Python-backend importer matches any repo name containing "vibevoice"
+	// and would otherwise swallow the C++ port's GGUF bundles.
+	&VibeVoiceCppImporter{},
 	&VibeVoiceImporter{},
 	&CoquiImporter{},
 	// Image/Video (Batch 3)
--- a/core/gallery/importers/vibevoice-cpp.go
+++ b/core/gallery/importers/vibevoice-cpp.go
@@ -0,0 +1,355 @@
+package importers
+
+import (
+	"encoding/json"
+	"path/filepath"
+	"strings"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/gallery"
+	"github.com/mudler/LocalAI/core/schema"
+	hfapi "github.com/mudler/LocalAI/pkg/huggingface-api"
+	"go.yaml.in/yaml/v2"
+)
+
+// Compile-time check that *VibeVoiceCppImporter satisfies Importer.
+var _ Importer = &VibeVoiceCppImporter{}
+
+// VibeVoiceCppImporter recognises the GGUF bundle that the vibevoice.cpp
+// backend consumes — primary model file (vibevoice-realtime-*.gguf for TTS or
+// vibevoice-asr-*.gguf for ASR), a sibling tokenizer.gguf (always required),
+// and optional voice-*.gguf prompts for TTS voice cloning. Detection fires on
+// the HF repo name containing "vibevoice.cpp"/"vibevoice-cpp", or on the
+// presence of a vibevoice-*.gguf + tokenizer.gguf pair. Setting
+// preferences.backend="vibevoice-cpp" forces the importer regardless of
+// artefacts.
+//
+// Role pick: defaults to TTS (the realtime model is small and the common
+// case). preferences.usecase="asr" routes to the ASR/diarization model. If a
+// repo only ships one of the two roles and no usecase is given, that role
+// wins automatically.
+//
+// MUST be registered ahead of VibeVoiceImporter — the older Python-backed
+// importer matches any repo with "vibevoice" in the name, which would
+// otherwise swallow the C++ bundle.
+type VibeVoiceCppImporter struct{}
+
+// Name, Modality and AutoDetects report fixed metadata for this importer:
+// the name "vibevoice-cpp", the modality "tts" and auto-detection enabled.
+// Modality is always "tts", even though Import can also emit an ASR
+// ("transcript") configuration.
+func (i *VibeVoiceCppImporter) Name() string      { return "vibevoice-cpp" }
+func (i *VibeVoiceCppImporter) Modality() string  { return "tts" }
+func (i *VibeVoiceCppImporter) AutoDetects() bool { return true }
+
+// Match reports whether details describe a vibevoice.cpp GGUF bundle. Three
+// signals are tried in order, and the first hit wins:
+//
+//  1. preferences.backend is exactly "vibevoice-cpp";
+//  2. the repo name (from HF metadata or parsed from the URI) contains
+//     "vibevoice.cpp" or "vibevoice-cpp", compared case-insensitively;
+//  3. the HF file listing holds both tokenizer.gguf and a vibevoice-*.gguf.
+func (i *VibeVoiceCppImporter) Match(details Details) bool {
+	prefs := unmarshalPreferences(details.Preferences)
+	if stringPref(prefs, "backend") == "vibevoice-cpp" {
+		return true
+	}
+
+	// The canonical naming for the C++ port carries one of these two markers.
+	looksLikeCppRepo := func(repo string) bool {
+		lowered := strings.ToLower(repo)
+		return strings.Contains(lowered, "vibevoice.cpp") ||
+			strings.Contains(lowered, "vibevoice-cpp")
+	}
+
+	candidates := []string{repoNameOnly(details)}
+	if _, uriRepo, found := HFOwnerRepoFromURI(details.URI); found {
+		candidates = append(candidates, uriRepo)
+	}
+	for _, candidate := range candidates {
+		if looksLikeCppRepo(candidate) {
+			return true
+		}
+	}
+
+	// File-listing signal: the Python VibeVoice fork distributes safetensors,
+	// so a vibevoice-*.gguf next to tokenizer.gguf points at the C++ backend.
+	hf := details.HuggingFace
+	if hf == nil {
+		return false
+	}
+	return HasFile(hf.Files, "tokenizer.gguf") && hasVibeVoiceGGUF(hf.Files)
+}
+
+func (i *VibeVoiceCppImporter) Import(details Details) (gallery.ModelConfig, error) {
+	preferencesMap := unmarshalPreferences(details.Preferences)
+
+	name, ok := preferencesMap["name"].(string)
+	if !ok {
+		name = filepath.Base(details.URI)
+	}
+
+	description, ok := preferencesMap["description"].(string)
+	if !ok {
+		description = "Imported from " + details.URI
+	}
+
+	// Quant preference — default order matches what mudler/vibevoice.cpp-models
+	// ships today. Same comma-separated convention as whisper / llama-cpp.
+	quants := []string{"q8_0", "q4_k", "q5_k", "q4_0"}
+	if preferred, ok := preferencesMap["quantizations"].(string); ok && preferred != "" {
+		quants = strings.Split(preferred, ",")
+	}
+
+	usecase := strings.ToLower(stringPref(preferencesMap, "usecase"))
+
+	cfg := gallery.ModelConfig{
+		Name:        name,
+		Description: description,
+	}
+
+	modelConfig := config.ModelConfig{
+		Name:        name,
+		Description: description,
+		Backend:     "vibevoice-cpp",
+	}
+
+	// Without HF metadata we can only emit a skeleton config — the user must
+	// edit it post-import to point at real files. Mirrors whisper's bare-URI
+	// fallback so preference-only invocations still produce something usable.
+	if details.HuggingFace == nil {
+		modelConfig.PredictionOptions = schema.PredictionOptions{
+			BasicModelRequest: schema.BasicModelRequest{Model: filepath.Base(details.URI)},
+		}
+		if usecase == "asr" {
+			modelConfig.KnownUsecaseStrings = []string{"transcript"}
+			modelConfig.Options = []string{"type=asr", "tokenizer=tokenizer.gguf"}
+		} else {
+			modelConfig.KnownUsecaseStrings = []string{"tts"}
+			modelConfig.Options = []string{"tokenizer=tokenizer.gguf"}
+		}
+		data, err := yaml.Marshal(modelConfig)
+		if err != nil {
+			return gallery.ModelConfig{}, err
+		}
+		cfg.ConfigFile = string(data)
+		return cfg, nil
+	}
+
+	files := details.HuggingFace.Files
+	ttsFiles := filterByPrefix(files, "vibevoice-realtime-")
+	asrFiles := filterByPrefix(files, "vibevoice-asr-")
+
+	// Auto-pick role when the repo only ships one. Explicit usecase wins.
+	role := usecase
+	if role == "" {
+		switch {
+		case len(ttsFiles) > 0 && len(asrFiles) == 0:
+			role = "tts"
+		case len(asrFiles) > 0 && len(ttsFiles) == 0:
+			role = "asr"
+		default:
+			role = "tts" // default: realtime TTS is the smaller, more common case
+		}
+	}
+
+	// Layout under <models>/vibevoice-cpp/<name>/ — same pattern as whisper's
+	// nesting so multiple imports of the same upstream repo (with different
+	// quants) don't collide on disk. Options[] paths are emitted relative to
+	// opts.ModelPath, which the backend resolves against the LocalAI models
+	// root in govibevoicecpp.go:resolvePath.
+	relDir := filepath.Join("vibevoice-cpp", name)
+
+	var primary []hfapi.ModelFile
+	switch role {
+	case "asr", "transcript", "stt", "speech-to-text":
+		primary = asrFiles
+		modelConfig.KnownUsecaseStrings = []string{"transcript"}
+	default:
+		primary = ttsFiles
+		modelConfig.KnownUsecaseStrings = []string{"tts"}
+	}
+	// If the requested role has no matching files, fall back to any
+	// vibevoice-*.gguf so the import still produces something runnable.
+	if len(primary) == 0 {
+		primary = filterByPrefix(files, "vibevoice-")
+	}
+
+	chosen, ok := pickPreferredGGUFFile(primary, quants)
+	if !ok {
+		// Nothing to download. Emit the skeleton — same shape as the
+		// no-HF-metadata branch above, just with a sensible default name.
+		modelConfig.PredictionOptions = schema.PredictionOptions{
+			BasicModelRequest: schema.BasicModelRequest{Model: name + ".gguf"},
+		}
+		if role == "asr" {
+			modelConfig.Options = []string{"type=asr", "tokenizer=" + filepath.Join(relDir, "tokenizer.gguf")}
+		} else {
+			modelConfig.Options = []string{"tokenizer=" + filepath.Join(relDir, "tokenizer.gguf")}
+		}
+		data, err := yaml.Marshal(modelConfig)
+		if err != nil {
+			return gallery.ModelConfig{}, err
+		}
+		cfg.ConfigFile = string(data)
+		return cfg, nil
+	}
+
+	modelTarget := filepath.Join(relDir, filepath.Base(chosen.Path))
+	cfg.Files = append(cfg.Files, gallery.File{
+		URI:      chosen.URL,
+		Filename: modelTarget,
+		SHA256:   chosen.SHA256,
+	})
+	modelConfig.PredictionOptions = schema.PredictionOptions{
+		BasicModelRequest: schema.BasicModelRequest{Model: modelTarget},
+	}
+
+	// tokenizer.gguf is mandatory — Load() rejects without it. Always pull
+	// it when the repo provides one (every official vibevoice.cpp bundle does).
+	options := []string{}
+	if role == "asr" {
+		options = append(options, "type=asr")
+	}
+	if tok, ok := findFile(files, "tokenizer.gguf"); ok {
+		tokTarget := filepath.Join(relDir, "tokenizer.gguf")
+		cfg.Files = append(cfg.Files, gallery.File{
+			URI:      tok.URL,
+			Filename: tokTarget,
+			SHA256:   tok.SHA256,
+		})
+		options = append(options, "tokenizer="+tokTarget)
+	}
+
+	// For TTS, ship the first voice-*.gguf as a default — the backend needs
+	// a reference voice to clone from. ASR doesn't use voice prompts.
+	if role != "asr" {
+		if voice, ok := pickVoicePrompt(files, stringPref(preferencesMap, "voice")); ok {
+			voiceTarget := filepath.Join(relDir, filepath.Base(voice.Path))
+			cfg.Files = append(cfg.Files, gallery.File{
+				URI:      voice.URL,
+				Filename: voiceTarget,
+				SHA256:   voice.SHA256,
+			})
+			options = append(options, "voice="+voiceTarget)
+		}
+	}
+	modelConfig.Options = options
+
+	data, err := yaml.Marshal(modelConfig)
+	if err != nil {
+		return gallery.ModelConfig{}, err
+	}
+	cfg.ConfigFile = string(data)
+	return cfg, nil
+}
+
+// hasVibeVoiceGGUF reports whether the listing contains at least one file
+// whose basename looks like "vibevoice-*.gguf", compared case-insensitively.
+// The pattern is deliberately narrow: third-party mirrors that re-pack the
+// model under other filenames are not detected, and their users are expected
+// to force the importer with preferences.backend="vibevoice-cpp".
+func hasVibeVoiceGGUF(files []hfapi.ModelFile) bool {
+	for idx := range files {
+		base := strings.ToLower(filepath.Base(files[idx].Path))
+		if !strings.HasSuffix(base, ".gguf") {
+			continue
+		}
+		if strings.HasPrefix(base, "vibevoice-") {
+			return true
+		}
+	}
+	return false
+}
+
+// filterByPrefix returns, in listing order, every file whose basename starts
+// with prefix and ends in ".gguf". Both comparisons are case-insensitive so
+// the result agrees with hasVibeVoiceGGUF: a file that makes Match fire is
+// also found here when Import selects its downloads. Returns nil when
+// nothing matches.
+func filterByPrefix(files []hfapi.ModelFile, prefix string) []hfapi.ModelFile {
+	wanted := strings.ToLower(prefix)
+	var out []hfapi.ModelFile
+	for _, f := range files {
+		base := strings.ToLower(filepath.Base(f.Path))
+		if strings.HasPrefix(base, wanted) && strings.HasSuffix(base, ".gguf") {
+			out = append(out, f)
+		}
+	}
+	return out
+}
+
+// findFile is HasFile's lookup-returning sibling. Returns the first file
+// whose basename equals name (exact match), or false when none exists.
+func findFile(files []hfapi.ModelFile, name string) (hfapi.ModelFile, bool) {
+	for _, f := range files {
+		if filepath.Base(f.Path) == name {
+			return f, true
+		}
+	}
+	return hfapi.ModelFile{}, false
+}
+
+// pickPreferredGGUFFile chooses one file out of files according to prefs.
+// Preferences are tried in order; for each one (trimmed and lower-cased,
+// blanks skipped) the first file whose lower-cased basename contains it is
+// returned. If no preference matches, the last file in the listing is
+// returned so that a missing quant still yields a runnable import. An empty
+// listing yields the zero ModelFile and false.
+func pickPreferredGGUFFile(files []hfapi.ModelFile, prefs []string) (hfapi.ModelFile, bool) {
+	count := len(files)
+	if count == 0 {
+		return hfapi.ModelFile{}, false
+	}
+	for _, rawPref := range prefs {
+		token := strings.ToLower(strings.TrimSpace(rawPref))
+		if token == "" {
+			continue
+		}
+		for idx := range files {
+			base := strings.ToLower(filepath.Base(files[idx].Path))
+			if strings.Contains(base, token) {
+				return files[idx], true
+			}
+		}
+	}
+	return files[count-1], true
+}
+
+// pickVoicePrompt chooses the voice-*.gguf file to bundle with a TTS import.
+// A non-blank hint (preferences.voice) is matched as a case-insensitive
+// substring of the basename, e.g. "Emma" selects voice-en-Emma.gguf. Without
+// a hint, or when the hint matches nothing, the first voice file in listing
+// order is used so the choice is stable across imports of the same repo.
+// Returns false when the listing has no voice file at all.
+func pickVoicePrompt(files []hfapi.ModelFile, hint string) (hfapi.ModelFile, bool) {
+	wanted := strings.ToLower(strings.TrimSpace(hint))
+
+	var (
+		fallback     hfapi.ModelFile
+		haveFallback bool
+	)
+	for _, f := range files {
+		base := strings.ToLower(filepath.Base(f.Path))
+		if !strings.HasPrefix(base, "voice-") || !strings.HasSuffix(base, ".gguf") {
+			continue
+		}
+		// The first voice that satisfies the hint wins outright.
+		if wanted != "" && strings.Contains(base, wanted) {
+			return f, true
+		}
+		// Remember the first voice seen in case the hint never matches.
+		if !haveFallback {
+			fallback, haveFallback = f, true
+		}
+	}
+	return fallback, haveFallback
+}
+
+// repoNameOnly returns the repo basename — everything after the last "/" —
+// of the HuggingFace model ID, or the whole ID when it has no "/". It
+// returns "" when details carries no HuggingFace metadata; the URI is not
+// consulted here (Match parses it separately via HFOwnerRepoFromURI).
+func repoNameOnly(details Details) string {
+	if details.HuggingFace == nil {
+		return ""
+	}
+	id := details.HuggingFace.ModelID
+	// LastIndex yields -1 when there is no slash, so the slice below then
+	// starts at 0 and keeps the full ID.
+	return id[strings.LastIndex(id, "/")+1:]
+}
+
+// unmarshalPreferences decodes details.Preferences into a generic map. Returns
+// an empty map (never nil) on any failure so callers can index without nil
+// checks. Bad JSON is silently ignored — every importer here treats
+// preferences as best-effort hints.
+func unmarshalPreferences(raw json.RawMessage) map[string]any {
+	out := map[string]any{}
+	b, err := raw.MarshalJSON()
+	if err != nil || len(b) == 0 {
+		return out
+	}
+	_ = json.Unmarshal(b, &out)
+	return out
+}
+
+// stringPref looks up key in m and returns its value when that value is a
+// string. A missing key or a value of any other type yields "".
+func stringPref(m map[string]any, key string) string {
+	// A failed type assertion leaves s at its zero value, the empty string.
+	s, _ := m[key].(string)
+	return s
+}
--- a/core/gallery/importers/vibevoice-cpp_test.go
+++ b/core/gallery/importers/vibevoice-cpp_test.go
@@ -0,0 +1,261 @@
+package importers_test
+
+import (
+	"encoding/json"
+	"fmt"
+
+	"github.com/mudler/LocalAI/core/gallery/importers"
+	hfapi "github.com/mudler/LocalAI/pkg/huggingface-api"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Specs for VibeVoiceCppImporter. They cover the fixed importer metadata,
+// the preferences.backend override, import results for the
+// mudler/vibevoice.cpp-models repo (via DiscoverModelConfig and via
+// hand-built file listings), and a guard that a safetensors-only VibeVoice
+// repo is not matched.
+var _ = Describe("VibeVoiceCppImporter", func() {
+	Context("Importer interface metadata", func() {
+		It("exposes name/modality/autodetect", func() {
+			imp := &importers.VibeVoiceCppImporter{}
+			Expect(imp.Name()).To(Equal("vibevoice-cpp"))
+			Expect(imp.Modality()).To(Equal("tts"))
+			Expect(imp.AutoDetects()).To(BeTrue())
+		})
+	})
+
+	// The URIs below carry no "vibevoice" marker; the only vibevoice-cpp
+	// signal in these specs is the backend preference.
+	Context("preference override", func() {
+		It("honours preferences.backend=vibevoice-cpp for arbitrary URIs", func() {
+			uri := "https://example.com/some-unrelated-model"
+			preferences := json.RawMessage(`{"backend": "vibevoice-cpp"}`)
+
+			modelConfig, err := importers.DiscoverModelConfig(uri, preferences)
+
+			Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: vibevoice-cpp"))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("tokenizer=tokenizer.gguf"))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("tts"))
+		})
+
+		It("emits an ASR skeleton when usecase=asr is requested with no HF metadata", func() {
+			uri := "https://example.com/some-unrelated-model"
+			preferences := json.RawMessage(`{"backend": "vibevoice-cpp", "usecase": "asr"}`)
+
+			modelConfig, err := importers.DiscoverModelConfig(uri, preferences)
+
+			Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: vibevoice-cpp"))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("type=asr"))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("transcript"))
+		})
+	})
+
+	// Live HF call against the canonical bundle. Marked broad: it shouldn't
+	// be brittle to upstream adding more quants/voices — we only assert that
+	// the realtime TTS path was picked and the tokenizer was bundled.
+	Context("detection from HuggingFace: mudler/vibevoice.cpp-models", func() {
+		const uri = "https://huggingface.co/mudler/vibevoice.cpp-models"
+
+		It("routes to vibevoice-cpp, picks the realtime TTS GGUF and bundles tokenizer + voice prompt", func() {
+			modelConfig, err := importers.DiscoverModelConfig(uri, json.RawMessage(`{}`))
+
+			Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: vibevoice-cpp"))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("tts"))
+
+			// Primary model must be the realtime variant (TTS default).
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("vibevoice-realtime-"))
+
+			// Tokenizer is mandatory and must show up both as a downloaded
+			// file and as a tokenizer= option entry. The path is rooted
+			// under vibevoice-cpp/<name>/ so multiple imports don't collide.
+			var sawTokenizerFile, sawModelFile, sawVoiceFile bool
+			for _, f := range modelConfig.Files {
+				if f.Filename == "" {
+					continue
+				}
+				if filepathBase(f.Filename) == "tokenizer.gguf" {
+					sawTokenizerFile = true
+				}
+				if startsWith(filepathBase(f.Filename), "vibevoice-realtime-") {
+					sawModelFile = true
+				}
+				if startsWith(filepathBase(f.Filename), "voice-") {
+					sawVoiceFile = true
+				}
+			}
+			Expect(sawTokenizerFile).To(BeTrue(), fmt.Sprintf("expected tokenizer.gguf in Files, got: %+v", modelConfig.Files))
+			Expect(sawModelFile).To(BeTrue(), fmt.Sprintf("expected a vibevoice-realtime-*.gguf in Files, got: %+v", modelConfig.Files))
+			Expect(sawVoiceFile).To(BeTrue(), fmt.Sprintf("expected a voice-*.gguf in Files, got: %+v", modelConfig.Files))
+
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("tokenizer=vibevoice-cpp/"))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("voice=vibevoice-cpp/"))
+		})
+
+		It("routes to ASR + diarization when preferences.usecase=asr", func() {
+			modelConfig, err := importers.DiscoverModelConfig(uri, json.RawMessage(`{"usecase":"asr"}`))
+
+			Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: vibevoice-cpp"))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("transcript"))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("type=asr"))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("vibevoice-asr-"))
+			// ASR must NOT bundle a voice prompt — the backend ignores it
+			// for transcription and we don't want gratuitous downloads.
+			Expect(modelConfig.ConfigFile).ToNot(ContainSubstring("voice="))
+		})
+	})
+
+	// Offline fixtures — assert the end-to-end shape of what the importer
+	// emits without depending on HF availability or upstream file lists.
+	Context("Import from HuggingFace file listing (offline)", func() {
+		const repoBase = "https://huggingface.co/mudler/vibevoice.cpp-models/resolve/main/"
+
+		// hfFile builds a listing entry whose URL is repoBase + path.
+		hfFile := func(path, sha string) hfapi.ModelFile {
+			return hfapi.ModelFile{
+				Path:   path,
+				SHA256: sha,
+				URL:    repoBase + path,
+			}
+		}
+
+		// withHF builds Details for mudler/vibevoice.cpp-models with the
+		// given file listing; Preferences is set only when the JSON string
+		// is non-empty.
+		withHF := func(preferences string, files ...hfapi.ModelFile) importers.Details {
+			d := importers.Details{
+				URI: "https://huggingface.co/mudler/vibevoice.cpp-models",
+				HuggingFace: &hfapi.ModelDetails{
+					ModelID: "mudler/vibevoice.cpp-models",
+					Files:   files,
+				},
+			}
+			if preferences != "" {
+				d.Preferences = json.RawMessage(preferences)
+			}
+			return d
+		}
+
+		It("defaults to TTS realtime + tokenizer + first voice, nested under vibevoice-cpp/<name>/", func() {
+			imp := &importers.VibeVoiceCppImporter{}
+			details := withHF(`{"name":"vibe"}`,
+				hfFile("vibevoice-realtime-0.5B-q8_0.gguf", "aaa"),
+				hfFile("vibevoice-asr-q4_k.gguf", "bbb"),
+				hfFile("tokenizer.gguf", "ccc"),
+				hfFile("voice-en-Carter_man.gguf", "ddd"),
+				hfFile("voice-en-Emma.gguf", "eee"),
+				hfFile("README.md", ""),
+			)
+
+			modelConfig, err := imp.Import(details)
+			Expect(err).ToNot(HaveOccurred())
+
+			// Three downloads: model, tokenizer and one voice. Carter_man
+			// is listed before Emma, so it is the expected default voice.
+			Expect(modelConfig.Files).To(HaveLen(3))
+			byName := map[string]string{}
+			for _, f := range modelConfig.Files {
+				byName[filepathBase(f.Filename)] = f.Filename
+			}
+			Expect(byName).To(HaveKey("vibevoice-realtime-0.5B-q8_0.gguf"))
+			Expect(byName).To(HaveKey("tokenizer.gguf"))
+			Expect(byName).To(HaveKey("voice-en-Carter_man.gguf"))
+			Expect(byName["tokenizer.gguf"]).To(Equal("vibevoice-cpp/vibe/tokenizer.gguf"))
+
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: vibevoice-cpp"))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("model: vibevoice-cpp/vibe/vibevoice-realtime-0.5B-q8_0.gguf"))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("- tokenizer=vibevoice-cpp/vibe/tokenizer.gguf"))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("- voice=vibevoice-cpp/vibe/voice-en-Carter_man.gguf"))
+			Expect(modelConfig.ConfigFile).ToNot(ContainSubstring("type=asr"))
+		})
+
+		It("routes to ASR when preferences.usecase=asr and skips voice prompts", func() {
+			imp := &importers.VibeVoiceCppImporter{}
+			details := withHF(`{"name":"vibe-asr","usecase":"asr"}`,
+				hfFile("vibevoice-realtime-0.5B-q8_0.gguf", "aaa"),
+				hfFile("vibevoice-asr-q4_k.gguf", "bbb"),
+				hfFile("vibevoice-asr-q8_0.gguf", "fff"),
+				hfFile("tokenizer.gguf", "ccc"),
+				hfFile("voice-en-Emma.gguf", "ddd"),
+			)
+
+			modelConfig, err := imp.Import(details)
+			Expect(err).ToNot(HaveOccurred())
+
+			// Two downloads only: the ASR model and the tokenizer.
+			Expect(modelConfig.Files).To(HaveLen(2))
+			byName := map[string]string{}
+			for _, f := range modelConfig.Files {
+				byName[filepathBase(f.Filename)] = f.Filename
+			}
+			// Default quant order picks q8_0 over q4_k.
+			Expect(byName).To(HaveKey("vibevoice-asr-q8_0.gguf"))
+			Expect(byName).To(HaveKey("tokenizer.gguf"))
+
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("model: vibevoice-cpp/vibe-asr/vibevoice-asr-q8_0.gguf"))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("- type=asr"))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("- tokenizer=vibevoice-cpp/vibe-asr/tokenizer.gguf"))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("transcript"))
+			Expect(modelConfig.ConfigFile).ToNot(ContainSubstring("voice="))
+		})
+
+		It("honours preferences.quantizations to pick a specific quant", func() {
+			imp := &importers.VibeVoiceCppImporter{}
+			details := withHF(`{"name":"vibe","quantizations":"q4_k"}`,
+				hfFile("vibevoice-asr-q4_k.gguf", "aaa"),
+				hfFile("vibevoice-asr-q8_0.gguf", "bbb"),
+				hfFile("tokenizer.gguf", "ccc"),
+			)
+
+			modelConfig, err := imp.Import(details)
+			Expect(err).ToNot(HaveOccurred())
+
+			// Repo only ships ASR — auto-routes to asr, picks the requested
+			// quant, emits type=asr automatically.
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("model: vibevoice-cpp/vibe/vibevoice-asr-q4_k.gguf"))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("- type=asr"))
+		})
+
+		It("honours preferences.voice to pick a specific voice prompt", func() {
+			imp := &importers.VibeVoiceCppImporter{}
+			details := withHF(`{"name":"vibe","voice":"Emma"}`,
+				hfFile("vibevoice-realtime-0.5B-q8_0.gguf", "aaa"),
+				hfFile("tokenizer.gguf", "bbb"),
+				hfFile("voice-en-Carter_man.gguf", "ccc"),
+				hfFile("voice-en-Emma.gguf", "ddd"),
+			)
+
+			modelConfig, err := imp.Import(details)
+			Expect(err).ToNot(HaveOccurred())
+
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("- voice=vibevoice-cpp/vibe/voice-en-Emma.gguf"))
+			Expect(modelConfig.ConfigFile).ToNot(ContainSubstring("voice-en-Carter_man"))
+		})
+	})
+
+	// Make sure we don't regress the existing Python-backend importer for
+	// repos that don't carry the C++ port's signal (e.g. microsoft/VibeVoice-1.5B).
+	Context("non-cpp vibevoice repos still route to the Python importer", func() {
+		It("does not claim microsoft/VibeVoice-1.5B (no GGUF / no .cpp suffix)", func() {
+			imp := &importers.VibeVoiceCppImporter{}
+			details := importers.Details{
+				URI: "https://huggingface.co/microsoft/VibeVoice-1.5B",
+				HuggingFace: &hfapi.ModelDetails{
+					ModelID: "microsoft/VibeVoice-1.5B",
+					Files: []hfapi.ModelFile{
+						{Path: "config.json"},
+						{Path: "model.safetensors"},
+					},
+				},
+				Preferences: json.RawMessage(`{}`),
+			}
+			Expect(imp.Match(details)).To(BeFalse())
+		})
+	})
+})
+
+// filepathBase returns the part of p after its last '/', or p unchanged
+// when it contains none. It is hand-rolled, like startsWith, so the
+// expected-shape assertions in this test file don't need path/filepath or
+// strings.
+func filepathBase(p string) string {
+	idx := len(p) - 1
+	for idx >= 0 && p[idx] != '/' {
+		idx--
+	}
+	// idx is -1 when no slash was found, so the slice then covers all of p.
+	return p[idx+1:]
+}
+
+// startsWith reports whether s begins with prefix. An empty prefix matches
+// every string.
+func startsWith(s, prefix string) bool {
+	n := len(prefix)
+	if len(s) < n {
+		return false
+	}
+	return s[:n] == prefix
+}