mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-16 20:52:08 -04:00
feat(importers): add vibevoice-cpp importer for GGUF bundles (#9685)
Routes mudler/vibevoice.cpp-models and similar repos to the vibevoice-cpp
backend. Detects via repo name ("vibevoice.cpp"/"vibevoice-cpp"), file
listing (vibevoice-*.gguf + tokenizer.gguf), or preferences.backend
override. Defaults to the realtime TTS model; preferences.usecase=asr
selects the ASR/diarization variant. Bundles the required tokenizer.gguf
and (for TTS) a voice prompt, emitting the Options[] entries the backend
expects. Registered ahead of VibeVoiceImporter so the C++ bundles aren't
swallowed by the older Python-backend substring match.
Assisted-by: claude-code:claude-opus-4-7 [Read] [Edit] [Write] [Bash]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
@@ -125,6 +125,10 @@ var defaultImporters = []Importer{
|
||||
&KittenTTSImporter{},
|
||||
&NeuTTSImporter{},
|
||||
&ChatterboxImporter{},
|
||||
// VibeVoiceCppImporter must precede VibeVoiceImporter — the older
|
||||
// Python-backend importer matches any repo name containing "vibevoice"
|
||||
// and would otherwise swallow the C++ port's GGUF bundles.
|
||||
&VibeVoiceCppImporter{},
|
||||
&VibeVoiceImporter{},
|
||||
&CoquiImporter{},
|
||||
// Image/Video (Batch 3)
|
||||
|
||||
355
core/gallery/importers/vibevoice-cpp.go
Normal file
355
core/gallery/importers/vibevoice-cpp.go
Normal file
@@ -0,0 +1,355 @@
|
||||
package importers
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/gallery"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
hfapi "github.com/mudler/LocalAI/pkg/huggingface-api"
|
||||
"go.yaml.in/yaml/v2"
|
||||
)
|
||||
|
||||
// Compile-time assertion that *VibeVoiceCppImporter implements Importer.
var _ Importer = (*VibeVoiceCppImporter)(nil)
|
||||
|
||||
// VibeVoiceCppImporter imports the GGUF bundles used by the vibevoice-cpp
// backend: a primary model (vibevoice-realtime-*.gguf for TTS or
// vibevoice-asr-*.gguf for ASR), the sibling tokenizer.gguf and, for TTS, an
// optional voice-*.gguf prompt.
//
// Match claims a model when preferences.backend is "vibevoice-cpp", when the
// repo name contains "vibevoice.cpp" or "vibevoice-cpp", or when the file
// listing holds both tokenizer.gguf and a vibevoice-*.gguf file.
//
// Import defaults to the TTS role; preferences.usecase="asr" selects ASR, and
// a repo that ships only one of the two roles gets that role when no usecase
// is given.
//
// Registration order matters: this importer has to be listed before
// VibeVoiceImporter, because that older Python-backend importer matches any
// repo name containing "vibevoice" and would claim the C++ bundles first.
type VibeVoiceCppImporter struct{}

// Name returns the importer identifier, "vibevoice-cpp".
func (i *VibeVoiceCppImporter) Name() string {
	return "vibevoice-cpp"
}

// Modality returns the modality this importer advertises, "tts".
func (i *VibeVoiceCppImporter) Modality() string {
	return "tts"
}

// AutoDetects reports that this importer takes part in auto-detection.
func (i *VibeVoiceCppImporter) AutoDetects() bool {
	return true
}
|
||||
|
||||
func (i *VibeVoiceCppImporter) Match(details Details) bool {
|
||||
preferencesMap := unmarshalPreferences(details.Preferences)
|
||||
if b, ok := preferencesMap["backend"].(string); ok && b == "vibevoice-cpp" {
|
||||
return true
|
||||
}
|
||||
|
||||
// Repo-name signal: anything carrying "vibevoice.cpp" or "vibevoice-cpp"
|
||||
// — the canonical naming for the C++ port bundles.
|
||||
repoSignals := []string{strings.ToLower(repoNameOnly(details))}
|
||||
if _, repo, ok := HFOwnerRepoFromURI(details.URI); ok {
|
||||
repoSignals = append(repoSignals, strings.ToLower(repo))
|
||||
}
|
||||
for _, s := range repoSignals {
|
||||
if strings.Contains(s, "vibevoice.cpp") || strings.Contains(s, "vibevoice-cpp") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// File-listing signal: a vibevoice-*.gguf primary + tokenizer.gguf is
|
||||
// only what the C++ backend ships — the Python VibeVoice fork distributes
|
||||
// safetensors, never GGUF.
|
||||
if details.HuggingFace != nil &&
|
||||
HasFile(details.HuggingFace.Files, "tokenizer.gguf") &&
|
||||
hasVibeVoiceGGUF(details.HuggingFace.Files) {
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func (i *VibeVoiceCppImporter) Import(details Details) (gallery.ModelConfig, error) {
|
||||
preferencesMap := unmarshalPreferences(details.Preferences)
|
||||
|
||||
name, ok := preferencesMap["name"].(string)
|
||||
if !ok {
|
||||
name = filepath.Base(details.URI)
|
||||
}
|
||||
|
||||
description, ok := preferencesMap["description"].(string)
|
||||
if !ok {
|
||||
description = "Imported from " + details.URI
|
||||
}
|
||||
|
||||
// Quant preference — default order matches what mudler/vibevoice.cpp-models
|
||||
// ships today. Same comma-separated convention as whisper / llama-cpp.
|
||||
quants := []string{"q8_0", "q4_k", "q5_k", "q4_0"}
|
||||
if preferred, ok := preferencesMap["quantizations"].(string); ok && preferred != "" {
|
||||
quants = strings.Split(preferred, ",")
|
||||
}
|
||||
|
||||
usecase := strings.ToLower(stringPref(preferencesMap, "usecase"))
|
||||
|
||||
cfg := gallery.ModelConfig{
|
||||
Name: name,
|
||||
Description: description,
|
||||
}
|
||||
|
||||
modelConfig := config.ModelConfig{
|
||||
Name: name,
|
||||
Description: description,
|
||||
Backend: "vibevoice-cpp",
|
||||
}
|
||||
|
||||
// Without HF metadata we can only emit a skeleton config — the user must
|
||||
// edit it post-import to point at real files. Mirrors whisper's bare-URI
|
||||
// fallback so preference-only invocations still produce something usable.
|
||||
if details.HuggingFace == nil {
|
||||
modelConfig.PredictionOptions = schema.PredictionOptions{
|
||||
BasicModelRequest: schema.BasicModelRequest{Model: filepath.Base(details.URI)},
|
||||
}
|
||||
if usecase == "asr" {
|
||||
modelConfig.KnownUsecaseStrings = []string{"transcript"}
|
||||
modelConfig.Options = []string{"type=asr", "tokenizer=tokenizer.gguf"}
|
||||
} else {
|
||||
modelConfig.KnownUsecaseStrings = []string{"tts"}
|
||||
modelConfig.Options = []string{"tokenizer=tokenizer.gguf"}
|
||||
}
|
||||
data, err := yaml.Marshal(modelConfig)
|
||||
if err != nil {
|
||||
return gallery.ModelConfig{}, err
|
||||
}
|
||||
cfg.ConfigFile = string(data)
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
files := details.HuggingFace.Files
|
||||
ttsFiles := filterByPrefix(files, "vibevoice-realtime-")
|
||||
asrFiles := filterByPrefix(files, "vibevoice-asr-")
|
||||
|
||||
// Auto-pick role when the repo only ships one. Explicit usecase wins.
|
||||
role := usecase
|
||||
if role == "" {
|
||||
switch {
|
||||
case len(ttsFiles) > 0 && len(asrFiles) == 0:
|
||||
role = "tts"
|
||||
case len(asrFiles) > 0 && len(ttsFiles) == 0:
|
||||
role = "asr"
|
||||
default:
|
||||
role = "tts" // default: realtime TTS is the smaller, more common case
|
||||
}
|
||||
}
|
||||
|
||||
// Layout under <models>/vibevoice-cpp/<name>/ — same pattern as whisper's
|
||||
// nesting so multiple imports of the same upstream repo (with different
|
||||
// quants) don't collide on disk. Options[] paths are emitted relative to
|
||||
// opts.ModelPath, which the backend resolves against the LocalAI models
|
||||
// root in govibevoicecpp.go:resolvePath.
|
||||
relDir := filepath.Join("vibevoice-cpp", name)
|
||||
|
||||
var primary []hfapi.ModelFile
|
||||
switch role {
|
||||
case "asr", "transcript", "stt", "speech-to-text":
|
||||
primary = asrFiles
|
||||
modelConfig.KnownUsecaseStrings = []string{"transcript"}
|
||||
default:
|
||||
primary = ttsFiles
|
||||
modelConfig.KnownUsecaseStrings = []string{"tts"}
|
||||
}
|
||||
// If the requested role has no matching files, fall back to any
|
||||
// vibevoice-*.gguf so the import still produces something runnable.
|
||||
if len(primary) == 0 {
|
||||
primary = filterByPrefix(files, "vibevoice-")
|
||||
}
|
||||
|
||||
chosen, ok := pickPreferredGGUFFile(primary, quants)
|
||||
if !ok {
|
||||
// Nothing to download. Emit the skeleton — same shape as the
|
||||
// no-HF-metadata branch above, just with a sensible default name.
|
||||
modelConfig.PredictionOptions = schema.PredictionOptions{
|
||||
BasicModelRequest: schema.BasicModelRequest{Model: name + ".gguf"},
|
||||
}
|
||||
if role == "asr" {
|
||||
modelConfig.Options = []string{"type=asr", "tokenizer=" + filepath.Join(relDir, "tokenizer.gguf")}
|
||||
} else {
|
||||
modelConfig.Options = []string{"tokenizer=" + filepath.Join(relDir, "tokenizer.gguf")}
|
||||
}
|
||||
data, err := yaml.Marshal(modelConfig)
|
||||
if err != nil {
|
||||
return gallery.ModelConfig{}, err
|
||||
}
|
||||
cfg.ConfigFile = string(data)
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
modelTarget := filepath.Join(relDir, filepath.Base(chosen.Path))
|
||||
cfg.Files = append(cfg.Files, gallery.File{
|
||||
URI: chosen.URL,
|
||||
Filename: modelTarget,
|
||||
SHA256: chosen.SHA256,
|
||||
})
|
||||
modelConfig.PredictionOptions = schema.PredictionOptions{
|
||||
BasicModelRequest: schema.BasicModelRequest{Model: modelTarget},
|
||||
}
|
||||
|
||||
// tokenizer.gguf is mandatory — Load() rejects without it. Always pull
|
||||
// it when the repo provides one (every official vibevoice.cpp bundle does).
|
||||
options := []string{}
|
||||
if role == "asr" {
|
||||
options = append(options, "type=asr")
|
||||
}
|
||||
if tok, ok := findFile(files, "tokenizer.gguf"); ok {
|
||||
tokTarget := filepath.Join(relDir, "tokenizer.gguf")
|
||||
cfg.Files = append(cfg.Files, gallery.File{
|
||||
URI: tok.URL,
|
||||
Filename: tokTarget,
|
||||
SHA256: tok.SHA256,
|
||||
})
|
||||
options = append(options, "tokenizer="+tokTarget)
|
||||
}
|
||||
|
||||
// For TTS, ship the first voice-*.gguf as a default — the backend needs
|
||||
// a reference voice to clone from. ASR doesn't use voice prompts.
|
||||
if role != "asr" {
|
||||
if voice, ok := pickVoicePrompt(files, stringPref(preferencesMap, "voice")); ok {
|
||||
voiceTarget := filepath.Join(relDir, filepath.Base(voice.Path))
|
||||
cfg.Files = append(cfg.Files, gallery.File{
|
||||
URI: voice.URL,
|
||||
Filename: voiceTarget,
|
||||
SHA256: voice.SHA256,
|
||||
})
|
||||
options = append(options, "voice="+voiceTarget)
|
||||
}
|
||||
}
|
||||
modelConfig.Options = options
|
||||
|
||||
data, err := yaml.Marshal(modelConfig)
|
||||
if err != nil {
|
||||
return gallery.ModelConfig{}, err
|
||||
}
|
||||
cfg.ConfigFile = string(data)
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
// hasVibeVoiceGGUF returns true when any file matches "vibevoice-*.gguf"
|
||||
// (case-insensitive). Narrow on purpose — third-party GGUF mirrors that
|
||||
// re-pack the model under different filenames will be missed, but those
|
||||
// users can pass preferences.backend="vibevoice-cpp" to force the importer.
|
||||
func hasVibeVoiceGGUF(files []hfapi.ModelFile) bool {
|
||||
for _, f := range files {
|
||||
name := strings.ToLower(filepath.Base(f.Path))
|
||||
if strings.HasPrefix(name, "vibevoice-") && strings.HasSuffix(name, ".gguf") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// filterByPrefix returns every file whose basename starts with prefix and
|
||||
// ends in .gguf (case-insensitive on the suffix, exact on the prefix).
|
||||
func filterByPrefix(files []hfapi.ModelFile, prefix string) []hfapi.ModelFile {
|
||||
var out []hfapi.ModelFile
|
||||
for _, f := range files {
|
||||
base := filepath.Base(f.Path)
|
||||
if !strings.HasPrefix(base, prefix) {
|
||||
continue
|
||||
}
|
||||
if !strings.HasSuffix(strings.ToLower(base), ".gguf") {
|
||||
continue
|
||||
}
|
||||
out = append(out, f)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// findFile is HasFile's lookup-returning sibling. Returns the first file
|
||||
// whose basename equals name (exact match), or false when none exists.
|
||||
func findFile(files []hfapi.ModelFile, name string) (hfapi.ModelFile, bool) {
|
||||
for _, f := range files {
|
||||
if filepath.Base(f.Path) == name {
|
||||
return f, true
|
||||
}
|
||||
}
|
||||
return hfapi.ModelFile{}, false
|
||||
}
|
||||
|
||||
// pickPreferredGGUFFile mirrors pickPreferredGGMLFile but operates on .gguf
|
||||
// files: walks prefs in order, returns the first file whose basename contains
|
||||
// any preference token (case-insensitive). On no match, falls back to the
|
||||
// last file so a missing quant still yields a runnable import.
|
||||
func pickPreferredGGUFFile(files []hfapi.ModelFile, prefs []string) (hfapi.ModelFile, bool) {
|
||||
if len(files) == 0 {
|
||||
return hfapi.ModelFile{}, false
|
||||
}
|
||||
for _, pref := range prefs {
|
||||
lower := strings.ToLower(strings.TrimSpace(pref))
|
||||
if lower == "" {
|
||||
continue
|
||||
}
|
||||
for _, f := range files {
|
||||
if strings.Contains(strings.ToLower(filepath.Base(f.Path)), lower) {
|
||||
return f, true
|
||||
}
|
||||
}
|
||||
}
|
||||
return files[len(files)-1], true
|
||||
}
|
||||
|
||||
// pickVoicePrompt selects a voice-*.gguf to bundle with a TTS import.
|
||||
// Honours an explicit preferences.voice substring (e.g. "Emma" picks
|
||||
// voice-en-Emma.gguf); otherwise returns the first voice file in listing
|
||||
// order so the choice is stable across imports of the same repo.
|
||||
func pickVoicePrompt(files []hfapi.ModelFile, hint string) (hfapi.ModelFile, bool) {
|
||||
hint = strings.ToLower(strings.TrimSpace(hint))
|
||||
var voices []hfapi.ModelFile
|
||||
for _, f := range files {
|
||||
base := strings.ToLower(filepath.Base(f.Path))
|
||||
if strings.HasPrefix(base, "voice-") && strings.HasSuffix(base, ".gguf") {
|
||||
voices = append(voices, f)
|
||||
}
|
||||
}
|
||||
if len(voices) == 0 {
|
||||
return hfapi.ModelFile{}, false
|
||||
}
|
||||
if hint != "" {
|
||||
for _, v := range voices {
|
||||
if strings.Contains(strings.ToLower(filepath.Base(v.Path)), hint) {
|
||||
return v, true
|
||||
}
|
||||
}
|
||||
}
|
||||
return voices[0], true
|
||||
}
|
||||
|
||||
// repoNameOnly extracts the repo basename (everything after the last "/")
|
||||
// from HF metadata or, failing that, the URI. Empty when neither is set.
|
||||
func repoNameOnly(details Details) string {
|
||||
if details.HuggingFace != nil {
|
||||
id := details.HuggingFace.ModelID
|
||||
if idx := strings.Index(id, "/"); idx >= 0 {
|
||||
return id[idx+1:]
|
||||
}
|
||||
return id
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// unmarshalPreferences decodes raw into a generic map. The result is always a
// non-nil map: empty or nil input, a JSON null, a non-object document and
// malformed JSON all yield an empty map. Decode errors are deliberately not
// reported, because preferences are treated as best-effort hints.
func unmarshalPreferences(raw json.RawMessage) map[string]any {
	if len(raw) == 0 {
		return map[string]any{}
	}

	var decoded map[string]any
	// A JSON null decodes without error but leaves the map nil, so the nil
	// check is needed in addition to the error check.
	if err := json.Unmarshal(raw, &decoded); err != nil || decoded == nil {
		return map[string]any{}
	}
	return decoded
}
|
||||
|
||||
// stringPref looks up key in m and returns its value when that value is a
// string. A missing key, a nil map or a value of any other type yields "".
func stringPref(m map[string]any, key string) string {
	// A failed assertion leaves s at its zero value, the empty string.
	s, _ := m[key].(string)
	return s
}
|
||||
261
core/gallery/importers/vibevoice-cpp_test.go
Normal file
261
core/gallery/importers/vibevoice-cpp_test.go
Normal file
@@ -0,0 +1,261 @@
|
||||
package importers_test
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
|
||||
"github.com/mudler/LocalAI/core/gallery/importers"
|
||||
hfapi "github.com/mudler/LocalAI/pkg/huggingface-api"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("VibeVoiceCppImporter", func() {
|
||||
Context("Importer interface metadata", func() {
|
||||
It("exposes name/modality/autodetect", func() {
|
||||
imp := &importers.VibeVoiceCppImporter{}
|
||||
Expect(imp.Name()).To(Equal("vibevoice-cpp"))
|
||||
Expect(imp.Modality()).To(Equal("tts"))
|
||||
Expect(imp.AutoDetects()).To(BeTrue())
|
||||
})
|
||||
})
|
||||
|
||||
Context("preference override", func() {
|
||||
It("honours preferences.backend=vibevoice-cpp for arbitrary URIs", func() {
|
||||
uri := "https://example.com/some-unrelated-model"
|
||||
preferences := json.RawMessage(`{"backend": "vibevoice-cpp"}`)
|
||||
|
||||
modelConfig, err := importers.DiscoverModelConfig(uri, preferences)
|
||||
|
||||
Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: vibevoice-cpp"))
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("tokenizer=tokenizer.gguf"))
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("tts"))
|
||||
})
|
||||
|
||||
It("emits an ASR skeleton when usecase=asr is requested with no HF metadata", func() {
|
||||
uri := "https://example.com/some-unrelated-model"
|
||||
preferences := json.RawMessage(`{"backend": "vibevoice-cpp", "usecase": "asr"}`)
|
||||
|
||||
modelConfig, err := importers.DiscoverModelConfig(uri, preferences)
|
||||
|
||||
Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: vibevoice-cpp"))
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("type=asr"))
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("transcript"))
|
||||
})
|
||||
})
|
||||
|
||||
// Live HF call against the canonical bundle. Marked broad: it shouldn't
|
||||
// be brittle to upstream adding more quants/voices — we only assert that
|
||||
// the realtime TTS path was picked and the tokenizer was bundled.
|
||||
Context("detection from HuggingFace: mudler/vibevoice.cpp-models", func() {
|
||||
const uri = "https://huggingface.co/mudler/vibevoice.cpp-models"
|
||||
|
||||
It("routes to vibevoice-cpp, picks the realtime TTS GGUF and bundles tokenizer + voice prompt", func() {
|
||||
modelConfig, err := importers.DiscoverModelConfig(uri, json.RawMessage(`{}`))
|
||||
|
||||
Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: vibevoice-cpp"))
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("tts"))
|
||||
|
||||
// Primary model must be the realtime variant (TTS default).
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("vibevoice-realtime-"))
|
||||
|
||||
// Tokenizer is mandatory and must show up both as a downloaded
|
||||
// file and as a tokenizer= option entry. The path is rooted
|
||||
// under vibevoice-cpp/<name>/ so multiple imports don't collide.
|
||||
var sawTokenizerFile, sawModelFile, sawVoiceFile bool
|
||||
for _, f := range modelConfig.Files {
|
||||
if f.Filename == "" {
|
||||
continue
|
||||
}
|
||||
if filepathBase(f.Filename) == "tokenizer.gguf" {
|
||||
sawTokenizerFile = true
|
||||
}
|
||||
if startsWith(filepathBase(f.Filename), "vibevoice-realtime-") {
|
||||
sawModelFile = true
|
||||
}
|
||||
if startsWith(filepathBase(f.Filename), "voice-") {
|
||||
sawVoiceFile = true
|
||||
}
|
||||
}
|
||||
Expect(sawTokenizerFile).To(BeTrue(), fmt.Sprintf("expected tokenizer.gguf in Files, got: %+v", modelConfig.Files))
|
||||
Expect(sawModelFile).To(BeTrue(), fmt.Sprintf("expected a vibevoice-realtime-*.gguf in Files, got: %+v", modelConfig.Files))
|
||||
Expect(sawVoiceFile).To(BeTrue(), fmt.Sprintf("expected a voice-*.gguf in Files, got: %+v", modelConfig.Files))
|
||||
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("tokenizer=vibevoice-cpp/"))
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("voice=vibevoice-cpp/"))
|
||||
})
|
||||
|
||||
It("routes to ASR + diarization when preferences.usecase=asr", func() {
|
||||
modelConfig, err := importers.DiscoverModelConfig(uri, json.RawMessage(`{"usecase":"asr"}`))
|
||||
|
||||
Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: vibevoice-cpp"))
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("transcript"))
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("type=asr"))
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("vibevoice-asr-"))
|
||||
// ASR must NOT bundle a voice prompt — the backend ignores it
|
||||
// for transcription and we don't want gratuitous downloads.
|
||||
Expect(modelConfig.ConfigFile).ToNot(ContainSubstring("voice="))
|
||||
})
|
||||
})
|
||||
|
||||
// Offline fixtures — assert the end-to-end shape of what the importer
|
||||
// emits without depending on HF availability or upstream file lists.
|
||||
Context("Import from HuggingFace file listing (offline)", func() {
|
||||
const repoBase = "https://huggingface.co/mudler/vibevoice.cpp-models/resolve/main/"
|
||||
|
||||
hfFile := func(path, sha string) hfapi.ModelFile {
|
||||
return hfapi.ModelFile{
|
||||
Path: path,
|
||||
SHA256: sha,
|
||||
URL: repoBase + path,
|
||||
}
|
||||
}
|
||||
|
||||
withHF := func(preferences string, files ...hfapi.ModelFile) importers.Details {
|
||||
d := importers.Details{
|
||||
URI: "https://huggingface.co/mudler/vibevoice.cpp-models",
|
||||
HuggingFace: &hfapi.ModelDetails{
|
||||
ModelID: "mudler/vibevoice.cpp-models",
|
||||
Files: files,
|
||||
},
|
||||
}
|
||||
if preferences != "" {
|
||||
d.Preferences = json.RawMessage(preferences)
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
It("defaults to TTS realtime + tokenizer + first voice, nested under vibevoice-cpp/<name>/", func() {
|
||||
imp := &importers.VibeVoiceCppImporter{}
|
||||
details := withHF(`{"name":"vibe"}`,
|
||||
hfFile("vibevoice-realtime-0.5B-q8_0.gguf", "aaa"),
|
||||
hfFile("vibevoice-asr-q4_k.gguf", "bbb"),
|
||||
hfFile("tokenizer.gguf", "ccc"),
|
||||
hfFile("voice-en-Carter_man.gguf", "ddd"),
|
||||
hfFile("voice-en-Emma.gguf", "eee"),
|
||||
hfFile("README.md", ""),
|
||||
)
|
||||
|
||||
modelConfig, err := imp.Import(details)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
Expect(modelConfig.Files).To(HaveLen(3))
|
||||
byName := map[string]string{}
|
||||
for _, f := range modelConfig.Files {
|
||||
byName[filepathBase(f.Filename)] = f.Filename
|
||||
}
|
||||
Expect(byName).To(HaveKey("vibevoice-realtime-0.5B-q8_0.gguf"))
|
||||
Expect(byName).To(HaveKey("tokenizer.gguf"))
|
||||
Expect(byName).To(HaveKey("voice-en-Carter_man.gguf"))
|
||||
Expect(byName["tokenizer.gguf"]).To(Equal("vibevoice-cpp/vibe/tokenizer.gguf"))
|
||||
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: vibevoice-cpp"))
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("model: vibevoice-cpp/vibe/vibevoice-realtime-0.5B-q8_0.gguf"))
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("- tokenizer=vibevoice-cpp/vibe/tokenizer.gguf"))
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("- voice=vibevoice-cpp/vibe/voice-en-Carter_man.gguf"))
|
||||
Expect(modelConfig.ConfigFile).ToNot(ContainSubstring("type=asr"))
|
||||
})
|
||||
|
||||
It("routes to ASR when preferences.usecase=asr and skips voice prompts", func() {
|
||||
imp := &importers.VibeVoiceCppImporter{}
|
||||
details := withHF(`{"name":"vibe-asr","usecase":"asr"}`,
|
||||
hfFile("vibevoice-realtime-0.5B-q8_0.gguf", "aaa"),
|
||||
hfFile("vibevoice-asr-q4_k.gguf", "bbb"),
|
||||
hfFile("vibevoice-asr-q8_0.gguf", "fff"),
|
||||
hfFile("tokenizer.gguf", "ccc"),
|
||||
hfFile("voice-en-Emma.gguf", "ddd"),
|
||||
)
|
||||
|
||||
modelConfig, err := imp.Import(details)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
Expect(modelConfig.Files).To(HaveLen(2))
|
||||
byName := map[string]string{}
|
||||
for _, f := range modelConfig.Files {
|
||||
byName[filepathBase(f.Filename)] = f.Filename
|
||||
}
|
||||
// Default quant order picks q8_0 over q4_k.
|
||||
Expect(byName).To(HaveKey("vibevoice-asr-q8_0.gguf"))
|
||||
Expect(byName).To(HaveKey("tokenizer.gguf"))
|
||||
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("model: vibevoice-cpp/vibe-asr/vibevoice-asr-q8_0.gguf"))
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("- type=asr"))
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("- tokenizer=vibevoice-cpp/vibe-asr/tokenizer.gguf"))
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("transcript"))
|
||||
Expect(modelConfig.ConfigFile).ToNot(ContainSubstring("voice="))
|
||||
})
|
||||
|
||||
It("honours preferences.quantizations to pick a specific quant", func() {
|
||||
imp := &importers.VibeVoiceCppImporter{}
|
||||
details := withHF(`{"name":"vibe","quantizations":"q4_k"}`,
|
||||
hfFile("vibevoice-asr-q4_k.gguf", "aaa"),
|
||||
hfFile("vibevoice-asr-q8_0.gguf", "bbb"),
|
||||
hfFile("tokenizer.gguf", "ccc"),
|
||||
)
|
||||
|
||||
modelConfig, err := imp.Import(details)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
// Repo only ships ASR — auto-routes to asr, picks the requested
|
||||
// quant, emits type=asr automatically.
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("model: vibevoice-cpp/vibe/vibevoice-asr-q4_k.gguf"))
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("- type=asr"))
|
||||
})
|
||||
|
||||
It("honours preferences.voice to pick a specific voice prompt", func() {
|
||||
imp := &importers.VibeVoiceCppImporter{}
|
||||
details := withHF(`{"name":"vibe","voice":"Emma"}`,
|
||||
hfFile("vibevoice-realtime-0.5B-q8_0.gguf", "aaa"),
|
||||
hfFile("tokenizer.gguf", "bbb"),
|
||||
hfFile("voice-en-Carter_man.gguf", "ccc"),
|
||||
hfFile("voice-en-Emma.gguf", "ddd"),
|
||||
)
|
||||
|
||||
modelConfig, err := imp.Import(details)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("- voice=vibevoice-cpp/vibe/voice-en-Emma.gguf"))
|
||||
Expect(modelConfig.ConfigFile).ToNot(ContainSubstring("voice-en-Carter_man"))
|
||||
})
|
||||
})
|
||||
|
||||
// Make sure we don't regress the existing Python-backend importer for
|
||||
// repos that don't carry the C++ port's signal (e.g. microsoft/VibeVoice-1.5B).
|
||||
Context("non-cpp vibevoice repos still route to the Python importer", func() {
|
||||
It("does not claim microsoft/VibeVoice-1.5B (no GGUF / no .cpp suffix)", func() {
|
||||
imp := &importers.VibeVoiceCppImporter{}
|
||||
details := importers.Details{
|
||||
URI: "https://huggingface.co/microsoft/VibeVoice-1.5B",
|
||||
HuggingFace: &hfapi.ModelDetails{
|
||||
ModelID: "microsoft/VibeVoice-1.5B",
|
||||
Files: []hfapi.ModelFile{
|
||||
{Path: "config.json"},
|
||||
{Path: "model.safetensors"},
|
||||
},
|
||||
},
|
||||
Preferences: json.RawMessage(`{}`),
|
||||
}
|
||||
Expect(imp.Match(details)).To(BeFalse())
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
// filepathBase returns the part of p after its last '/', or p unchanged when
// it contains no '/'. It is a small local helper so the expected-shape
// assertions in this file need no path/filepath import.
func filepathBase(p string) string {
	cut := -1
	for idx := 0; idx < len(p); idx++ {
		if p[idx] == '/' {
			cut = idx
		}
	}
	// cut stays -1 when no slash was seen, which makes this p[0:].
	return p[cut+1:]
}
|
||||
|
||||
// startsWith reports whether s begins with prefix. An empty prefix always
// matches. It is a small local helper so this file needs no strings import.
func startsWith(s, prefix string) bool {
	if len(prefix) > len(s) {
		return false
	}
	return s[:len(prefix)] == prefix
}
|
||||
Reference in New Issue
Block a user