diff --git a/backend/go/sherpa-onnx/backend.go b/backend/go/sherpa-onnx/backend.go index 91b797aa0..0a092acf7 100644 --- a/backend/go/sherpa-onnx/backend.go +++ b/backend/go/sherpa-onnx/backend.go @@ -62,7 +62,7 @@ var ( shimVadConfigSetDebug func(uintptr, int32) shimCreateVad func(uintptr, float32) uintptr - // TTS (offline, VITS) config + // TTS (offline, VITS/Piper and Kokoro) config shimTtsConfigNew func() uintptr shimTtsConfigFree func(uintptr) shimTtsConfigSetVitsModel func(uintptr, string) @@ -76,6 +76,14 @@ var ( shimTtsConfigSetDebug func(uintptr, int32) shimTtsConfigSetProvider func(uintptr, string) shimTtsConfigSetMaxNumSentences func(uintptr, int32) + shimTtsConfigSetKokoroModel func(uintptr, string) + shimTtsConfigSetKokoroVoices func(uintptr, string) + shimTtsConfigSetKokoroTokens func(uintptr, string) + shimTtsConfigSetKokoroDataDir func(uintptr, string) + shimTtsConfigSetKokoroDictDir func(uintptr, string) + shimTtsConfigSetKokoroLexicon func(uintptr, string) + shimTtsConfigSetKokoroLang func(uintptr, string) + shimTtsConfigSetKokoroLengthScale func(uintptr, float32) shimCreateOfflineTts func(uintptr) uintptr // Offline recognizer config @@ -101,37 +109,37 @@ var ( shimCreateOfflineRecognizer func(uintptr) uintptr // Online recognizer config - shimOnlineRecogConfigNew func() uintptr - shimOnlineRecogConfigFree func(uintptr) - shimOnlineRecogConfigSetTransducerEncoder func(uintptr, string) - shimOnlineRecogConfigSetTransducerDecoder func(uintptr, string) - shimOnlineRecogConfigSetTransducerJoiner func(uintptr, string) - shimOnlineRecogConfigSetTokens func(uintptr, string) - shimOnlineRecogConfigSetNumThreads func(uintptr, int32) - shimOnlineRecogConfigSetDebug func(uintptr, int32) - shimOnlineRecogConfigSetProvider func(uintptr, string) - shimOnlineRecogConfigSetFeatSampleRate func(uintptr, int32) - shimOnlineRecogConfigSetFeatFeatureDim func(uintptr, int32) - shimOnlineRecogConfigSetDecodingMethod func(uintptr, string) - shimOnlineRecogConfigSetEnableEndpoint func(uintptr, int32) + shimOnlineRecogConfigNew func() uintptr + shimOnlineRecogConfigFree func(uintptr) + shimOnlineRecogConfigSetTransducerEncoder func(uintptr, string) + shimOnlineRecogConfigSetTransducerDecoder func(uintptr, string) + shimOnlineRecogConfigSetTransducerJoiner func(uintptr, string) + shimOnlineRecogConfigSetTokens func(uintptr, string) + shimOnlineRecogConfigSetNumThreads func(uintptr, int32) + shimOnlineRecogConfigSetDebug func(uintptr, int32) + shimOnlineRecogConfigSetProvider func(uintptr, string) + shimOnlineRecogConfigSetFeatSampleRate func(uintptr, int32) + shimOnlineRecogConfigSetFeatFeatureDim func(uintptr, int32) + shimOnlineRecogConfigSetDecodingMethod func(uintptr, string) + shimOnlineRecogConfigSetEnableEndpoint func(uintptr, int32) shimOnlineRecogConfigSetRule1MinTrailingSilence func(uintptr, float32) shimOnlineRecogConfigSetRule2MinTrailingSilence func(uintptr, float32) shimOnlineRecogConfigSetRule3MinUtteranceLength func(uintptr, float32) - shimCreateOnlineRecognizer func(uintptr) uintptr + shimCreateOnlineRecognizer func(uintptr) uintptr // Result accessors. Pointer returns use unsafe.Pointer so Go's // vet checker doesn't flag them — the returned memory is C-owned, // not subject to Go GC motion. - shimWaveSampleRate func(uintptr) int32 - shimWaveNumSamples func(uintptr) int32 - shimWaveSamples func(uintptr) unsafe.Pointer - shimOfflineResultText func(uintptr) unsafe.Pointer - shimOnlineResultText func(uintptr) unsafe.Pointer - shimGeneratedAudioSampleRate func(uintptr) int32 - shimGeneratedAudioN func(uintptr) int32 - shimGeneratedAudioSamples func(uintptr) unsafe.Pointer - shimSpeechSegmentStart func(uintptr) int32 - shimSpeechSegmentN func(uintptr) int32 + shimWaveSampleRate func(uintptr) int32 + shimWaveNumSamples func(uintptr) int32 + shimWaveSamples func(uintptr) unsafe.Pointer + shimOfflineResultText func(uintptr) unsafe.Pointer + shimOnlineResultText func(uintptr) unsafe.Pointer + shimGeneratedAudioSampleRate func(uintptr) int32 + shimGeneratedAudioN func(uintptr) int32 + shimGeneratedAudioSamples func(uintptr) unsafe.Pointer + shimSpeechSegmentStart func(uintptr) int32 + shimSpeechSegmentN func(uintptr) int32 // TTS streaming callback trampoline shimTtsGenerateWithCallback func(tts uintptr, text string, sid int32, speed float32, cb uintptr, ud uintptr) uintptr @@ -161,13 +169,13 @@ var ( // pointer returned by the shim or `unsafe.Pointer(&slice[0])` from Go. var ( // VAD - sherpaVadAcceptWaveform func(vad uintptr, samples unsafe.Pointer, n int32) - sherpaVadReset func(vad uintptr) - sherpaVadFlush func(vad uintptr) - sherpaVadEmpty func(vad uintptr) int32 - sherpaVadFront func(vad uintptr) uintptr - sherpaVadPop func(vad uintptr) - sherpaDestroySpeechSegment func(seg uintptr) + sherpaVadAcceptWaveform func(vad uintptr, samples unsafe.Pointer, n int32) + sherpaVadReset func(vad uintptr) + sherpaVadFlush func(vad uintptr) + sherpaVadEmpty func(vad uintptr) int32 + sherpaVadFront func(vad uintptr) uintptr + sherpaVadPop func(vad uintptr) + sherpaDestroySpeechSegment func(seg uintptr) // Wave IO sherpaReadWave func(filename string) uintptr @@ -175,11 +183,11 @@ var ( sherpaWriteWave func(samples unsafe.Pointer, n int32, sampleRate int32, filename string) int32 // Offline ASR - sherpaCreateOfflineStream func(rec uintptr) uintptr - sherpaDestroyOfflineStream func(stream uintptr) - sherpaAcceptWaveformOffline func(stream uintptr, sr int32, samples unsafe.Pointer, n int32) - sherpaDecodeOfflineStream func(rec uintptr, stream uintptr) - sherpaGetOfflineStreamResult func(stream uintptr) uintptr + sherpaCreateOfflineStream func(rec uintptr) uintptr + sherpaDestroyOfflineStream func(stream uintptr) + sherpaAcceptWaveformOffline func(stream uintptr, sr int32, samples unsafe.Pointer, n int32) + sherpaDecodeOfflineStream func(rec uintptr, stream uintptr) + sherpaGetOfflineStreamResult func(stream uintptr) uintptr sherpaDestroyOfflineRecognizerResult func(result uintptr) // Online ASR @@ -195,21 +203,21 @@ var ( sherpaOnlineStreamInputFinished func(stream uintptr) // TTS - sherpaOfflineTtsGenerate func(tts uintptr, text string, sid int32, speed float32) uintptr + sherpaOfflineTtsGenerate func(tts uintptr, text string, sid int32, speed float32) uintptr sherpaDestroyOfflineTtsGeneratedAudio func(audio uintptr) - sherpaOfflineTtsSampleRate func(tts uintptr) int32 + sherpaOfflineTtsSampleRate func(tts uintptr) int32 // Offline speaker diarization. Result handle owns the segment-array // pointer returned by ResultSortByStartTime; destroy the segment // array first, then the result, then (at backend Free()) the diarizer. - sherpaDestroyOfflineSpeakerDiarization func(sd uintptr) - sherpaOfflineSpeakerDiarizationGetSampleRate func(sd uintptr) int32 - sherpaOfflineSpeakerDiarizationProcess func(sd uintptr, samples unsafe.Pointer, n int32) uintptr - sherpaOfflineSpeakerDiarizationResultGetNumSegments func(result uintptr) int32 - sherpaOfflineSpeakerDiarizationResultGetNumSpeakers func(result uintptr) int32 - sherpaOfflineSpeakerDiarizationResultSortByStartTime func(result uintptr) uintptr - sherpaOfflineSpeakerDiarizationDestroySegment func(segs uintptr) - sherpaDestroyOfflineSpeakerDiarizationResult func(result uintptr) + sherpaDestroyOfflineSpeakerDiarization func(sd uintptr) + sherpaOfflineSpeakerDiarizationGetSampleRate func(sd uintptr) int32 + sherpaOfflineSpeakerDiarizationProcess func(sd uintptr, samples unsafe.Pointer, n int32) uintptr + sherpaOfflineSpeakerDiarizationResultGetNumSegments func(result uintptr) int32 + sherpaOfflineSpeakerDiarizationResultGetNumSpeakers func(result uintptr) int32 + sherpaOfflineSpeakerDiarizationResultSortByStartTime func(result uintptr) uintptr + sherpaOfflineSpeakerDiarizationDestroySegment func(segs uintptr) + sherpaDestroyOfflineSpeakerDiarizationResult func(result uintptr) ) var ( @@ -278,6 +286,14 @@ func loadSherpaLibsOnce() error { {&shimTtsConfigSetDebug, "sherpa_shim_tts_config_set_debug"}, {&shimTtsConfigSetProvider, "sherpa_shim_tts_config_set_provider"}, {&shimTtsConfigSetMaxNumSentences, "sherpa_shim_tts_config_set_max_num_sentences"}, + {&shimTtsConfigSetKokoroModel, "sherpa_shim_tts_config_set_kokoro_model"}, + {&shimTtsConfigSetKokoroVoices, "sherpa_shim_tts_config_set_kokoro_voices"}, + {&shimTtsConfigSetKokoroTokens, "sherpa_shim_tts_config_set_kokoro_tokens"}, + {&shimTtsConfigSetKokoroDataDir, "sherpa_shim_tts_config_set_kokoro_data_dir"}, + {&shimTtsConfigSetKokoroDictDir, "sherpa_shim_tts_config_set_kokoro_dict_dir"}, + {&shimTtsConfigSetKokoroLexicon, "sherpa_shim_tts_config_set_kokoro_lexicon"}, + {&shimTtsConfigSetKokoroLang, "sherpa_shim_tts_config_set_kokoro_lang"}, + {&shimTtsConfigSetKokoroLengthScale, "sherpa_shim_tts_config_set_kokoro_length_scale"}, {&shimCreateOfflineTts, "sherpa_shim_create_offline_tts"}, {&shimOfflineRecogConfigNew, "sherpa_shim_offline_recog_config_new"}, @@ -688,21 +704,14 @@ func (s *SherpaBackend) loadTTS(opts *pb.ModelOptions) error { cfg := shimTtsConfigNew() defer shimTtsConfigFree(cfg) - shimTtsConfigSetVitsModel(cfg, modelFile) - - if tokensPath := filepath.Join(modelDir, "tokens.txt"); fileExists(tokensPath) { - shimTtsConfigSetVitsTokens(cfg, tokensPath) + // Kokoro models ship a voices style file alongside the ONNX, whereas + // VITS/Piper voices do not. That presence is what tells the two model + // families apart, since both arrive as a plain *.onnx in modelDir. + if isKokoroModel(modelDir) { + s.configureKokoroTTS(cfg, opts, modelFile, modelDir) + } else { + s.configureVitsTTS(cfg, opts, modelFile, modelDir) } - if lexiconPath := filepath.Join(modelDir, "lexicon.txt"); fileExists(lexiconPath) { - shimTtsConfigSetVitsLexicon(cfg, lexiconPath) - } - if dataDir := filepath.Join(modelDir, "espeak-ng-data"); dirExists(dataDir) { - shimTtsConfigSetVitsDataDir(cfg, dataDir) - } - - shimTtsConfigSetVitsNoiseScale(cfg, findOptionFloat(opts, optionTtsNoiseScale, 0.667)) - shimTtsConfigSetVitsNoiseScaleW(cfg, findOptionFloat(opts, optionTtsNoiseScaleW, 0.8)) - shimTtsConfigSetVitsLengthScale(cfg, findOptionFloat(opts, optionTtsLengthScale, 1.0)) threads := int32(1) if opts.Threads != 0 { @@ -723,6 +732,80 @@ func (s *SherpaBackend) loadTTS(opts *pb.ModelOptions) error { return nil } +// kokoroVoicesFile is the speaker-style bank that ships with Kokoro models and +// is absent from VITS/Piper voices; its presence is how loadTTS tells them apart. +const kokoroVoicesFile = "voices.bin" + +// isKokoroModel reports whether modelDir holds a Kokoro model (a voices file +// next to the ONNX) rather than a VITS/Piper single-speaker model. +func isKokoroModel(modelDir string) bool { + return fileExists(filepath.Join(modelDir, kokoroVoicesFile)) +} + +// configureVitsTTS wires a VITS/Piper single-speaker model into cfg: the ONNX +// plus the optional tokens, lexicon and espeak-ng-data found beside it. +func (s *SherpaBackend) configureVitsTTS(cfg uintptr, opts *pb.ModelOptions, modelFile, modelDir string) { + shimTtsConfigSetVitsModel(cfg, modelFile) + + if tokensPath := filepath.Join(modelDir, "tokens.txt"); fileExists(tokensPath) { + shimTtsConfigSetVitsTokens(cfg, tokensPath) + } + if lexiconPath := filepath.Join(modelDir, "lexicon.txt"); fileExists(lexiconPath) { + shimTtsConfigSetVitsLexicon(cfg, lexiconPath) + } + if dataDir := filepath.Join(modelDir, "espeak-ng-data"); dirExists(dataDir) { + shimTtsConfigSetVitsDataDir(cfg, dataDir) + } + + shimTtsConfigSetVitsNoiseScale(cfg, findOptionFloat(opts, optionTtsNoiseScale, 0.667)) + shimTtsConfigSetVitsNoiseScaleW(cfg, findOptionFloat(opts, optionTtsNoiseScaleW, 0.8)) + shimTtsConfigSetVitsLengthScale(cfg, findOptionFloat(opts, optionTtsLengthScale, 1.0)) +} + +// configureKokoroTTS wires a Kokoro model into cfg: the ONNX, its voices bank, +// tokens, and the optional espeak-ng-data / jieba dict / lexicon assets the +// multi-lingual packs ship. A language hint comes from the `language=` option. +func (s *SherpaBackend) configureKokoroTTS(cfg uintptr, opts *pb.ModelOptions, modelFile, modelDir string) { + shimTtsConfigSetKokoroModel(cfg, modelFile) + shimTtsConfigSetKokoroVoices(cfg, filepath.Join(modelDir, kokoroVoicesFile)) + + if tokensPath := filepath.Join(modelDir, "tokens.txt"); fileExists(tokensPath) { + shimTtsConfigSetKokoroTokens(cfg, tokensPath) + } + if dataDir := filepath.Join(modelDir, "espeak-ng-data"); dirExists(dataDir) { + shimTtsConfigSetKokoroDataDir(cfg, dataDir) + } + if dictDir := filepath.Join(modelDir, "dict"); dirExists(dictDir) { + shimTtsConfigSetKokoroDictDir(cfg, dictDir) + } + + // Multi-lingual Kokoro ships per-language lexicons; the C API takes them as + // a single comma-separated list. US and GB English overlap almost entirely, + // so pass only one (US preferred) to avoid tens of thousands of "duplicated + // word" warnings at load; non-English lexicons (e.g. zh) are additive. + var lexicons []string + addLexicon := func(name string) { + if p := filepath.Join(modelDir, name); fileExists(p) { + lexicons = append(lexicons, p) + } + } + if fileExists(filepath.Join(modelDir, "lexicon-us-en.txt")) { + addLexicon("lexicon-us-en.txt") + } else { + addLexicon("lexicon-gb-en.txt") + } + addLexicon("lexicon-zh.txt") + addLexicon("lexicon.txt") + if len(lexicons) > 0 { + shimTtsConfigSetKokoroLexicon(cfg, strings.Join(lexicons, ",")) + } + + if lang := findOptionValue(opts, optionLanguage, ""); lang != "" { + shimTtsConfigSetKokoroLang(cfg, lang) + } + shimTtsConfigSetKokoroLengthScale(cfg, findOptionFloat(opts, optionTtsLengthScale, 1.0)) +} + func fileExists(p string) bool { info, err := os.Stat(p) return err == nil && !info.IsDir() @@ -1252,7 +1335,7 @@ type ttsStreamState struct { var ( ttsStates sync.Map // uint64 → *ttsStreamState ttsNextID atomic.Uint64 - ttsCallbackPtr uintptr // purego.NewCallback return; registered in loadSherpaLibs + ttsCallbackPtr uintptr // purego.NewCallback return; registered in loadSherpaLibs ) // ttsStreamCallback is invoked by sherpa-onnx for each PCM chunk VITS diff --git a/backend/go/sherpa-onnx/backend_test.go b/backend/go/sherpa-onnx/backend_test.go index b70bc3e67..5e3723b04 100644 --- a/backend/go/sherpa-onnx/backend_test.go +++ b/backend/go/sherpa-onnx/backend_test.go @@ -124,6 +124,20 @@ var _ = Describe("Sherpa-ONNX", func() { Entry("empty", "", false), Entry("other", "other", false), ) + + It("isKokoroModel detects a voices file beside the ONNX", func() { + dir, err := os.MkdirTemp("", "sherpa-kokoro-*") + Expect(err).NotTo(HaveOccurred()) + defer func() { _ = os.RemoveAll(dir) }() + + // A bare VITS/Piper directory (ONNX only) is not Kokoro. + Expect(os.WriteFile(filepath.Join(dir, "model.onnx"), []byte("x"), 0o600)).To(Succeed()) + Expect(isKokoroModel(dir)).To(BeFalse()) + + // Adding the Kokoro voices bank flips detection on. + Expect(os.WriteFile(filepath.Join(dir, kokoroVoicesFile), []byte("x"), 0o600)).To(Succeed()) + Expect(isKokoroModel(dir)).To(BeTrue()) + }) }) Context("option parsing", func() { diff --git a/backend/go/sherpa-onnx/csrc/shim.c b/backend/go/sherpa-onnx/csrc/shim.c index f6cae4453..95d22b31a 100644 --- a/backend/go/sherpa-onnx/csrc/shim.c +++ b/backend/go/sherpa-onnx/csrc/shim.c @@ -79,6 +79,13 @@ void sherpa_shim_tts_config_free(void *h) { free((char *)c->model.vits.tokens); free((char *)c->model.vits.lexicon); free((char *)c->model.vits.data_dir); + free((char *)c->model.kokoro.model); + free((char *)c->model.kokoro.voices); + free((char *)c->model.kokoro.tokens); + free((char *)c->model.kokoro.data_dir); + free((char *)c->model.kokoro.dict_dir); + free((char *)c->model.kokoro.lexicon); + free((char *)c->model.kokoro.lang); free((char *)c->model.provider); free(c); } @@ -117,6 +124,34 @@ void sherpa_shim_tts_config_set_max_num_sentences(void *h, int32_t v) { ((SherpaOnnxOfflineTtsConfig *)h)->max_num_sentences = v; } +// Kokoro multi-speaker / multi-lingual TTS. Distinct ONNX + a voices style +// file (voices.bin) instead of VITS' single-speaker graph; espeak-ng-data, +// lexicon and a language hint are optional refinements. +void sherpa_shim_tts_config_set_kokoro_model(void *h, const char *v) { + shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.model, v); +} +void sherpa_shim_tts_config_set_kokoro_voices(void *h, const char *v) { + shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.voices, v); +} +void sherpa_shim_tts_config_set_kokoro_tokens(void *h, const char *v) { + shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.tokens, v); +} +void sherpa_shim_tts_config_set_kokoro_data_dir(void *h, const char *v) { + shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.data_dir, v); +} +void sherpa_shim_tts_config_set_kokoro_dict_dir(void *h, const char *v) { + shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.dict_dir, v); +} +void sherpa_shim_tts_config_set_kokoro_lexicon(void *h, const char *v) { + shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.lexicon, v); +} +void sherpa_shim_tts_config_set_kokoro_lang(void *h, const char *v) { + shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.lang, v); +} +void sherpa_shim_tts_config_set_kokoro_length_scale(void *h, float v) { + ((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.length_scale = v; +} + void *sherpa_shim_create_offline_tts(void *h) { return (void *)SherpaOnnxCreateOfflineTts( (const SherpaOnnxOfflineTtsConfig *)h); diff --git a/backend/go/sherpa-onnx/csrc/shim.h b/backend/go/sherpa-onnx/csrc/shim.h index 7b9b249cc..50c5b515e 100644 --- a/backend/go/sherpa-onnx/csrc/shim.h +++ b/backend/go/sherpa-onnx/csrc/shim.h @@ -37,7 +37,7 @@ void sherpa_shim_vad_config_set_provider(void *cfg, const char *v); void sherpa_shim_vad_config_set_debug(void *cfg, int32_t v); void *sherpa_shim_create_vad(void *cfg, float buffer_size_seconds); -// --- Offline TTS config (VITS path — the only TTS family the backend uses) --- +// --- Offline TTS config (VITS/Piper and Kokoro model families) --- void *sherpa_shim_tts_config_new(void); void sherpa_shim_tts_config_free(void *cfg); void sherpa_shim_tts_config_set_vits_model(void *cfg, const char *v); @@ -51,6 +51,14 @@ void sherpa_shim_tts_config_set_num_threads(void *cfg, int32_t v); void sherpa_shim_tts_config_set_debug(void *cfg, int32_t v); void sherpa_shim_tts_config_set_provider(void *cfg, const char *v); void sherpa_shim_tts_config_set_max_num_sentences(void *cfg, int32_t v); +void sherpa_shim_tts_config_set_kokoro_model(void *cfg, const char *v); +void sherpa_shim_tts_config_set_kokoro_voices(void *cfg, const char *v); +void sherpa_shim_tts_config_set_kokoro_tokens(void *cfg, const char *v); +void sherpa_shim_tts_config_set_kokoro_data_dir(void *cfg, const char *v); +void sherpa_shim_tts_config_set_kokoro_dict_dir(void *cfg, const char *v); +void sherpa_shim_tts_config_set_kokoro_lexicon(void *cfg, const char *v); +void sherpa_shim_tts_config_set_kokoro_lang(void *cfg, const char *v); +void sherpa_shim_tts_config_set_kokoro_length_scale(void *cfg, float v); void *sherpa_shim_create_offline_tts(void *cfg); // --- Offline recognizer config (Whisper / Paraformer / SenseVoice / Omnilingual) --- diff --git a/gallery/index.yaml b/gallery/index.yaml index 287dac588..3ca95556e 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -2764,6 +2764,171 @@ - filename: vits-ljs/lexicon.txt sha256: bdccfc6da71c45c48e2e0056fcf0aab760577c5f959f6c1b5eb3e3e916fd5a0e uri: https://huggingface.co/csukuangfj/vits-ljs/resolve/main/lexicon.txt +- name: vits-piper-it_IT-paola-sherpa + url: github:mudler/LocalAI/gallery/sherpa-onnx-tts.yaml@master + urls: + - https://github.com/k2-fsa/sherpa-onnx + - https://huggingface.co/datasets/paolapersico1/Voice-Dataset-Italian + description: | + Italian (it_IT) single-speaker Piper VITS voice "paola" (medium quality, 22.05 kHz), served through the sherpa-onnx backend with native streaming TTS. Ships espeak-ng phonemization data, so it works for Italian out of the box. + license: other + icon: https://avatars.githubusercontent.com/u/75781706 + tags: + - vits + - piper + - text-to-speech + - tts + - italian + - onnx + - sherpa-onnx + - single-speaker + last_checked: "2026-06-13" + overrides: + known_usecases: + - tts + parameters: + model: vits-piper-it_IT-paola-medium/it_IT-paola-medium.onnx + files: + - filename: vits-piper-it_IT-paola-medium.tar.bz2 + sha256: 7541f75778afa164e44e34baaef63befad7698595df26a95ca944b63ef1a16b4 + uri: https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-it_IT-paola-medium.tar.bz2 +- name: vits-piper-en_US-amy-sherpa + url: github:mudler/LocalAI/gallery/sherpa-onnx-tts.yaml@master + urls: + - https://github.com/k2-fsa/sherpa-onnx + - https://github.com/MycroftAI/mimic3-voices + description: | + English (en_US) single-speaker Piper VITS voice "amy" (medium quality, 22.05 kHz), served through the sherpa-onnx backend with native streaming TTS. Ships espeak-ng phonemization data. + license: other + icon: https://avatars.githubusercontent.com/u/75781706 + tags: + - vits + - piper + - text-to-speech + - tts + - english + - onnx + - sherpa-onnx + - single-speaker + last_checked: "2026-06-13" + overrides: + known_usecases: + - tts + parameters: + model: vits-piper-en_US-amy-medium/en_US-amy-medium.onnx + files: + - filename: vits-piper-en_US-amy-medium.tar.bz2 + sha256: 9a5d1fc497f85e8022b785bff5f8105203b1e33099ee6265203efc70b0cb0264 + uri: https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-medium.tar.bz2 +- name: vits-piper-es_ES-davefx-sherpa + url: github:mudler/LocalAI/gallery/sherpa-onnx-tts.yaml@master + urls: + - https://github.com/k2-fsa/sherpa-onnx + description: | + Spanish (es_ES) single-speaker Piper VITS voice "davefx" (medium quality, 22.05 kHz), served through the sherpa-onnx backend with native streaming TTS. Ships espeak-ng phonemization data. + license: cc0-1.0 + icon: https://avatars.githubusercontent.com/u/75781706 + tags: + - vits + - piper + - text-to-speech + - tts + - spanish + - onnx + - sherpa-onnx + - single-speaker + last_checked: "2026-06-13" + overrides: + known_usecases: + - tts + parameters: + model: vits-piper-es_ES-davefx-medium/es_ES-davefx-medium.onnx + files: + - filename: vits-piper-es_ES-davefx-medium.tar.bz2 + sha256: a3f6beb54a9cb893279f72978a22f807a4d9fc9c7848157b524d5cc7b7f58b22 + uri: https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-es_ES-davefx-medium.tar.bz2 +- name: vits-piper-fr_FR-siwis-sherpa + url: github:mudler/LocalAI/gallery/sherpa-onnx-tts.yaml@master + urls: + - https://github.com/k2-fsa/sherpa-onnx + description: | + French (fr_FR) single-speaker Piper VITS voice "siwis" (medium quality, 22.05 kHz), served through the sherpa-onnx backend with native streaming TTS. Ships espeak-ng phonemization data. + license: cc-by-4.0 + icon: https://avatars.githubusercontent.com/u/75781706 + tags: + - vits + - piper + - text-to-speech + - tts + - french + - onnx + - sherpa-onnx + - single-speaker + last_checked: "2026-06-13" + overrides: + known_usecases: + - tts + parameters: + model: vits-piper-fr_FR-siwis-medium/fr_FR-siwis-medium.onnx + files: + - filename: vits-piper-fr_FR-siwis-medium.tar.bz2 + sha256: 375909aa30842b3a4efa10b1beb1d761af792960ae6873b4d53889f96c66195b + uri: https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-fr_FR-siwis-medium.tar.bz2 +- name: vits-piper-de_DE-thorsten-sherpa + url: github:mudler/LocalAI/gallery/sherpa-onnx-tts.yaml@master + urls: + - https://github.com/k2-fsa/sherpa-onnx + description: | + German (de_DE) single-speaker Piper VITS voice "thorsten" (medium quality, 22.05 kHz), served through the sherpa-onnx backend with native streaming TTS. Ships espeak-ng phonemization data. + license: cc0-1.0 + icon: https://avatars.githubusercontent.com/u/75781706 + tags: + - vits + - piper + - text-to-speech + - tts + - german + - onnx + - sherpa-onnx + - single-speaker + last_checked: "2026-06-13" + overrides: + known_usecases: + - tts + parameters: + model: vits-piper-de_DE-thorsten-medium/de_DE-thorsten-medium.onnx + files: + - filename: vits-piper-de_DE-thorsten-medium.tar.bz2 + sha256: 50487d9c95fdf2191f31d2588569381063ba1591dcd4c7d4bdd30f12b2191714 + uri: https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-de_DE-thorsten-medium.tar.bz2 +- name: kokoro-multi-lang-v1.0-sherpa + url: github:mudler/LocalAI/gallery/sherpa-onnx-tts.yaml@master + urls: + - https://github.com/k2-fsa/sherpa-onnx + - https://huggingface.co/hexgrad/Kokoro-82M + description: | + Kokoro multi-lingual TTS (v1.0, int8) served through the sherpa-onnx backend with native streaming TTS. A single model covers many languages and speakers (English, Italian, Spanish, French, German and more) via a built-in voices bank; espeak-ng data and per-language lexicons ship with it. Select a speaker with the `voice` parameter (numeric speaker id) and optionally pass `language=` to hint the language. + license: apache-2.0 + icon: https://avatars.githubusercontent.com/u/75781706 + tags: + - kokoro + - text-to-speech + - tts + - multilingual + - italian + - english + - onnx + - sherpa-onnx + last_checked: "2026-06-13" + overrides: + known_usecases: + - tts + parameters: + model: kokoro-int8-multi-lang-v1_0/model.int8.onnx + files: + - filename: kokoro-int8-multi-lang-v1_0.tar.bz2 + sha256: 75654a84864be26f345f020f4070c2c019e96dd1b7f9bf6e2ffd59efac6aa5a3 + uri: https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-int8-multi-lang-v1_0.tar.bz2 - name: voxcpm-1.5 url: github:mudler/LocalAI/gallery/virtual.yaml@master urls: