mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-14 03:37:47 -04:00
feat(sherpa-onnx): add Kokoro TTS + multilingual Piper voices (#10309)
Wire the Kokoro model family into the sherpa-onnx backend (which only
supported VITS/Piper before) and add gallery voices for Italian, English,
Spanish, French and German plus a multilingual Kokoro model.
- csrc/shim.{c,h}: kokoro_* config setters (model/voices/tokens/data_dir/
dict_dir/lexicon/lang/length_scale) mirroring the VITS path, with the
matching frees in tts_config_free.
- backend.go: loadTTS now detects a Kokoro model (a voices.bin beside the
ONNX) and routes to configureKokoroTTS, otherwise configureVitsTTS.
Kokoro picks up espeak-ng-data, the jieba dict and the per-language
lexicons (only one English variant, to avoid tens of thousands of
duplicate-word warnings at load); the language= option hints the lang.
- backend_test.go: functional test for isKokoroModel detection.
- gallery: 5 Piper VITS voices (it_IT-paola, en_US-amy, es_ES-davefx,
fr_FR-siwis, de_DE-thorsten) + kokoro-multi-lang-v1.0, served through
sherpa-onnx-tts.yaml with native streaming TTS.
Verified by building the backend and synthesizing with a real Piper and
Kokoro model (31/31 specs pass, including real-model synth smokes).
Assisted-by: Claude:claude-opus-4-8 gofmt golangci-lint go-test
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
@@ -62,7 +62,7 @@ var (
|
||||
shimVadConfigSetDebug func(uintptr, int32)
|
||||
shimCreateVad func(uintptr, float32) uintptr
|
||||
|
||||
// TTS (offline, VITS) config
|
||||
// TTS (offline, VITS/Piper and Kokoro) config
|
||||
shimTtsConfigNew func() uintptr
|
||||
shimTtsConfigFree func(uintptr)
|
||||
shimTtsConfigSetVitsModel func(uintptr, string)
|
||||
@@ -76,6 +76,14 @@ var (
|
||||
shimTtsConfigSetDebug func(uintptr, int32)
|
||||
shimTtsConfigSetProvider func(uintptr, string)
|
||||
shimTtsConfigSetMaxNumSentences func(uintptr, int32)
|
||||
shimTtsConfigSetKokoroModel func(uintptr, string)
|
||||
shimTtsConfigSetKokoroVoices func(uintptr, string)
|
||||
shimTtsConfigSetKokoroTokens func(uintptr, string)
|
||||
shimTtsConfigSetKokoroDataDir func(uintptr, string)
|
||||
shimTtsConfigSetKokoroDictDir func(uintptr, string)
|
||||
shimTtsConfigSetKokoroLexicon func(uintptr, string)
|
||||
shimTtsConfigSetKokoroLang func(uintptr, string)
|
||||
shimTtsConfigSetKokoroLengthScale func(uintptr, float32)
|
||||
shimCreateOfflineTts func(uintptr) uintptr
|
||||
|
||||
// Offline recognizer config
|
||||
@@ -101,37 +109,37 @@ var (
|
||||
shimCreateOfflineRecognizer func(uintptr) uintptr
|
||||
|
||||
// Online recognizer config
|
||||
shimOnlineRecogConfigNew func() uintptr
|
||||
shimOnlineRecogConfigFree func(uintptr)
|
||||
shimOnlineRecogConfigSetTransducerEncoder func(uintptr, string)
|
||||
shimOnlineRecogConfigSetTransducerDecoder func(uintptr, string)
|
||||
shimOnlineRecogConfigSetTransducerJoiner func(uintptr, string)
|
||||
shimOnlineRecogConfigSetTokens func(uintptr, string)
|
||||
shimOnlineRecogConfigSetNumThreads func(uintptr, int32)
|
||||
shimOnlineRecogConfigSetDebug func(uintptr, int32)
|
||||
shimOnlineRecogConfigSetProvider func(uintptr, string)
|
||||
shimOnlineRecogConfigSetFeatSampleRate func(uintptr, int32)
|
||||
shimOnlineRecogConfigSetFeatFeatureDim func(uintptr, int32)
|
||||
shimOnlineRecogConfigSetDecodingMethod func(uintptr, string)
|
||||
shimOnlineRecogConfigSetEnableEndpoint func(uintptr, int32)
|
||||
shimOnlineRecogConfigNew func() uintptr
|
||||
shimOnlineRecogConfigFree func(uintptr)
|
||||
shimOnlineRecogConfigSetTransducerEncoder func(uintptr, string)
|
||||
shimOnlineRecogConfigSetTransducerDecoder func(uintptr, string)
|
||||
shimOnlineRecogConfigSetTransducerJoiner func(uintptr, string)
|
||||
shimOnlineRecogConfigSetTokens func(uintptr, string)
|
||||
shimOnlineRecogConfigSetNumThreads func(uintptr, int32)
|
||||
shimOnlineRecogConfigSetDebug func(uintptr, int32)
|
||||
shimOnlineRecogConfigSetProvider func(uintptr, string)
|
||||
shimOnlineRecogConfigSetFeatSampleRate func(uintptr, int32)
|
||||
shimOnlineRecogConfigSetFeatFeatureDim func(uintptr, int32)
|
||||
shimOnlineRecogConfigSetDecodingMethod func(uintptr, string)
|
||||
shimOnlineRecogConfigSetEnableEndpoint func(uintptr, int32)
|
||||
shimOnlineRecogConfigSetRule1MinTrailingSilence func(uintptr, float32)
|
||||
shimOnlineRecogConfigSetRule2MinTrailingSilence func(uintptr, float32)
|
||||
shimOnlineRecogConfigSetRule3MinUtteranceLength func(uintptr, float32)
|
||||
shimCreateOnlineRecognizer func(uintptr) uintptr
|
||||
shimCreateOnlineRecognizer func(uintptr) uintptr
|
||||
|
||||
// Result accessors. Pointer returns use unsafe.Pointer so Go's
|
||||
// vet checker doesn't flag them — the returned memory is C-owned,
|
||||
// not subject to Go GC motion.
|
||||
shimWaveSampleRate func(uintptr) int32
|
||||
shimWaveNumSamples func(uintptr) int32
|
||||
shimWaveSamples func(uintptr) unsafe.Pointer
|
||||
shimOfflineResultText func(uintptr) unsafe.Pointer
|
||||
shimOnlineResultText func(uintptr) unsafe.Pointer
|
||||
shimGeneratedAudioSampleRate func(uintptr) int32
|
||||
shimGeneratedAudioN func(uintptr) int32
|
||||
shimGeneratedAudioSamples func(uintptr) unsafe.Pointer
|
||||
shimSpeechSegmentStart func(uintptr) int32
|
||||
shimSpeechSegmentN func(uintptr) int32
|
||||
shimWaveSampleRate func(uintptr) int32
|
||||
shimWaveNumSamples func(uintptr) int32
|
||||
shimWaveSamples func(uintptr) unsafe.Pointer
|
||||
shimOfflineResultText func(uintptr) unsafe.Pointer
|
||||
shimOnlineResultText func(uintptr) unsafe.Pointer
|
||||
shimGeneratedAudioSampleRate func(uintptr) int32
|
||||
shimGeneratedAudioN func(uintptr) int32
|
||||
shimGeneratedAudioSamples func(uintptr) unsafe.Pointer
|
||||
shimSpeechSegmentStart func(uintptr) int32
|
||||
shimSpeechSegmentN func(uintptr) int32
|
||||
|
||||
// TTS streaming callback trampoline
|
||||
shimTtsGenerateWithCallback func(tts uintptr, text string, sid int32, speed float32, cb uintptr, ud uintptr) uintptr
|
||||
@@ -161,13 +169,13 @@ var (
|
||||
// pointer returned by the shim or `unsafe.Pointer(&slice[0])` from Go.
|
||||
var (
|
||||
// VAD
|
||||
sherpaVadAcceptWaveform func(vad uintptr, samples unsafe.Pointer, n int32)
|
||||
sherpaVadReset func(vad uintptr)
|
||||
sherpaVadFlush func(vad uintptr)
|
||||
sherpaVadEmpty func(vad uintptr) int32
|
||||
sherpaVadFront func(vad uintptr) uintptr
|
||||
sherpaVadPop func(vad uintptr)
|
||||
sherpaDestroySpeechSegment func(seg uintptr)
|
||||
sherpaVadAcceptWaveform func(vad uintptr, samples unsafe.Pointer, n int32)
|
||||
sherpaVadReset func(vad uintptr)
|
||||
sherpaVadFlush func(vad uintptr)
|
||||
sherpaVadEmpty func(vad uintptr) int32
|
||||
sherpaVadFront func(vad uintptr) uintptr
|
||||
sherpaVadPop func(vad uintptr)
|
||||
sherpaDestroySpeechSegment func(seg uintptr)
|
||||
|
||||
// Wave IO
|
||||
sherpaReadWave func(filename string) uintptr
|
||||
@@ -175,11 +183,11 @@ var (
|
||||
sherpaWriteWave func(samples unsafe.Pointer, n int32, sampleRate int32, filename string) int32
|
||||
|
||||
// Offline ASR
|
||||
sherpaCreateOfflineStream func(rec uintptr) uintptr
|
||||
sherpaDestroyOfflineStream func(stream uintptr)
|
||||
sherpaAcceptWaveformOffline func(stream uintptr, sr int32, samples unsafe.Pointer, n int32)
|
||||
sherpaDecodeOfflineStream func(rec uintptr, stream uintptr)
|
||||
sherpaGetOfflineStreamResult func(stream uintptr) uintptr
|
||||
sherpaCreateOfflineStream func(rec uintptr) uintptr
|
||||
sherpaDestroyOfflineStream func(stream uintptr)
|
||||
sherpaAcceptWaveformOffline func(stream uintptr, sr int32, samples unsafe.Pointer, n int32)
|
||||
sherpaDecodeOfflineStream func(rec uintptr, stream uintptr)
|
||||
sherpaGetOfflineStreamResult func(stream uintptr) uintptr
|
||||
sherpaDestroyOfflineRecognizerResult func(result uintptr)
|
||||
|
||||
// Online ASR
|
||||
@@ -195,21 +203,21 @@ var (
|
||||
sherpaOnlineStreamInputFinished func(stream uintptr)
|
||||
|
||||
// TTS
|
||||
sherpaOfflineTtsGenerate func(tts uintptr, text string, sid int32, speed float32) uintptr
|
||||
sherpaOfflineTtsGenerate func(tts uintptr, text string, sid int32, speed float32) uintptr
|
||||
sherpaDestroyOfflineTtsGeneratedAudio func(audio uintptr)
|
||||
sherpaOfflineTtsSampleRate func(tts uintptr) int32
|
||||
sherpaOfflineTtsSampleRate func(tts uintptr) int32
|
||||
|
||||
// Offline speaker diarization. Result handle owns the segment-array
|
||||
// pointer returned by ResultSortByStartTime; destroy the segment
|
||||
// array first, then the result, then (at backend Free()) the diarizer.
|
||||
sherpaDestroyOfflineSpeakerDiarization func(sd uintptr)
|
||||
sherpaOfflineSpeakerDiarizationGetSampleRate func(sd uintptr) int32
|
||||
sherpaOfflineSpeakerDiarizationProcess func(sd uintptr, samples unsafe.Pointer, n int32) uintptr
|
||||
sherpaOfflineSpeakerDiarizationResultGetNumSegments func(result uintptr) int32
|
||||
sherpaOfflineSpeakerDiarizationResultGetNumSpeakers func(result uintptr) int32
|
||||
sherpaOfflineSpeakerDiarizationResultSortByStartTime func(result uintptr) uintptr
|
||||
sherpaOfflineSpeakerDiarizationDestroySegment func(segs uintptr)
|
||||
sherpaDestroyOfflineSpeakerDiarizationResult func(result uintptr)
|
||||
sherpaDestroyOfflineSpeakerDiarization func(sd uintptr)
|
||||
sherpaOfflineSpeakerDiarizationGetSampleRate func(sd uintptr) int32
|
||||
sherpaOfflineSpeakerDiarizationProcess func(sd uintptr, samples unsafe.Pointer, n int32) uintptr
|
||||
sherpaOfflineSpeakerDiarizationResultGetNumSegments func(result uintptr) int32
|
||||
sherpaOfflineSpeakerDiarizationResultGetNumSpeakers func(result uintptr) int32
|
||||
sherpaOfflineSpeakerDiarizationResultSortByStartTime func(result uintptr) uintptr
|
||||
sherpaOfflineSpeakerDiarizationDestroySegment func(segs uintptr)
|
||||
sherpaDestroyOfflineSpeakerDiarizationResult func(result uintptr)
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -278,6 +286,14 @@ func loadSherpaLibsOnce() error {
|
||||
{&shimTtsConfigSetDebug, "sherpa_shim_tts_config_set_debug"},
|
||||
{&shimTtsConfigSetProvider, "sherpa_shim_tts_config_set_provider"},
|
||||
{&shimTtsConfigSetMaxNumSentences, "sherpa_shim_tts_config_set_max_num_sentences"},
|
||||
{&shimTtsConfigSetKokoroModel, "sherpa_shim_tts_config_set_kokoro_model"},
|
||||
{&shimTtsConfigSetKokoroVoices, "sherpa_shim_tts_config_set_kokoro_voices"},
|
||||
{&shimTtsConfigSetKokoroTokens, "sherpa_shim_tts_config_set_kokoro_tokens"},
|
||||
{&shimTtsConfigSetKokoroDataDir, "sherpa_shim_tts_config_set_kokoro_data_dir"},
|
||||
{&shimTtsConfigSetKokoroDictDir, "sherpa_shim_tts_config_set_kokoro_dict_dir"},
|
||||
{&shimTtsConfigSetKokoroLexicon, "sherpa_shim_tts_config_set_kokoro_lexicon"},
|
||||
{&shimTtsConfigSetKokoroLang, "sherpa_shim_tts_config_set_kokoro_lang"},
|
||||
{&shimTtsConfigSetKokoroLengthScale, "sherpa_shim_tts_config_set_kokoro_length_scale"},
|
||||
{&shimCreateOfflineTts, "sherpa_shim_create_offline_tts"},
|
||||
|
||||
{&shimOfflineRecogConfigNew, "sherpa_shim_offline_recog_config_new"},
|
||||
@@ -688,21 +704,14 @@ func (s *SherpaBackend) loadTTS(opts *pb.ModelOptions) error {
|
||||
cfg := shimTtsConfigNew()
|
||||
defer shimTtsConfigFree(cfg)
|
||||
|
||||
shimTtsConfigSetVitsModel(cfg, modelFile)
|
||||
|
||||
if tokensPath := filepath.Join(modelDir, "tokens.txt"); fileExists(tokensPath) {
|
||||
shimTtsConfigSetVitsTokens(cfg, tokensPath)
|
||||
// Kokoro models ship a voices style file alongside the ONNX, whereas
|
||||
// VITS/Piper voices do not. That presence is what tells the two model
|
||||
// families apart, since both arrive as a plain *.onnx in modelDir.
|
||||
if isKokoroModel(modelDir) {
|
||||
s.configureKokoroTTS(cfg, opts, modelFile, modelDir)
|
||||
} else {
|
||||
s.configureVitsTTS(cfg, opts, modelFile, modelDir)
|
||||
}
|
||||
if lexiconPath := filepath.Join(modelDir, "lexicon.txt"); fileExists(lexiconPath) {
|
||||
shimTtsConfigSetVitsLexicon(cfg, lexiconPath)
|
||||
}
|
||||
if dataDir := filepath.Join(modelDir, "espeak-ng-data"); dirExists(dataDir) {
|
||||
shimTtsConfigSetVitsDataDir(cfg, dataDir)
|
||||
}
|
||||
|
||||
shimTtsConfigSetVitsNoiseScale(cfg, findOptionFloat(opts, optionTtsNoiseScale, 0.667))
|
||||
shimTtsConfigSetVitsNoiseScaleW(cfg, findOptionFloat(opts, optionTtsNoiseScaleW, 0.8))
|
||||
shimTtsConfigSetVitsLengthScale(cfg, findOptionFloat(opts, optionTtsLengthScale, 1.0))
|
||||
|
||||
threads := int32(1)
|
||||
if opts.Threads != 0 {
|
||||
@@ -723,6 +732,80 @@ func (s *SherpaBackend) loadTTS(opts *pb.ModelOptions) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// kokoroVoicesFile is the speaker-style bank that ships with Kokoro models and
|
||||
// is absent from VITS/Piper voices; its presence is how loadTTS tells them apart.
|
||||
const kokoroVoicesFile = "voices.bin"
|
||||
|
||||
// isKokoroModel reports whether modelDir holds a Kokoro model (a voices file
|
||||
// next to the ONNX) rather than a VITS/Piper single-speaker model.
|
||||
func isKokoroModel(modelDir string) bool {
|
||||
return fileExists(filepath.Join(modelDir, kokoroVoicesFile))
|
||||
}
|
||||
|
||||
// configureVitsTTS wires a VITS/Piper single-speaker model into cfg: the ONNX
|
||||
// plus the optional tokens, lexicon and espeak-ng-data found beside it.
|
||||
func (s *SherpaBackend) configureVitsTTS(cfg uintptr, opts *pb.ModelOptions, modelFile, modelDir string) {
|
||||
shimTtsConfigSetVitsModel(cfg, modelFile)
|
||||
|
||||
if tokensPath := filepath.Join(modelDir, "tokens.txt"); fileExists(tokensPath) {
|
||||
shimTtsConfigSetVitsTokens(cfg, tokensPath)
|
||||
}
|
||||
if lexiconPath := filepath.Join(modelDir, "lexicon.txt"); fileExists(lexiconPath) {
|
||||
shimTtsConfigSetVitsLexicon(cfg, lexiconPath)
|
||||
}
|
||||
if dataDir := filepath.Join(modelDir, "espeak-ng-data"); dirExists(dataDir) {
|
||||
shimTtsConfigSetVitsDataDir(cfg, dataDir)
|
||||
}
|
||||
|
||||
shimTtsConfigSetVitsNoiseScale(cfg, findOptionFloat(opts, optionTtsNoiseScale, 0.667))
|
||||
shimTtsConfigSetVitsNoiseScaleW(cfg, findOptionFloat(opts, optionTtsNoiseScaleW, 0.8))
|
||||
shimTtsConfigSetVitsLengthScale(cfg, findOptionFloat(opts, optionTtsLengthScale, 1.0))
|
||||
}
|
||||
|
||||
// configureKokoroTTS wires a Kokoro model into cfg: the ONNX, its voices bank,
|
||||
// tokens, and the optional espeak-ng-data / jieba dict / lexicon assets the
|
||||
// multi-lingual packs ship. A language hint comes from the `language=` option.
|
||||
func (s *SherpaBackend) configureKokoroTTS(cfg uintptr, opts *pb.ModelOptions, modelFile, modelDir string) {
|
||||
shimTtsConfigSetKokoroModel(cfg, modelFile)
|
||||
shimTtsConfigSetKokoroVoices(cfg, filepath.Join(modelDir, kokoroVoicesFile))
|
||||
|
||||
if tokensPath := filepath.Join(modelDir, "tokens.txt"); fileExists(tokensPath) {
|
||||
shimTtsConfigSetKokoroTokens(cfg, tokensPath)
|
||||
}
|
||||
if dataDir := filepath.Join(modelDir, "espeak-ng-data"); dirExists(dataDir) {
|
||||
shimTtsConfigSetKokoroDataDir(cfg, dataDir)
|
||||
}
|
||||
if dictDir := filepath.Join(modelDir, "dict"); dirExists(dictDir) {
|
||||
shimTtsConfigSetKokoroDictDir(cfg, dictDir)
|
||||
}
|
||||
|
||||
// Multi-lingual Kokoro ships per-language lexicons; the C API takes them as
|
||||
// a single comma-separated list. US and GB English overlap almost entirely,
|
||||
// so pass only one (US preferred) to avoid tens of thousands of "duplicated
|
||||
// word" warnings at load; non-English lexicons (e.g. zh) are additive.
|
||||
var lexicons []string
|
||||
addLexicon := func(name string) {
|
||||
if p := filepath.Join(modelDir, name); fileExists(p) {
|
||||
lexicons = append(lexicons, p)
|
||||
}
|
||||
}
|
||||
if fileExists(filepath.Join(modelDir, "lexicon-us-en.txt")) {
|
||||
addLexicon("lexicon-us-en.txt")
|
||||
} else {
|
||||
addLexicon("lexicon-gb-en.txt")
|
||||
}
|
||||
addLexicon("lexicon-zh.txt")
|
||||
addLexicon("lexicon.txt")
|
||||
if len(lexicons) > 0 {
|
||||
shimTtsConfigSetKokoroLexicon(cfg, strings.Join(lexicons, ","))
|
||||
}
|
||||
|
||||
if lang := findOptionValue(opts, optionLanguage, ""); lang != "" {
|
||||
shimTtsConfigSetKokoroLang(cfg, lang)
|
||||
}
|
||||
shimTtsConfigSetKokoroLengthScale(cfg, findOptionFloat(opts, optionTtsLengthScale, 1.0))
|
||||
}
|
||||
|
||||
func fileExists(p string) bool {
|
||||
info, err := os.Stat(p)
|
||||
return err == nil && !info.IsDir()
|
||||
@@ -1252,7 +1335,7 @@ type ttsStreamState struct {
|
||||
var (
|
||||
ttsStates sync.Map // uint64 → *ttsStreamState
|
||||
ttsNextID atomic.Uint64
|
||||
ttsCallbackPtr uintptr // purego.NewCallback return; registered in loadSherpaLibs
|
||||
ttsCallbackPtr uintptr // purego.NewCallback return; registered in loadSherpaLibs
|
||||
)
|
||||
|
||||
// ttsStreamCallback is invoked by sherpa-onnx for each PCM chunk VITS
|
||||
|
||||
@@ -124,6 +124,20 @@ var _ = Describe("Sherpa-ONNX", func() {
|
||||
Entry("empty", "", false),
|
||||
Entry("other", "other", false),
|
||||
)
|
||||
|
||||
It("isKokoroModel detects a voices file beside the ONNX", func() {
	tmpDir, err := os.MkdirTemp("", "sherpa-kokoro-*")
	Expect(err).NotTo(HaveOccurred())
	// Best-effort cleanup; a failure to remove the temp dir is not a spec failure.
	defer func() { _ = os.RemoveAll(tmpDir) }()

	payload := []byte("x")

	// With only an ONNX file present the directory must not be treated as Kokoro.
	onnxPath := filepath.Join(tmpDir, "model.onnx")
	Expect(os.WriteFile(onnxPath, payload, 0o600)).To(Succeed())
	Expect(isKokoroModel(tmpDir)).To(BeFalse())

	// Once the voices bank sits next to the ONNX, detection must report Kokoro.
	voicesPath := filepath.Join(tmpDir, kokoroVoicesFile)
	Expect(os.WriteFile(voicesPath, payload, 0o600)).To(Succeed())
	Expect(isKokoroModel(tmpDir)).To(BeTrue())
})
|
||||
})
|
||||
|
||||
Context("option parsing", func() {
|
||||
|
||||
@@ -79,6 +79,13 @@ void sherpa_shim_tts_config_free(void *h) {
|
||||
free((char *)c->model.vits.tokens);
|
||||
free((char *)c->model.vits.lexicon);
|
||||
free((char *)c->model.vits.data_dir);
|
||||
free((char *)c->model.kokoro.model);
|
||||
free((char *)c->model.kokoro.voices);
|
||||
free((char *)c->model.kokoro.tokens);
|
||||
free((char *)c->model.kokoro.data_dir);
|
||||
free((char *)c->model.kokoro.dict_dir);
|
||||
free((char *)c->model.kokoro.lexicon);
|
||||
free((char *)c->model.kokoro.lang);
|
||||
free((char *)c->model.provider);
|
||||
free(c);
|
||||
}
|
||||
@@ -117,6 +124,34 @@ void sherpa_shim_tts_config_set_max_num_sentences(void *h, int32_t v) {
|
||||
((SherpaOnnxOfflineTtsConfig *)h)->max_num_sentences = v;
|
||||
}
|
||||
|
||||
// Kokoro multi-speaker / multi-lingual TTS. Distinct ONNX + a voices style
|
||||
// file (voices.bin) instead of VITS' single-speaker graph; espeak-ng-data,
|
||||
// lexicon and a language hint are optional refinements.
|
||||
void sherpa_shim_tts_config_set_kokoro_model(void *h, const char *v) {
|
||||
shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.model, v);
|
||||
}
|
||||
void sherpa_shim_tts_config_set_kokoro_voices(void *h, const char *v) {
|
||||
shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.voices, v);
|
||||
}
|
||||
void sherpa_shim_tts_config_set_kokoro_tokens(void *h, const char *v) {
|
||||
shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.tokens, v);
|
||||
}
|
||||
void sherpa_shim_tts_config_set_kokoro_data_dir(void *h, const char *v) {
|
||||
shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.data_dir, v);
|
||||
}
|
||||
void sherpa_shim_tts_config_set_kokoro_dict_dir(void *h, const char *v) {
|
||||
shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.dict_dir, v);
|
||||
}
|
||||
void sherpa_shim_tts_config_set_kokoro_lexicon(void *h, const char *v) {
|
||||
shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.lexicon, v);
|
||||
}
|
||||
void sherpa_shim_tts_config_set_kokoro_lang(void *h, const char *v) {
|
||||
shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.lang, v);
|
||||
}
|
||||
void sherpa_shim_tts_config_set_kokoro_length_scale(void *h, float v) {
|
||||
((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.length_scale = v;
|
||||
}
|
||||
|
||||
void *sherpa_shim_create_offline_tts(void *h) {
|
||||
return (void *)SherpaOnnxCreateOfflineTts(
|
||||
(const SherpaOnnxOfflineTtsConfig *)h);
|
||||
|
||||
@@ -37,7 +37,7 @@ void sherpa_shim_vad_config_set_provider(void *cfg, const char *v);
|
||||
void sherpa_shim_vad_config_set_debug(void *cfg, int32_t v);
|
||||
void *sherpa_shim_create_vad(void *cfg, float buffer_size_seconds);
|
||||
|
||||
// --- Offline TTS config (VITS path — the only TTS family the backend uses) ---
|
||||
// --- Offline TTS config (VITS/Piper and Kokoro model families) ---
|
||||
void *sherpa_shim_tts_config_new(void);
|
||||
void sherpa_shim_tts_config_free(void *cfg);
|
||||
void sherpa_shim_tts_config_set_vits_model(void *cfg, const char *v);
|
||||
@@ -51,6 +51,14 @@ void sherpa_shim_tts_config_set_num_threads(void *cfg, int32_t v);
|
||||
void sherpa_shim_tts_config_set_debug(void *cfg, int32_t v);
|
||||
void sherpa_shim_tts_config_set_provider(void *cfg, const char *v);
|
||||
void sherpa_shim_tts_config_set_max_num_sentences(void *cfg, int32_t v);
|
||||
void sherpa_shim_tts_config_set_kokoro_model(void *cfg, const char *v);
|
||||
void sherpa_shim_tts_config_set_kokoro_voices(void *cfg, const char *v);
|
||||
void sherpa_shim_tts_config_set_kokoro_tokens(void *cfg, const char *v);
|
||||
void sherpa_shim_tts_config_set_kokoro_data_dir(void *cfg, const char *v);
|
||||
void sherpa_shim_tts_config_set_kokoro_dict_dir(void *cfg, const char *v);
|
||||
void sherpa_shim_tts_config_set_kokoro_lexicon(void *cfg, const char *v);
|
||||
void sherpa_shim_tts_config_set_kokoro_lang(void *cfg, const char *v);
|
||||
void sherpa_shim_tts_config_set_kokoro_length_scale(void *cfg, float v);
|
||||
void *sherpa_shim_create_offline_tts(void *cfg);
|
||||
|
||||
// --- Offline recognizer config (Whisper / Paraformer / SenseVoice / Omnilingual) ---
|
||||
|
||||
@@ -2764,6 +2764,171 @@
|
||||
- filename: vits-ljs/lexicon.txt
|
||||
sha256: bdccfc6da71c45c48e2e0056fcf0aab760577c5f959f6c1b5eb3e3e916fd5a0e
|
||||
uri: https://huggingface.co/csukuangfj/vits-ljs/resolve/main/lexicon.txt
|
||||
- name: vits-piper-it_IT-paola-sherpa
|
||||
url: github:mudler/LocalAI/gallery/sherpa-onnx-tts.yaml@master
|
||||
urls:
|
||||
- https://github.com/k2-fsa/sherpa-onnx
|
||||
- https://huggingface.co/datasets/paolapersico1/Voice-Dataset-Italian
|
||||
description: |
|
||||
Italian (it_IT) single-speaker Piper VITS voice "paola" (medium quality, 22.05 kHz), served through the sherpa-onnx backend with native streaming TTS. Ships espeak-ng phonemization data, so it works for Italian out of the box.
|
||||
license: other
|
||||
icon: https://avatars.githubusercontent.com/u/75781706
|
||||
tags:
|
||||
- vits
|
||||
- piper
|
||||
- text-to-speech
|
||||
- tts
|
||||
- italian
|
||||
- onnx
|
||||
- sherpa-onnx
|
||||
- single-speaker
|
||||
last_checked: "2026-06-13"
|
||||
overrides:
|
||||
known_usecases:
|
||||
- tts
|
||||
parameters:
|
||||
model: vits-piper-it_IT-paola-medium/it_IT-paola-medium.onnx
|
||||
files:
|
||||
- filename: vits-piper-it_IT-paola-medium.tar.bz2
|
||||
sha256: 7541f75778afa164e44e34baaef63befad7698595df26a95ca944b63ef1a16b4
|
||||
uri: https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-it_IT-paola-medium.tar.bz2
|
||||
- name: vits-piper-en_US-amy-sherpa
|
||||
url: github:mudler/LocalAI/gallery/sherpa-onnx-tts.yaml@master
|
||||
urls:
|
||||
- https://github.com/k2-fsa/sherpa-onnx
|
||||
- https://github.com/MycroftAI/mimic3-voices
|
||||
description: |
|
||||
English (en_US) single-speaker Piper VITS voice "amy" (medium quality, 22.05 kHz), served through the sherpa-onnx backend with native streaming TTS. Ships espeak-ng phonemization data.
|
||||
license: other
|
||||
icon: https://avatars.githubusercontent.com/u/75781706
|
||||
tags:
|
||||
- vits
|
||||
- piper
|
||||
- text-to-speech
|
||||
- tts
|
||||
- english
|
||||
- onnx
|
||||
- sherpa-onnx
|
||||
- single-speaker
|
||||
last_checked: "2026-06-13"
|
||||
overrides:
|
||||
known_usecases:
|
||||
- tts
|
||||
parameters:
|
||||
model: vits-piper-en_US-amy-medium/en_US-amy-medium.onnx
|
||||
files:
|
||||
- filename: vits-piper-en_US-amy-medium.tar.bz2
|
||||
sha256: 9a5d1fc497f85e8022b785bff5f8105203b1e33099ee6265203efc70b0cb0264
|
||||
uri: https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-medium.tar.bz2
|
||||
- name: vits-piper-es_ES-davefx-sherpa
|
||||
url: github:mudler/LocalAI/gallery/sherpa-onnx-tts.yaml@master
|
||||
urls:
|
||||
- https://github.com/k2-fsa/sherpa-onnx
|
||||
description: |
|
||||
Spanish (es_ES) single-speaker Piper VITS voice "davefx" (medium quality, 22.05 kHz), served through the sherpa-onnx backend with native streaming TTS. Ships espeak-ng phonemization data.
|
||||
license: cc0-1.0
|
||||
icon: https://avatars.githubusercontent.com/u/75781706
|
||||
tags:
|
||||
- vits
|
||||
- piper
|
||||
- text-to-speech
|
||||
- tts
|
||||
- spanish
|
||||
- onnx
|
||||
- sherpa-onnx
|
||||
- single-speaker
|
||||
last_checked: "2026-06-13"
|
||||
overrides:
|
||||
known_usecases:
|
||||
- tts
|
||||
parameters:
|
||||
model: vits-piper-es_ES-davefx-medium/es_ES-davefx-medium.onnx
|
||||
files:
|
||||
- filename: vits-piper-es_ES-davefx-medium.tar.bz2
|
||||
sha256: a3f6beb54a9cb893279f72978a22f807a4d9fc9c7848157b524d5cc7b7f58b22
|
||||
uri: https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-es_ES-davefx-medium.tar.bz2
|
||||
- name: vits-piper-fr_FR-siwis-sherpa
|
||||
url: github:mudler/LocalAI/gallery/sherpa-onnx-tts.yaml@master
|
||||
urls:
|
||||
- https://github.com/k2-fsa/sherpa-onnx
|
||||
description: |
|
||||
French (fr_FR) single-speaker Piper VITS voice "siwis" (medium quality, 22.05 kHz), served through the sherpa-onnx backend with native streaming TTS. Ships espeak-ng phonemization data.
|
||||
license: cc-by-4.0
|
||||
icon: https://avatars.githubusercontent.com/u/75781706
|
||||
tags:
|
||||
- vits
|
||||
- piper
|
||||
- text-to-speech
|
||||
- tts
|
||||
- french
|
||||
- onnx
|
||||
- sherpa-onnx
|
||||
- single-speaker
|
||||
last_checked: "2026-06-13"
|
||||
overrides:
|
||||
known_usecases:
|
||||
- tts
|
||||
parameters:
|
||||
model: vits-piper-fr_FR-siwis-medium/fr_FR-siwis-medium.onnx
|
||||
files:
|
||||
- filename: vits-piper-fr_FR-siwis-medium.tar.bz2
|
||||
sha256: 375909aa30842b3a4efa10b1beb1d761af792960ae6873b4d53889f96c66195b
|
||||
uri: https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-fr_FR-siwis-medium.tar.bz2
|
||||
- name: vits-piper-de_DE-thorsten-sherpa
|
||||
url: github:mudler/LocalAI/gallery/sherpa-onnx-tts.yaml@master
|
||||
urls:
|
||||
- https://github.com/k2-fsa/sherpa-onnx
|
||||
description: |
|
||||
German (de_DE) single-speaker Piper VITS voice "thorsten" (medium quality, 22.05 kHz), served through the sherpa-onnx backend with native streaming TTS. Ships espeak-ng phonemization data.
|
||||
license: cc0-1.0
|
||||
icon: https://avatars.githubusercontent.com/u/75781706
|
||||
tags:
|
||||
- vits
|
||||
- piper
|
||||
- text-to-speech
|
||||
- tts
|
||||
- german
|
||||
- onnx
|
||||
- sherpa-onnx
|
||||
- single-speaker
|
||||
last_checked: "2026-06-13"
|
||||
overrides:
|
||||
known_usecases:
|
||||
- tts
|
||||
parameters:
|
||||
model: vits-piper-de_DE-thorsten-medium/de_DE-thorsten-medium.onnx
|
||||
files:
|
||||
- filename: vits-piper-de_DE-thorsten-medium.tar.bz2
|
||||
sha256: 50487d9c95fdf2191f31d2588569381063ba1591dcd4c7d4bdd30f12b2191714
|
||||
uri: https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-de_DE-thorsten-medium.tar.bz2
|
||||
- name: kokoro-multi-lang-v1.0-sherpa
|
||||
url: github:mudler/LocalAI/gallery/sherpa-onnx-tts.yaml@master
|
||||
urls:
|
||||
- https://github.com/k2-fsa/sherpa-onnx
|
||||
- https://huggingface.co/hexgrad/Kokoro-82M
|
||||
description: |
|
||||
Kokoro multi-lingual TTS (v1.0, int8) served through the sherpa-onnx backend with native streaming TTS. A single model covers many languages and speakers (English, Italian, Spanish, French, German and more) via a built-in voices bank; espeak-ng data and per-language lexicons ship with it. Select a speaker with the `voice` parameter (numeric speaker id) and optionally pass `language=` to hint the language.
|
||||
license: apache-2.0
|
||||
icon: https://avatars.githubusercontent.com/u/75781706
|
||||
tags:
|
||||
- kokoro
|
||||
- text-to-speech
|
||||
- tts
|
||||
- multilingual
|
||||
- italian
|
||||
- english
|
||||
- onnx
|
||||
- sherpa-onnx
|
||||
last_checked: "2026-06-13"
|
||||
overrides:
|
||||
known_usecases:
|
||||
- tts
|
||||
parameters:
|
||||
model: kokoro-int8-multi-lang-v1_0/model.int8.onnx
|
||||
files:
|
||||
- filename: kokoro-int8-multi-lang-v1_0.tar.bz2
|
||||
sha256: 75654a84864be26f345f020f4070c2c019e96dd1b7f9bf6e2ffd59efac6aa5a3
|
||||
uri: https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-int8-multi-lang-v1_0.tar.bz2
|
||||
- name: voxcpm-1.5
|
||||
url: github:mudler/LocalAI/gallery/virtual.yaml@master
|
||||
urls:
|
||||
|
||||
Reference in New Issue
Block a user