mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-16 04:38:50 -04:00
feat(supertonic): add Supertonic ONNX TTS backend (CPU) (#10342)
* feat(supertonic): vendor upstream Go TTS pipeline (helper.go) Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(supertonic): add gRPC backend (Load/TTS/TTSStream, CPU) Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(supertonic): satisfy unused linter (use onnxProvider; exclude vendored helper.go) Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * test(supertonic): unit tests for resolvers + gated end-to-end synthesis Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * style(supertonic): gofmt backend.go comment block Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(supertonic): add Makefile, run.sh, package.sh (CPU build) Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * build(supertonic): wire backend into root Makefile Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(supertonic): check ort.DestroyEnvironment return (errcheck) Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(supertonic): resolve voice_styles as sibling of onnx dir; guard trim; test voice Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(supertonic): add CPU build matrix + gallery index entries Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(supertonic): expose as pref-only importable backend Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(supertonic): add Supertonic/supertonic-3 TTS model to the gallery 16 files (4 onnx + tts.json + 
unicode_indexer.json + 10 voice styles) from HF Supertone/supertonic-3, served via the supertonic backend. Defaults to voice F1; onnx/ + sibling voice_styles/ layout matches the backend's resolveVoicesDir. Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(meta): register pipeline.max_history_items config field Pre-existing on master: the field was added without a registry entry, failing TestAllFieldsHaveRegistryEntries (core/config/meta). Add the entry so it renders properly in the model-config UI. Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * ci(secscan): exclude vendored supertonic backend from gosec helper.go is vendored from supertone-inc/supertonic; its G304/G404/G104 findings are inherent to upstream and the math/rand use is correct for flow-matching noise (crypto/rand would be wrong). Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
30
.github/backend-matrix.yml
vendored
30
.github/backend-matrix.yml
vendored
@@ -4490,6 +4490,36 @@ include:
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
# supertonic CPU (amd64)
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
platform-tag: 'amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-cpu-supertonic'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "supertonic"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
# supertonic CPU (arm64)
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/arm64'
|
||||
platform-tag: 'arm64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-cpu-supertonic'
|
||||
runs-on: 'ubuntu-24.04-arm'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "supertonic"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
|
||||
# Darwin matrix (consumed by backend-jobs-darwin).
|
||||
includeDarwin:
|
||||
|
||||
5
.github/workflows/secscan.yaml
vendored
5
.github/workflows/secscan.yaml
vendored
@@ -21,7 +21,10 @@ jobs:
|
||||
uses: securego/gosec@v2.27.1
|
||||
with:
|
||||
# we let the report trigger content trigger a failure using the GitHub Security features.
|
||||
args: '-no-fail -fmt sarif -out results.sarif ./...'
|
||||
# backend/go/supertonic is excluded: it vendors upstream supertone-inc/supertonic
|
||||
# (helper.go), whose findings (G304 model-file loads, G404 math/rand for flow-matching
|
||||
# noise, G104 unhandled errors) are inherent to that upstream code, not ours to rewrite.
|
||||
args: '-no-fail -exclude-dir=backend/go/supertonic -fmt sarif -out results.sarif ./...'
|
||||
- name: Upload SARIF file
|
||||
if: ${{ github.actor != 'dependabot[bot]' }}
|
||||
uses: github/codeql-action/upload-sarif@v4
|
||||
|
||||
@@ -74,6 +74,8 @@ linters:
|
||||
paths:
|
||||
# Upstream whisper.cpp source tree fetched by the whisper backend Makefile.
|
||||
- 'backend/go/whisper/sources'
|
||||
# Vendored upstream supertonic pipeline (supertone-inc/supertonic go/helper.go).
|
||||
- 'backend/go/supertonic/helper.go'
|
||||
- 'docs/'
|
||||
rules:
|
||||
# CLI entry points: kong's `env:"..."` tag is the legitimate env→struct
|
||||
|
||||
7
Makefile
7
Makefile
@@ -1,5 +1,5 @@
|
||||
# Disable parallel execution for backend builds
|
||||
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/omnivoice-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio
|
||||
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/omnivoice-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio backends/supertonic
|
||||
|
||||
GOCMD=go
|
||||
GOTEST=$(GOCMD) test
|
||||
@@ -595,6 +595,7 @@ test-extra: prepare-test-extra
|
||||
$(MAKE) -C backend/rust/kokoros test
|
||||
$(MAKE) -C backend/go/rfdetr-cpp test
|
||||
$(MAKE) -C backend/go/locate-anything-cpp test
|
||||
$(MAKE) -C backend/go/supertonic test
|
||||
|
||||
##
|
||||
## End-to-end gRPC tests that exercise a built backend container image.
|
||||
@@ -1181,6 +1182,7 @@ BACKEND_VIBEVOICE_CPP = vibevoice-cpp|golang|.|false|true
|
||||
BACKEND_LOCALVQE = localvqe|golang|.|false|true
|
||||
BACKEND_OPUS = opus|golang|.|false|true
|
||||
BACKEND_SHERPA_ONNX = sherpa-onnx|golang|.|false|true
|
||||
BACKEND_SUPERTONIC = supertonic|golang|.|false|true
|
||||
|
||||
# Python backends with root context
|
||||
BACKEND_RERANKERS = rerankers|python|.|false|true
|
||||
@@ -1308,12 +1310,13 @@ $(eval $(call generate-docker-build-target,$(BACKEND_KOKOROS)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_SAM3_CPP)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_RFDETR_CPP)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_SHERPA_ONNX)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_SUPERTONIC)))
|
||||
|
||||
# Pattern rule for docker-save targets
|
||||
docker-save-%: backend-images
|
||||
docker save local-ai-backend:$* -o backend-images/$*.tar
|
||||
|
||||
docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-crispasr docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-liquid-audio docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-rfdetr-cpp docker-build-qwen3-tts-cpp docker-build-omnivoice-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx docker-build-cloud-proxy
|
||||
docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-crispasr docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-liquid-audio docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-rfdetr-cpp docker-build-qwen3-tts-cpp docker-build-omnivoice-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx docker-build-cloud-proxy docker-build-supertonic
|
||||
|
||||
########################################################
|
||||
### Mock Backend for E2E Tests
|
||||
|
||||
4
backend/go/supertonic/.gitignore
vendored
Normal file
4
backend/go/supertonic/.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
/supertonic
|
||||
/sources/
|
||||
/backend-assets/
|
||||
/package/
|
||||
62
backend/go/supertonic/Makefile
Normal file
62
backend/go/supertonic/Makefile
Normal file
@@ -0,0 +1,62 @@
|
||||
# Build file for the supertonic backend: fetches a prebuilt ONNX Runtime
# release, stages its shared libraries and builds/packages the Go binary.

CURRENT_DIR=$(abspath ./)
GOCMD=go

# ONNX Runtime release to download; all three can be overridden by the caller.
ONNX_VERSION?=1.24.4
ONNX_ARCH?=x64
ONNX_OS?=linux

ifneq (,$(findstring aarch64,$(shell uname -m)))
  ONNX_ARCH=aarch64
endif

# NOTE(review): GNU make does not define OS on its own; this branch only
# triggers when the invoking make/environment passes OS=Darwin — confirm the
# caller does so.
ifeq ($(OS),Darwin)
  ONNX_OS=osx
  ifneq (,$(findstring arm64,$(shell uname -m)))
    ONNX_ARCH=arm64
  else
    ONNX_ARCH=x86_64
  endif
endif

# CUDA 12 ships as -gpu, CUDA 13 as -gpu_cuda13 (underscore). CPU has no suffix.
ifeq ($(BUILD_TYPE),cublas)
  ONNX_PROVIDER=cuda
  ifeq ($(CUDA_MAJOR_VERSION),13)
    ONNX_VARIANT=-gpu_cuda13
  else
    ONNX_VARIANT=-gpu
  endif
else
  ONNX_VARIANT=
  ONNX_PROVIDER=cpu
endif

# Download and unpack into a staging directory and only rename it to the
# target name on success. Otherwise a failed download would leave
# sources/onnxruntime behind and make would consider the target up to date.
# curl -f turns HTTP errors into a non-zero exit instead of saving the error
# page as the tarball.
sources/onnxruntime:
	rm -rf sources/onnxruntime.tmp
	mkdir -p sources/onnxruntime.tmp
	curl -fL https://github.com/microsoft/onnxruntime/releases/download/v$(ONNX_VERSION)/onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)$(ONNX_VARIANT)-$(ONNX_VERSION).tgz \
		-o sources/onnxruntime.tmp/onnxruntime.tgz
	cd sources/onnxruntime.tmp && tar -xf onnxruntime.tgz --strip-components=1 && rm onnxruntime.tgz
	mv sources/onnxruntime.tmp sources/onnxruntime

# Stage the ONNX Runtime shared libraries (symlinks dereferenced) for packaging.
backend-assets/lib: sources/onnxruntime
	mkdir -p backend-assets/lib
	cp -rfLv sources/onnxruntime/lib/* backend-assets/lib/

# The execution provider is baked into the binary via main.onnxProvider.
supertonic: backend-assets/lib
	CGO_ENABLED=1 $(GOCMD) build \
		-ldflags "$(LD_FLAGS) -X main.onnxProvider=$(ONNX_PROVIDER)" \
		-tags "$(GO_TAGS)" -o supertonic ./

package:
	bash package.sh

build: supertonic package

# Tests need only the Go toolchain (gcc); yalue dlopens onnxruntime at
# runtime, so no tarball download is required to compile or run unit specs.
test:
	CGO_ENABLED=1 $(GOCMD) test -v -timeout 120s ./...

clean:
	rm -rf supertonic sources/ backend-assets/ package/

.PHONY: build package clean test
|
||||
307
backend/go/supertonic/backend.go
Normal file
307
backend/go/supertonic/backend.go
Normal file
@@ -0,0 +1,307 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
laudio "github.com/mudler/LocalAI/pkg/audio"
|
||||
"github.com/mudler/LocalAI/pkg/grpc/base"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
)
|
||||
|
||||
// onnxProvider names the ONNX Runtime execution provider this binary was
// built for. It must stay a variable (not a constant) so the CUDA build can
// override it at link time with -ldflags "-X main.onnxProvider=cuda"; without
// that flag it is "cpu".
var onnxProvider = "cpu"

// Prefixes of the "key=value" entries recognised in ModelOptions.Options.
// Each one overrides a per-model generation default:
//
//	supertonic.steps=<int>           denoising steps (quality), default 8
//	supertonic.speed=<float>         speech rate, default 1.05
//	supertonic.silence=<float>       inter-chunk silence seconds, default 0.3
//	supertonic.default_voice=<name>  voice-style used when request omits voice
//	supertonic.default_lang=<lang>   language tag used when request omits it
const (
	optionSteps        = "supertonic.steps="
	optionSpeed        = "supertonic.speed="
	optionSilence      = "supertonic.silence="
	optionDefaultVoice = "supertonic.default_voice="
	optionDefaultLang  = "supertonic.default_lang="
)
|
||||
|
||||
type SupertonicBackend struct {
|
||||
base.SingleThread
|
||||
|
||||
tts *TextToSpeech
|
||||
cfg Config
|
||||
modelDir string
|
||||
voicesDir string
|
||||
defaultVoice string
|
||||
defaultLang string
|
||||
steps int
|
||||
speed float32
|
||||
silence float32
|
||||
|
||||
styleMu sync.Mutex
|
||||
styles map[string]*Style // voice name -> loaded style cache
|
||||
}
|
||||
|
||||
func (s *SupertonicBackend) Load(opts *pb.ModelOptions) error {
|
||||
modelDir, err := resolveModelDir(opts.ModelFile)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
s.modelDir = modelDir
|
||||
s.voicesDir = resolveVoicesDir(modelDir)
|
||||
|
||||
cfg, err := LoadCfgs(modelDir)
|
||||
if err != nil {
|
||||
return fmt.Errorf("loading tts.json from %s: %w", modelDir, err)
|
||||
}
|
||||
s.cfg = cfg
|
||||
|
||||
// onnxProvider is "cpu" for the CPU build; the CUDA build sets it to
|
||||
// "cuda" via -ldflags. Upstream LoadTextToSpeech still errors on GPU
|
||||
// until the CUDA phase wires the execution provider.
|
||||
tts, err := LoadTextToSpeech(modelDir, onnxProvider == "cuda", cfg)
|
||||
if err != nil {
|
||||
return fmt.Errorf("loading supertonic models from %s: %w", modelDir, err)
|
||||
}
|
||||
s.tts = tts
|
||||
|
||||
s.steps = int(findOptionInt(opts, optionSteps, 8))
|
||||
s.speed = findOptionFloat(opts, optionSpeed, 1.05)
|
||||
s.silence = findOptionFloat(opts, optionSilence, 0.3)
|
||||
s.defaultVoice = findOptionValue(opts, optionDefaultVoice, "")
|
||||
s.defaultLang = findOptionValue(opts, optionDefaultLang, "na")
|
||||
s.styles = map[string]*Style{}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *SupertonicBackend) TTS(req *pb.TTSRequest) error {
|
||||
wav, sr, err := s.synthesize(req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
out := make([]float64, len(wav))
|
||||
for i, v := range wav {
|
||||
out[i] = float64(v)
|
||||
}
|
||||
if err := writeWavFile(req.Dst, out, sr); err != nil {
|
||||
return fmt.Errorf("writing wav to %s: %w", req.Dst, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *SupertonicBackend) TTSStream(req *pb.TTSRequest, results chan []byte) error {
|
||||
defer close(results)
|
||||
|
||||
wav, sr, err := s.synthesize(req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
results <- streamingWAVHeader(uint32(sr))
|
||||
|
||||
const chunkSamples = 4096
|
||||
for off := 0; off < len(wav); off += chunkSamples {
|
||||
end := off + chunkSamples
|
||||
if end > len(wav) {
|
||||
end = len(wav)
|
||||
}
|
||||
results <- pcmFloatToInt16LE(wav[off:end])
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// synthesize runs the full pipeline and returns the trimmed mono float32
|
||||
// PCM and its sample rate.
|
||||
func (s *SupertonicBackend) synthesize(req *pb.TTSRequest) ([]float32, int, error) {
|
||||
if s.tts == nil {
|
||||
return nil, 0, fmt.Errorf("supertonic model not loaded")
|
||||
}
|
||||
if strings.TrimSpace(req.Text) == "" {
|
||||
return nil, 0, fmt.Errorf("empty text")
|
||||
}
|
||||
|
||||
style, err := s.loadStyle(s.voiceName(req.Voice))
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
lang := s.resolveLang("")
|
||||
if req.Language != nil {
|
||||
lang = s.resolveLang(*req.Language)
|
||||
}
|
||||
|
||||
wav, dur, err := s.tts.Call(req.Text, lang, style, s.steps, s.speed, s.silence)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
sr := s.tts.SampleRate
|
||||
// Call returns concatenated audio; trim to the reported duration.
|
||||
wavLen := int(float32(sr) * dur)
|
||||
if wavLen < 0 {
|
||||
wavLen = 0
|
||||
}
|
||||
if wavLen > len(wav) {
|
||||
wavLen = len(wav)
|
||||
}
|
||||
return wav[:wavLen], sr, nil
|
||||
}
|
||||
|
||||
// voiceName picks the request voice, falling back to the model default.
|
||||
func (s *SupertonicBackend) voiceName(reqVoice string) string {
|
||||
v := strings.TrimSpace(reqVoice)
|
||||
if v == "" {
|
||||
return s.defaultVoice
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
// resolveLang validates against AvailableLangs, falling back to the model
|
||||
// default (then "na").
|
||||
func (s *SupertonicBackend) resolveLang(reqLang string) string {
|
||||
l := strings.TrimSpace(reqLang)
|
||||
if l != "" && isValidLang(l) {
|
||||
return l
|
||||
}
|
||||
if s.defaultLang != "" && isValidLang(s.defaultLang) {
|
||||
return s.defaultLang
|
||||
}
|
||||
return "na"
|
||||
}
|
||||
|
||||
// loadStyle resolves and caches a voice-style. An empty name with no model
|
||||
// default is an error (supertonic requires a style embedding).
|
||||
func (s *SupertonicBackend) loadStyle(name string) (*Style, error) {
|
||||
if name == "" {
|
||||
return nil, fmt.Errorf("no voice specified and no supertonic.default_voice set")
|
||||
}
|
||||
s.styleMu.Lock()
|
||||
defer s.styleMu.Unlock()
|
||||
if st, ok := s.styles[name]; ok {
|
||||
return st, nil
|
||||
}
|
||||
path := s.voiceStylePath(name)
|
||||
st, err := LoadVoiceStyle([]string{path}, false)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("loading voice style %q (%s): %w", name, path, err)
|
||||
}
|
||||
s.styles[name] = st
|
||||
return st, nil
|
||||
}
|
||||
|
||||
// voiceStylePath maps a voice name to a JSON path. Absolute paths are honored;
|
||||
// names containing a separator resolve under modelDir; bare names resolve under
|
||||
// the resolved voicesDir (see resolveVoicesDir).
|
||||
func (s *SupertonicBackend) voiceStylePath(name string) string {
|
||||
if !strings.HasSuffix(name, ".json") {
|
||||
name += ".json"
|
||||
}
|
||||
if filepath.IsAbs(name) {
|
||||
return name
|
||||
}
|
||||
if strings.ContainsRune(name, filepath.Separator) {
|
||||
return filepath.Join(s.modelDir, name)
|
||||
}
|
||||
return filepath.Join(s.voicesDir, name)
|
||||
}
|
||||
|
||||
// resolveVoicesDir picks the directory holding the voice-style JSON files.
// It prefers modelDir/voice_styles and otherwise tries voice_styles next to
// modelDir (the HF layout keeps the ONNX files in onnx/ with voice_styles/ as
// a sibling). When neither exists as a directory, modelDir/voice_styles is
// returned anyway so later errors point at the primary location.
func resolveVoicesDir(modelDir string) string {
	inModel := filepath.Join(modelDir, "voice_styles")
	sibling := filepath.Join(filepath.Dir(modelDir), "voice_styles")

	for _, dir := range [...]string{inModel, sibling} {
		fi, err := os.Stat(dir)
		if err != nil || !fi.IsDir() {
			continue
		}
		return dir
	}
	return inModel
}
|
||||
|
||||
// resolveModelDir maps the configured model path to the directory containing
// the model: a directory is returned unchanged, a file yields its parent
// directory. An empty path or a path that cannot be stat'ed is an error.
func resolveModelDir(modelFile string) (string, error) {
	if modelFile == "" {
		return "", fmt.Errorf("empty model path")
	}

	fi, err := os.Stat(modelFile)
	switch {
	case err != nil:
		return "", fmt.Errorf("stat model path %s: %w", modelFile, err)
	case fi.IsDir():
		return modelFile, nil
	default:
		return filepath.Dir(modelFile), nil
	}
}
|
||||
|
||||
// ---- option helpers (mirrors backend/go/sherpa-onnx/backend.go) ----
|
||||
|
||||
func findOptionValue(opts *pb.ModelOptions, prefix, def string) string {
|
||||
for _, o := range opts.Options {
|
||||
if strings.HasPrefix(o, prefix) {
|
||||
return strings.TrimPrefix(o, prefix)
|
||||
}
|
||||
}
|
||||
return def
|
||||
}
|
||||
|
||||
func findOptionFloat(opts *pb.ModelOptions, prefix string, def float32) float32 {
|
||||
raw := findOptionValue(opts, prefix, "")
|
||||
if raw == "" {
|
||||
return def
|
||||
}
|
||||
v, err := strconv.ParseFloat(raw, 32)
|
||||
if err != nil {
|
||||
return def
|
||||
}
|
||||
return float32(v)
|
||||
}
|
||||
|
||||
func findOptionInt(opts *pb.ModelOptions, prefix string, def int32) int32 {
|
||||
raw := findOptionValue(opts, prefix, "")
|
||||
if raw == "" {
|
||||
return def
|
||||
}
|
||||
v, err := strconv.ParseInt(raw, 10, 32)
|
||||
if err != nil {
|
||||
return def
|
||||
}
|
||||
return int32(v)
|
||||
}
|
||||
|
||||
// ---- PCM helpers ----
|
||||
|
||||
// pcmFloatToInt16LE converts float32 PCM samples to signed 16-bit
// little-endian PCM, two bytes per sample. Samples are scaled by 32767 and
// truncated toward zero; values outside the int16 range are clamped to
// [-32768, 32767] and NaN becomes silence (0).
//
// Clamping happens on the float value, before any integer conversion: Go
// leaves the result of converting an out-of-range or NaN float to an integer
// implementation-dependent, so clamping afterwards can flip a very loud
// positive sample to full-scale negative on some platforms.
func pcmFloatToInt16LE(samples []float32) []byte {
	buf := make([]byte, len(samples)*2)
	for i, f := range samples {
		scaled := f * 32767

		var v int16
		switch {
		case scaled != scaled: // NaN is the only value unequal to itself
			v = 0
		case scaled >= 32767:
			v = 32767
		case scaled <= -32768:
			v = -32768
		default:
			v = int16(scaled)
		}
		binary.LittleEndian.PutUint16(buf[2*i:], uint16(v))
	}
	return buf
}
|
||||
|
||||
func streamingWAVHeader(sampleRate uint32) []byte {
|
||||
const streamingSize = 0xFFFFFFFF
|
||||
h := laudio.NewWAVHeaderWithRate(streamingSize, sampleRate)
|
||||
h.ChunkSize = streamingSize
|
||||
var buf bytes.Buffer
|
||||
_ = h.Write(&buf)
|
||||
return buf.Bytes()
|
||||
}
|
||||
86
backend/go/supertonic/backend_test.go
Normal file
86
backend/go/supertonic/backend_test.go
Normal file
@@ -0,0 +1,86 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
)
|
||||
|
||||
var _ = Describe("voiceStylePath", func() {
|
||||
s := &SupertonicBackend{modelDir: "/models/st/onnx", voicesDir: "/models/st/voice_styles"}
|
||||
|
||||
It("resolves a bare name under the resolved voicesDir", func() {
|
||||
Expect(s.voiceStylePath("M1")).To(Equal(filepath.Join("/models/st/voice_styles", "M1.json")))
|
||||
})
|
||||
It("keeps an explicit .json suffix", func() {
|
||||
Expect(s.voiceStylePath("M1.json")).To(Equal(filepath.Join("/models/st/voice_styles", "M1.json")))
|
||||
})
|
||||
It("honors absolute paths", func() {
|
||||
Expect(s.voiceStylePath("/abs/v.json")).To(Equal("/abs/v.json"))
|
||||
})
|
||||
})
|
||||
|
||||
var _ = Describe("resolveVoicesDir", func() {
|
||||
It("prefers voice_styles under modelDir", func() {
|
||||
dir := GinkgoT().TempDir()
|
||||
Expect(os.MkdirAll(filepath.Join(dir, "voice_styles"), 0o755)).To(Succeed())
|
||||
Expect(resolveVoicesDir(dir)).To(Equal(filepath.Join(dir, "voice_styles")))
|
||||
})
|
||||
It("falls back to the sibling voice_styles next to an onnx subdir", func() {
|
||||
root := GinkgoT().TempDir()
|
||||
Expect(os.MkdirAll(filepath.Join(root, "voice_styles"), 0o755)).To(Succeed())
|
||||
Expect(os.MkdirAll(filepath.Join(root, "onnx"), 0o755)).To(Succeed())
|
||||
Expect(resolveVoicesDir(filepath.Join(root, "onnx"))).To(Equal(filepath.Join(root, "voice_styles")))
|
||||
})
|
||||
})
|
||||
|
||||
var _ = Describe("resolveLang", func() {
|
||||
It("accepts a valid request language", func() {
|
||||
s := &SupertonicBackend{defaultLang: "na"}
|
||||
Expect(s.resolveLang("ko")).To(Equal("ko"))
|
||||
})
|
||||
It("falls back to the model default for an invalid language", func() {
|
||||
s := &SupertonicBackend{defaultLang: "en"}
|
||||
Expect(s.resolveLang("zz")).To(Equal("en"))
|
||||
})
|
||||
It("falls back to na when nothing is valid", func() {
|
||||
s := &SupertonicBackend{defaultLang: ""}
|
||||
Expect(s.resolveLang("")).To(Equal("na"))
|
||||
})
|
||||
})
|
||||
|
||||
var _ = Describe("pcmFloatToInt16LE", func() {
|
||||
It("clamps and encodes little-endian", func() {
|
||||
out := pcmFloatToInt16LE([]float32{0, 1.0, -1.0, 2.0})
|
||||
Expect(out).To(HaveLen(8))
|
||||
Expect(out[0:2]).To(Equal([]byte{0x00, 0x00})) // 0
|
||||
Expect(out[2:4]).To(Equal([]byte{0xff, 0x7f})) // 32767
|
||||
Expect(out[6:8]).To(Equal([]byte{0xff, 0x7f})) // clamp 2.0 -> 32767
|
||||
})
|
||||
})
|
||||
|
||||
var _ = Describe("end-to-end synthesis", Ordered, func() {
|
||||
var modelDir string
|
||||
BeforeAll(func() {
|
||||
modelDir = os.Getenv("SUPERTONIC_MODEL_PATH")
|
||||
if modelDir == "" {
|
||||
Skip("set SUPERTONIC_MODEL_PATH to a supertonic model dir to run")
|
||||
}
|
||||
Expect(InitializeONNXRuntime()).To(Succeed())
|
||||
})
|
||||
|
||||
It("synthesizes a wav file", func() {
|
||||
b := &SupertonicBackend{}
|
||||
Expect(b.Load(&pb.ModelOptions{ModelFile: modelDir, Options: []string{"supertonic.default_voice=F1"}})).To(Succeed())
|
||||
dst := filepath.Join(GinkgoT().TempDir(), "out.wav")
|
||||
lang := "en"
|
||||
Expect(b.TTS(&pb.TTSRequest{Text: "Hello from LocalAI.", Dst: dst, Language: &lang})).To(Succeed())
|
||||
info, err := os.Stat(dst)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(info.Size()).To(BeNumerically(">", 44)) // header + PCM
|
||||
})
|
||||
})
|
||||
1085
backend/go/supertonic/helper.go
Normal file
1085
backend/go/supertonic/helper.go
Normal file
File diff suppressed because it is too large
Load Diff
27
backend/go/supertonic/main.go
Normal file
27
backend/go/supertonic/main.go
Normal file
@@ -0,0 +1,27 @@
|
||||
package main
|
||||
|
||||
// Started internally by LocalAI; a server is allocated per model.
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
grpc "github.com/mudler/LocalAI/pkg/grpc"
|
||||
ort "github.com/yalue/onnxruntime_go"
|
||||
)
|
||||
|
||||
var addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
// InitializeONNXRuntime reads ONNXRUNTIME_LIB_PATH (set by run.sh) and
|
||||
// dlopens libonnxruntime before any session is created in Load().
|
||||
if err := InitializeONNXRuntime(); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer func() { _ = ort.DestroyEnvironment() }()
|
||||
|
||||
if err := grpc.StartServer(*addr, &SupertonicBackend{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
13
backend/go/supertonic/main_suite_test.go
Normal file
13
backend/go/supertonic/main_suite_test.go
Normal file
@@ -0,0 +1,13 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
func TestSupertonic(t *testing.T) {
|
||||
RegisterFailHandler(Fail)
|
||||
RunSpecs(t, "Supertonic backend test suite")
|
||||
}
|
||||
49
backend/go/supertonic/package.sh
Executable file
49
backend/go/supertonic/package.sh
Executable file
@@ -0,0 +1,49 @@
|
||||
#!/bin/bash
# Assembles the distributable backend under ./package: the supertonic binary,
# run.sh, the ONNX Runtime libraries and a copy of the system loader/libc
# family, plus GPU libraries when the shared packaging script is present.
set -e

# All expansions are quoted so the script also works from a checkout whose
# path contains spaces or glob characters.
CURDIR=$(dirname "$(realpath "$0")")
REPO_ROOT="${CURDIR}/../../.."

mkdir -p "$CURDIR/package/lib"

cp -avf "$CURDIR/supertonic" "$CURDIR/package/"
cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
cp -rfLv "$CURDIR"/backend-assets/lib/* "$CURDIR/package/lib/"

# copy_system_libs LOADER LIBDIR
# Copies the dynamic loader (as lib/ld.so) and the runtime libraries found in
# LIBDIR into the package, dereferencing symlinks.
copy_system_libs() {
    local loader="$1"
    local libdir="$2"
    local lib

    cp -arfLv "$loader" "$CURDIR/package/lib/ld.so"
    for lib in libc.so.6 libgcc_s.so.1 libstdc++.so.6 libm.so.6 libgomp.so.1 \
               libdl.so.2 librt.so.1 libpthread.so.0; do
        cp -arfLv "$libdir/$lib" "$CURDIR/package/lib/$lib"
    done
}

if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    echo "Detected x86_64 architecture, copying x86_64 libraries..."
    copy_system_libs /lib64/ld-linux-x86-64.so.2 /lib/x86_64-linux-gnu
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    echo "Detected ARM64 architecture, copying ARM64 libraries..."
    copy_system_libs /lib/ld-linux-aarch64.so.1 /lib/aarch64-linux-gnu
else
    echo "Error: Could not detect architecture"
    exit 1
fi

GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
if [ -f "$GPU_LIB_SCRIPT" ]; then
    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
    package_gpu_libs
fi

echo "Packaging completed successfully"
ls -liah "$CURDIR/package/"
ls -liah "$CURDIR/package/lib/"
|
||||
14
backend/go/supertonic/run.sh
Executable file
14
backend/go/supertonic/run.sh
Executable file
@@ -0,0 +1,14 @@
|
||||
#!/bin/bash
# Launcher for the packaged supertonic backend.
#
# Resolves the directory this script lives in, points the dynamic loader and
# ONNX Runtime at the bundled lib/ directory, and then replaces this shell
# with the supertonic binary, forwarding all command-line arguments.
set -ex

# Quote "$0" so an install path containing spaces resolves correctly.
CURDIR=$(dirname "$(realpath "$0")")

# Prepend the bundled libraries. The previous value is appended only when it
# is non-empty: a trailing ':' would leave an empty entry, which the dynamic
# loader interprets as the current working directory.
export LD_LIBRARY_PATH="$CURDIR/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
# Location of the bundled ONNX Runtime shared library (presumably read by the
# backend binary at startup - confirm against the Go backend).
export ONNXRUNTIME_LIB_PATH="$CURDIR/lib/libonnxruntime.so"

# Prefer the bundled dynamic loader when the package ships one, so the binary
# runs against the packaged libc rather than the host's.
if [ -f "$CURDIR/lib/ld.so" ]; then
    echo "Using lib/ld.so"
    exec "$CURDIR/lib/ld.so" "$CURDIR/supertonic" "$@"
fi

exec "$CURDIR/supertonic" "$@"
|
||||
@@ -1368,6 +1368,20 @@
|
||||
nvidia: "cuda12-sherpa-onnx"
|
||||
nvidia-cuda-12: "cuda12-sherpa-onnx"
|
||||
metal: "metal-sherpa-onnx"
|
||||
- &supertonic
|
||||
name: "supertonic"
|
||||
alias: "supertonic"
|
||||
urls:
|
||||
- https://github.com/supertone-inc/supertonic
|
||||
description: |
|
||||
Supertonic backend: lightning-fast, on-device multilingual text-to-speech via ONNX Runtime.
|
||||
Runs Supertone's flow-matching TTS model (Supertone/supertonic-3), 44.1kHz output, 31 languages,
|
||||
multiple preset voice styles. No espeak-ng dependency.
|
||||
tags:
|
||||
- text-to-speech
|
||||
- TTS
|
||||
capabilities:
|
||||
default: "cpu-supertonic"
|
||||
- !!merge <<: *neutts
|
||||
name: "neutts-development"
|
||||
capabilities:
|
||||
@@ -5132,3 +5146,18 @@
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-sherpa-onnx"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-metal-darwin-arm64-sherpa-onnx
|
||||
## supertonic
|
||||
- !!merge <<: *supertonic
|
||||
name: "supertonic-development"
|
||||
capabilities:
|
||||
default: "cpu-supertonic-development"
|
||||
- !!merge <<: *supertonic
|
||||
name: "cpu-supertonic"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-supertonic"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-cpu-supertonic
|
||||
- !!merge <<: *supertonic
|
||||
name: "cpu-supertonic-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-supertonic"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-cpu-supertonic
|
||||
|
||||
@@ -434,6 +434,13 @@ func DefaultRegistry() map[string]FieldMetaOverride {
|
||||
Component: "json-editor",
|
||||
Order: 78,
|
||||
},
|
||||
"pipeline.max_history_items": {
|
||||
Section: "pipeline",
|
||||
Label: "Max History Items",
|
||||
Description: "Cap how many trailing conversation items are fed to the LLM each realtime turn (0 = unlimited, rely on the LLM's context window). Set it on a composed pipeline (VAD+STT+LLM+TTS) so a long-running session doesn't grow until the context fills. Unset uses the per-model-type default.",
|
||||
Component: "number",
|
||||
Order: 79,
|
||||
},
|
||||
|
||||
// --- Functions ---
|
||||
"function.grammar.parallel_calls": {
|
||||
|
||||
@@ -38,6 +38,7 @@ var knownPrefOnlyBackends = []schema.KnownBackend{
|
||||
{Name: "qwen3-tts-cpp", Modality: "tts", AutoDetect: false, Description: "Qwen3 TTS C++ (preference-only)"},
|
||||
{Name: "omnivoice-cpp", Modality: "tts", AutoDetect: false, Description: "OmniVoice C++ TTS with voice cloning and voice design (preference-only)"},
|
||||
{Name: "faster-qwen3-tts", Modality: "tts", AutoDetect: false, Description: "Faster Qwen3 TTS (preference-only)"},
|
||||
{Name: "supertonic", Modality: "tts", AutoDetect: false, Description: "Supertonic multilingual ONNX TTS (preference-only)"},
|
||||
// Detection
|
||||
{Name: "sam3-cpp", Modality: "detection", AutoDetect: false, Description: "SAM3 C++ object detection (preference-only)"},
|
||||
// Audio transform (audio-in / audio-out, optional reference signal)
|
||||
|
||||
@@ -145,6 +145,7 @@ var _ = Describe("Backend Endpoints", func() {
|
||||
expectPrefOnly("qwen-tts", "tts")
|
||||
expectPrefOnly("qwen3-tts-cpp", "tts")
|
||||
expectPrefOnly("faster-qwen3-tts", "tts")
|
||||
expectPrefOnly("supertonic", "tts")
|
||||
expectPrefOnly("sam3-cpp", "detection")
|
||||
})
|
||||
|
||||
|
||||
@@ -3510,6 +3510,78 @@
|
||||
- filename: kokoro-int8-multi-lang-v1_0.tar.bz2
|
||||
sha256: 75654a84864be26f345f020f4070c2c019e96dd1b7f9bf6e2ffd59efac6aa5a3
|
||||
uri: https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-int8-multi-lang-v1_0.tar.bz2
|
||||
- name: supertonic-3
|
||||
url: github:mudler/LocalAI/gallery/supertonic.yaml@master
|
||||
urls:
|
||||
- https://github.com/supertone-inc/supertonic
|
||||
- https://huggingface.co/Supertone/supertonic-3
|
||||
description: |
|
||||
Supertonic multilingual text-to-speech (Supertone/supertonic-3), served through the native supertonic backend via ONNX Runtime. Lightning-fast on-device flow-matching TTS with 44.1 kHz output, 31 languages, and 10 preset voice styles (F1-F5, M1-M5). No espeak-ng dependency. Defaults to voice F1; override per request with the OpenAI `voice` field, and optionally pass `language=` (e.g. en, ko, ja, it; "na" for language-agnostic).
|
||||
license: mit
|
||||
icon: https://huggingface.co/Supertone/supertonic-3/resolve/main/img/Supertonic3_HeroImage.png
|
||||
tags:
|
||||
- text-to-speech
|
||||
- tts
|
||||
- multilingual
|
||||
- onnx
|
||||
- supertonic
|
||||
- flow-matching
|
||||
- multi-speaker
|
||||
last_checked: "2026-06-15"
|
||||
overrides:
|
||||
known_usecases:
|
||||
- tts
|
||||
parameters:
|
||||
model: supertonic-3/onnx/tts.json
|
||||
files:
|
||||
- filename: supertonic-3/onnx/duration_predictor.onnx
|
||||
sha256: c3eb91414d5ff8a7a239b7fe9e34e7e2bf8a8140d8375ffb14718b1c639325db
|
||||
uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/onnx/duration_predictor.onnx
|
||||
- filename: supertonic-3/onnx/text_encoder.onnx
|
||||
sha256: c7befd5ea8c3119769e8a6c1486c4edc6a3bc8365c67621c881bbb774b9902ff
|
||||
uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/onnx/text_encoder.onnx
|
||||
- filename: supertonic-3/onnx/vector_estimator.onnx
|
||||
sha256: 883ac868ea0275ef0e991524dc64f16b3c0376efd7c320af6b53f5b780d7c61c
|
||||
uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/onnx/vector_estimator.onnx
|
||||
- filename: supertonic-3/onnx/vocoder.onnx
|
||||
sha256: 085de76dd8e8d5836d6ca66826601f615939218f90e519f70ee8a36ed2a4c4ba
|
||||
uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/onnx/vocoder.onnx
|
||||
- filename: supertonic-3/onnx/tts.json
|
||||
sha256: 42078d3aef1cd43ab43021f3c54f47d2d75ceb4e75f627f118890128b06a0d09
|
||||
uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/onnx/tts.json
|
||||
- filename: supertonic-3/onnx/unicode_indexer.json
|
||||
sha256: 9bf7346e43883a81f8645c81224f786d43c5b57f3641f6e7671a7d6c493cb24f
|
||||
uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/onnx/unicode_indexer.json
|
||||
- filename: supertonic-3/voice_styles/F1.json
|
||||
sha256: bbdec6ee00231c2c742ad05483df5334cab3b52fda3ba38e6a07059c4563dbc2
|
||||
uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/F1.json
|
||||
- filename: supertonic-3/voice_styles/F2.json
|
||||
sha256: 7c722c6a72707b1a77f035d67f0d1351ba187738e06f7683e8c72b1df3477fc6
|
||||
uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/F2.json
|
||||
- filename: supertonic-3/voice_styles/F3.json
|
||||
sha256: 12f6ef2573baa2defa1128069cb59f203e3ab67c92af77b42df8a0e3a2f7c6ab
|
||||
uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/F3.json
|
||||
- filename: supertonic-3/voice_styles/F4.json
|
||||
sha256: c2fa764c1225a76dfc3e2c73e8aa4f70d9ee48793860eb34c295fff01c2e032b
|
||||
uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/F4.json
|
||||
- filename: supertonic-3/voice_styles/F5.json
|
||||
sha256: 45966e73316415626cf41a7d1c6f3b4c70dbc1ba2bee5c1978ef0ce33244fc8d
|
||||
uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/F5.json
|
||||
- filename: supertonic-3/voice_styles/M1.json
|
||||
sha256: e35604687f5d23694b8e91593a93eec0e4eca6c0b02bb8ed69139ab2ea6b0a5b
|
||||
uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/M1.json
|
||||
- filename: supertonic-3/voice_styles/M2.json
|
||||
sha256: b76cbf62bac707c710cf0ae5aba5e31eea1a6339a9734bfae33ab98499534a50
|
||||
uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/M2.json
|
||||
- filename: supertonic-3/voice_styles/M3.json
|
||||
sha256: ea1ac35ccb91b0d7ecad533a2fbd0eec10c91513d8951e3b25fbba99954e159b
|
||||
uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/M3.json
|
||||
- filename: supertonic-3/voice_styles/M4.json
|
||||
sha256: ca8eefad4fcd989c9379032ff3e50738adc547eeb5e221b82593a6d7b3bac303
|
||||
uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/M4.json
|
||||
- filename: supertonic-3/voice_styles/M5.json
|
||||
sha256: dd22b92740314321f8ae11c5e87f8dd60d060f15dd3a632b5adf77f471f77af2
|
||||
uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/M5.json
|
||||
- name: voxcpm-1.5
|
||||
url: github:mudler/LocalAI/gallery/virtual.yaml@master
|
||||
urls:
|
||||
|
||||
19
gallery/supertonic.yaml
Normal file
19
gallery/supertonic.yaml
Normal file
@@ -0,0 +1,19 @@
|
||||
---
|
||||
name: "supertonic"
|
||||
|
||||
config_file: |
|
||||
backend: supertonic
|
||||
options:
|
||||
# Generation knobs read by the supertonic backend at TTS time.
|
||||
# steps = flow-matching denoising steps (quality); speed = rate;
|
||||
# silence = inter-chunk silence seconds for long inputs.
|
||||
- supertonic.steps=8
|
||||
- supertonic.speed=1.05
|
||||
- supertonic.silence=0.3
|
||||
# Voice style used when a request omits `voice`. The model ships
|
||||
# F1-F5 / M1-M5 under voice_styles/; override per request via the
|
||||
# OpenAI `voice` field.
|
||||
- supertonic.default_voice=F1
|
||||
# Default language tag when a request omits `language`. "na" is the
|
||||
# model's language-agnostic mode.
|
||||
- supertonic.default_lang=na
|
||||
3
go.mod
3
go.mod
@@ -65,6 +65,7 @@ require (
|
||||
github.com/testcontainers/testcontainers-go/modules/nats v0.42.0
|
||||
github.com/testcontainers/testcontainers-go/modules/postgres v0.42.0
|
||||
github.com/timbutler/zxcvbn v1.0.4
|
||||
github.com/yalue/onnxruntime_go v1.11.0
|
||||
go.opentelemetry.io/otel v1.44.0
|
||||
go.opentelemetry.io/otel/exporters/prometheus v0.66.0
|
||||
go.opentelemetry.io/otel/metric v1.44.0
|
||||
@@ -497,7 +498,7 @@ require (
|
||||
golang.org/x/sync v0.20.0
|
||||
golang.org/x/sys v0.45.0 // indirect
|
||||
golang.org/x/term v0.43.0
|
||||
golang.org/x/text v0.37.0 // indirect
|
||||
golang.org/x/text v0.37.0
|
||||
golang.org/x/tools v0.45.0 // indirect
|
||||
golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 // indirect
|
||||
golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb // indirect
|
||||
|
||||
2
go.sum
2
go.sum
@@ -1377,6 +1377,8 @@ github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavM
|
||||
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM=
|
||||
github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU=
|
||||
github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E=
|
||||
github.com/yalue/onnxruntime_go v1.11.0 h1:aKH4yPIbqfcB3SfnQWq/WxzLelkyolntHnffL3eMBHY=
|
||||
github.com/yalue/onnxruntime_go v1.11.0/go.mod h1:b4X26A8pekNb1ACJ58wAXgNKeUCGEAQ9dmACut9Sm/4=
|
||||
github.com/yosida95/uritemplate/v3 v3.0.2 h1:Ed3Oyj9yrmi9087+NczuL5BwkIc4wvTb5zIM+UJPGz4=
|
||||
github.com/yosida95/uritemplate/v3 v3.0.2/go.mod h1:ILOh0sOhIJR3+L/8afwt/kE++YT040gmv5BQTMR2HP4=
|
||||
github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
|
||||
|
||||
Reference in New Issue
Block a user