Files
LocalAI/backend/go/supertonic/backend.go
LocalAI [bot] 2df2876db2 feat(supertonic): add Supertonic ONNX TTS backend (CPU) (#10342)
* feat(supertonic): vendor upstream Go TTS pipeline (helper.go)

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(supertonic): add gRPC backend (Load/TTS/TTSStream, CPU)

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(supertonic): satisfy unused linter (use onnxProvider; exclude vendored helper.go)

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* test(supertonic): unit tests for resolvers + gated end-to-end synthesis

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* style(supertonic): gofmt backend.go comment block

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(supertonic): add Makefile, run.sh, package.sh (CPU build)

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* build(supertonic): wire backend into root Makefile

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(supertonic): check ort.DestroyEnvironment return (errcheck)

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(supertonic): resolve voice_styles as sibling of onnx dir; guard trim; test voice

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(supertonic): add CPU build matrix + gallery index entries

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(supertonic): expose as pref-only importable backend

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(supertonic): add Supertonic/supertonic-3 TTS model to the gallery

16 files (4 onnx + tts.json + unicode_indexer.json + 10 voice styles)
from HF Supertone/supertonic-3, served via the supertonic backend.
Defaults to voice F1; onnx/ + sibling voice_styles/ layout matches the
backend's resolveVoicesDir.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(meta): register pipeline.max_history_items config field

Pre-existing on master: the field was added without a registry entry,
failing TestAllFieldsHaveRegistryEntries (core/config/meta). Add the
entry so it renders properly in the model-config UI.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* ci(secscan): exclude vendored supertonic backend from gosec

helper.go is vendored from supertone-inc/supertonic; its G304/G404/G104
findings are inherent to upstream and the math/rand use is correct for
flow-matching noise (crypto/rand would be wrong).

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-15 16:54:11 +02:00

308 lines
8.0 KiB
Go

package main
import (
"bytes"
"encoding/binary"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
laudio "github.com/mudler/LocalAI/pkg/audio"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
// onnxProvider is set via -ldflags "-X main.onnxProvider=cuda" by the
// CUDA build (later phase). Defaults to CPU.
var onnxProvider = "cpu"
// Per-model generation defaults, overridable via ModelOptions.Options:
//
// supertonic.steps=<int> denoising steps (quality), default 8
// supertonic.speed=<float> speech rate, default 1.05
// supertonic.silence=<float> inter-chunk silence seconds, default 0.3
// supertonic.default_voice=<name> voice-style used when request omits voice
// supertonic.default_lang=<lang> language tag used when request omits it
const (
optionSteps = "supertonic.steps="
optionSpeed = "supertonic.speed="
optionSilence = "supertonic.silence="
optionDefaultVoice = "supertonic.default_voice="
optionDefaultLang = "supertonic.default_lang="
)
type SupertonicBackend struct {
base.SingleThread
tts *TextToSpeech
cfg Config
modelDir string
voicesDir string
defaultVoice string
defaultLang string
steps int
speed float32
silence float32
styleMu sync.Mutex
styles map[string]*Style // voice name -> loaded style cache
}
func (s *SupertonicBackend) Load(opts *pb.ModelOptions) error {
modelDir, err := resolveModelDir(opts.ModelFile)
if err != nil {
return err
}
s.modelDir = modelDir
s.voicesDir = resolveVoicesDir(modelDir)
cfg, err := LoadCfgs(modelDir)
if err != nil {
return fmt.Errorf("loading tts.json from %s: %w", modelDir, err)
}
s.cfg = cfg
// onnxProvider is "cpu" for the CPU build; the CUDA build sets it to
// "cuda" via -ldflags. Upstream LoadTextToSpeech still errors on GPU
// until the CUDA phase wires the execution provider.
tts, err := LoadTextToSpeech(modelDir, onnxProvider == "cuda", cfg)
if err != nil {
return fmt.Errorf("loading supertonic models from %s: %w", modelDir, err)
}
s.tts = tts
s.steps = int(findOptionInt(opts, optionSteps, 8))
s.speed = findOptionFloat(opts, optionSpeed, 1.05)
s.silence = findOptionFloat(opts, optionSilence, 0.3)
s.defaultVoice = findOptionValue(opts, optionDefaultVoice, "")
s.defaultLang = findOptionValue(opts, optionDefaultLang, "na")
s.styles = map[string]*Style{}
return nil
}
func (s *SupertonicBackend) TTS(req *pb.TTSRequest) error {
wav, sr, err := s.synthesize(req)
if err != nil {
return err
}
out := make([]float64, len(wav))
for i, v := range wav {
out[i] = float64(v)
}
if err := writeWavFile(req.Dst, out, sr); err != nil {
return fmt.Errorf("writing wav to %s: %w", req.Dst, err)
}
return nil
}
func (s *SupertonicBackend) TTSStream(req *pb.TTSRequest, results chan []byte) error {
defer close(results)
wav, sr, err := s.synthesize(req)
if err != nil {
return err
}
results <- streamingWAVHeader(uint32(sr))
const chunkSamples = 4096
for off := 0; off < len(wav); off += chunkSamples {
end := off + chunkSamples
if end > len(wav) {
end = len(wav)
}
results <- pcmFloatToInt16LE(wav[off:end])
}
return nil
}
// synthesize runs the full pipeline and returns the trimmed mono float32
// PCM and its sample rate.
func (s *SupertonicBackend) synthesize(req *pb.TTSRequest) ([]float32, int, error) {
if s.tts == nil {
return nil, 0, fmt.Errorf("supertonic model not loaded")
}
if strings.TrimSpace(req.Text) == "" {
return nil, 0, fmt.Errorf("empty text")
}
style, err := s.loadStyle(s.voiceName(req.Voice))
if err != nil {
return nil, 0, err
}
lang := s.resolveLang("")
if req.Language != nil {
lang = s.resolveLang(*req.Language)
}
wav, dur, err := s.tts.Call(req.Text, lang, style, s.steps, s.speed, s.silence)
if err != nil {
return nil, 0, err
}
sr := s.tts.SampleRate
// Call returns concatenated audio; trim to the reported duration.
wavLen := int(float32(sr) * dur)
if wavLen < 0 {
wavLen = 0
}
if wavLen > len(wav) {
wavLen = len(wav)
}
return wav[:wavLen], sr, nil
}
// voiceName picks the request voice, falling back to the model default.
func (s *SupertonicBackend) voiceName(reqVoice string) string {
v := strings.TrimSpace(reqVoice)
if v == "" {
return s.defaultVoice
}
return v
}
// resolveLang validates against AvailableLangs, falling back to the model
// default (then "na").
func (s *SupertonicBackend) resolveLang(reqLang string) string {
l := strings.TrimSpace(reqLang)
if l != "" && isValidLang(l) {
return l
}
if s.defaultLang != "" && isValidLang(s.defaultLang) {
return s.defaultLang
}
return "na"
}
// loadStyle resolves and caches a voice-style. An empty name with no model
// default is an error (supertonic requires a style embedding).
func (s *SupertonicBackend) loadStyle(name string) (*Style, error) {
if name == "" {
return nil, fmt.Errorf("no voice specified and no supertonic.default_voice set")
}
s.styleMu.Lock()
defer s.styleMu.Unlock()
if st, ok := s.styles[name]; ok {
return st, nil
}
path := s.voiceStylePath(name)
st, err := LoadVoiceStyle([]string{path}, false)
if err != nil {
return nil, fmt.Errorf("loading voice style %q (%s): %w", name, path, err)
}
s.styles[name] = st
return st, nil
}
// voiceStylePath maps a voice name to a JSON path. Absolute paths are honored;
// names containing a separator resolve under modelDir; bare names resolve under
// the resolved voicesDir (see resolveVoicesDir).
func (s *SupertonicBackend) voiceStylePath(name string) string {
if !strings.HasSuffix(name, ".json") {
name += ".json"
}
if filepath.IsAbs(name) {
return name
}
if strings.ContainsRune(name, filepath.Separator) {
return filepath.Join(s.modelDir, name)
}
return filepath.Join(s.voicesDir, name)
}
// resolveVoicesDir locates the voice_styles directory. The HF model layout
// puts the ONNX files in an onnx/ subdir with voice_styles/ as its sibling,
// so check modelDir/voice_styles first, then the parent's voice_styles.
func resolveVoicesDir(modelDir string) string {
candidates := []string{
filepath.Join(modelDir, "voice_styles"),
filepath.Join(filepath.Dir(modelDir), "voice_styles"),
}
for _, c := range candidates {
if info, err := os.Stat(c); err == nil && info.IsDir() {
return c
}
}
return candidates[0]
}
// resolveModelDir accepts either a directory (used as-is) or a file (its
// parent dir is used).
func resolveModelDir(modelFile string) (string, error) {
if modelFile == "" {
return "", fmt.Errorf("empty model path")
}
info, err := os.Stat(modelFile)
if err != nil {
return "", fmt.Errorf("stat model path %s: %w", modelFile, err)
}
if info.IsDir() {
return modelFile, nil
}
return filepath.Dir(modelFile), nil
}
// ---- option helpers (mirrors backend/go/sherpa-onnx/backend.go) ----
func findOptionValue(opts *pb.ModelOptions, prefix, def string) string {
for _, o := range opts.Options {
if strings.HasPrefix(o, prefix) {
return strings.TrimPrefix(o, prefix)
}
}
return def
}
func findOptionFloat(opts *pb.ModelOptions, prefix string, def float32) float32 {
raw := findOptionValue(opts, prefix, "")
if raw == "" {
return def
}
v, err := strconv.ParseFloat(raw, 32)
if err != nil {
return def
}
return float32(v)
}
func findOptionInt(opts *pb.ModelOptions, prefix string, def int32) int32 {
raw := findOptionValue(opts, prefix, "")
if raw == "" {
return def
}
v, err := strconv.ParseInt(raw, 10, 32)
if err != nil {
return def
}
return int32(v)
}
// ---- PCM helpers ----
func pcmFloatToInt16LE(samples []float32) []byte {
buf := make([]byte, len(samples)*2)
for i, f := range samples {
v := int32(f * 32767)
if v > 32767 {
v = 32767
} else if v < -32768 {
v = -32768
}
binary.LittleEndian.PutUint16(buf[2*i:], uint16(int16(v)))
}
return buf
}
func streamingWAVHeader(sampleRate uint32) []byte {
const streamingSize = 0xFFFFFFFF
h := laudio.NewWAVHeaderWithRate(streamingSize, sampleRate)
h.ChunkSize = streamingSize
var buf bytes.Buffer
_ = h.Write(&buf)
return buf.Bytes()
}