mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-14 11:49:33 -04:00
* feat(omnivoice-cpp): add C wrapper + CMake/Makefile build over OmniVoice ov_* ABI Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(omnivoice-cpp): add option/language parsing + WAV framing helpers with tests Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(omnivoice-cpp): wire purego binding with TTS + streaming TTSStream Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * build(omnivoice-cpp): wire backend into root Makefile Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * ci(omnivoice-cpp): add build matrix entries + dep-bump registration Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(omnivoice-cpp): register backend meta + image entries Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(omnivoice-cpp): expose as preference-only importable backend Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(gallery): add omnivoice-cpp TTS models (Q8_0 default + BF16 HQ) Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * docs(omnivoice-cpp): document the OmniVoice TTS backend Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * test(omnivoice-cpp): add env-gated e2e for TTS + streaming Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(omnivoice-cpp): honor tts.audio_path/tts.voice config as default cloning reference The model config tts.audio_path (ModelOptions.AudioPath) and tts.voice now provide a default voice-cloning reference used when a request omits Voice, so a cloned voice can be pinned 
in the model YAML instead of passed per request. A per-request voice still overrides. Paths resolve relative to the model dir. Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(omnivoice-cpp): add missing omnivoice-cpp-development backend meta Mirrors the whisper/vibevoice convention: a -development meta aggregating the master-tagged image variants (the production meta and per-variant prod+dev image entries already existed; only the development meta aggregator was missing). Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
247 lines
6.8 KiB
Go
247 lines
6.8 KiB
Go
package main
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"sync"
|
|
"unsafe"
|
|
|
|
"github.com/ebitengine/purego"
|
|
"github.com/mudler/LocalAI/pkg/grpc/base"
|
|
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
|
)
|
|
|
|
// Native entry points of the OmniVoice C wrapper. Each variable holds a Go
// function value matching one exported C symbol; they are nil until bound.
// NOTE(review): the binding code is not part of this section — confirm they
// are always registered before Load/TTS/TTSStream run.
var (
	// CppLoad mirrors
	//   omni_load(model_path, codec_path, use_fa, clamp_fp16) int
	// Load treats any non-zero return code as a failure.
	CppLoad func(modelPath string, codecPath string, useFA int, clampFP16 int) int

	// CppTTS mirrors
	//   omni_tts(text, lang, instruct, ref_samples, ref_n, ref_text, seed, denoise, out_n) -> float*
	// The returned uintptr addresses a C-allocated PCM buffer whose sample
	// count is written through outN; callers release it with CppPCMFree.
	CppTTS func(
		text string, lang string, instruct string,
		refSamples unsafe.Pointer, refN int,
		refText string, seed int64, denoise int,
		outN unsafe.Pointer,
	) uintptr

	// CppTTSStream mirrors
	//   omni_tts_stream(text, lang, instruct, ref_samples, ref_n, ref_text, seed, denoise, cb, user) int
	// cb is the callback pointer invoked per PCM chunk; TTSStream treats a
	// non-zero return code as a failure.
	CppTTSStream func(
		text string, lang string, instruct string,
		refSamples unsafe.Pointer, refN int,
		refText string, seed int64, denoise int,
		cb uintptr, user uintptr,
	) int

	// CppPCMFree releases a PCM buffer previously returned by CppTTS.
	CppPCMFree func(ptr uintptr)

	// CppUnload takes no arguments and returns nothing.
	// NOTE(review): presumably tears down the loaded model — confirm against
	// the C wrapper; it is not called anywhere in this section.
	CppUnload func()
)
|
|
|
|
type OmnivoiceCpp struct {
|
|
base.SingleThread
|
|
opts loadOptions
|
|
// audioPath is the model-config reference voice (tts.audio_path), used as
|
|
// the default voice-cloning reference when a request does not set Voice.
|
|
audioPath string
|
|
}
|
|
|
|
func (o *OmnivoiceCpp) Load(opts *pb.ModelOptions) error {
|
|
model := opts.ModelFile
|
|
if model == "" {
|
|
model = opts.ModelPath
|
|
}
|
|
if !filepath.IsAbs(model) && opts.ModelPath != "" {
|
|
model = filepath.Join(opts.ModelPath, model)
|
|
}
|
|
|
|
o.opts = parseOptions(opts.Options)
|
|
|
|
// Resolve the codec/tokenizer GGUF: explicit option, else auto-discover a
|
|
// *tokenizer*.gguf sibling of the base model.
|
|
codec := o.opts.codecPath
|
|
if codec != "" && !filepath.IsAbs(codec) {
|
|
codec = filepath.Join(filepath.Dir(model), codec)
|
|
}
|
|
if codec == "" {
|
|
codec = discoverTokenizer(filepath.Dir(model))
|
|
}
|
|
if codec == "" {
|
|
return fmt.Errorf("omnivoice: no codec/tokenizer GGUF found; set option 'tokenizer:<file>'")
|
|
}
|
|
o.opts.codecPath = codec
|
|
|
|
// tts.audio_path (ModelOptions.AudioPath) is the config-level voice-cloning
|
|
// reference: a default reference WAV used when a request omits Voice.
|
|
// Resolved relative to the model directory like the codec.
|
|
o.audioPath = opts.AudioPath
|
|
if o.audioPath != "" && !filepath.IsAbs(o.audioPath) {
|
|
o.audioPath = filepath.Join(filepath.Dir(model), o.audioPath)
|
|
}
|
|
|
|
useFA := boolToInt(o.opts.useFA)
|
|
clamp := boolToInt(o.opts.clampFP16)
|
|
|
|
fmt.Fprintf(os.Stderr, "[omnivoice-cpp] Load model=%s codec=%s use_fa=%d clamp_fp16=%d\n",
|
|
model, codec, useFA, clamp)
|
|
|
|
if rc := CppLoad(model, codec, useFA, clamp); rc != 0 {
|
|
return fmt.Errorf("omnivoice: failed to load model (rc=%d)", rc)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// discoverTokenizer returns the path of the first non-directory entry in dir
// whose name contains "tokenizer" and ends in ".gguf" (both matched
// case-insensitively), or "" when there is no such entry or dir cannot be
// read.
//
// os.ReadDir yields entries sorted by filename, so the choice is
// deterministic when several files match.
func discoverTokenizer(dir string) string {
	entries, err := os.ReadDir(dir)
	if err != nil {
		// Best effort: an unreadable directory simply means nothing was found.
		return ""
	}
	for _, e := range entries {
		// A directory that happens to match the pattern is not a loadable
		// codec and must not shadow a real file that sorts after it.
		if e.IsDir() {
			continue
		}
		name := strings.ToLower(e.Name())
		if strings.Contains(name, "tokenizer") && strings.HasSuffix(name, ".gguf") {
			return filepath.Join(dir, e.Name())
		}
	}
	return ""
}
|
|
|
|
// boolToInt converts a Go bool into the 0/1 integer flag form that is passed
// to the native entry points: true becomes 1, false becomes 0.
func boolToInt(b bool) int {
	var flag int
	if b {
		flag = 1
	}
	return flag
}
|
|
|
|
// refAudio loads the reference WAV (voice cloning) if voice points to a file.
|
|
// Returns nil if no cloning (empty or non-path - voice design uses Instructions).
|
|
func (o *OmnivoiceCpp) refAudio(voice string) ([]float32, error) {
|
|
v := strings.TrimSpace(voice)
|
|
if v == "" {
|
|
return nil, nil
|
|
}
|
|
if _, err := os.Stat(v); err != nil {
|
|
return nil, nil
|
|
}
|
|
return readWAVAsFloat(v)
|
|
}
|
|
|
|
// refAudioFor resolves the cloning reference for a request: the per-request
|
|
// Voice takes precedence, falling back to the model-config audio_path. Empty
|
|
// result means no cloning (voice design via Instructions still applies).
|
|
func (o *OmnivoiceCpp) refAudioFor(req *pb.TTSRequest) ([]float32, error) {
|
|
voice := strings.TrimSpace(req.Voice)
|
|
if voice == "" {
|
|
voice = o.audioPath
|
|
}
|
|
return o.refAudio(voice)
|
|
}
|
|
|
|
func reqParam(req *pb.TTSRequest, key string) string {
|
|
if req.Params == nil {
|
|
return ""
|
|
}
|
|
return req.Params[key]
|
|
}
|
|
|
|
func (o *OmnivoiceCpp) seedFor(req *pb.TTSRequest) int64 {
|
|
if s := reqParam(req, "seed"); s != "" {
|
|
var n int64
|
|
if _, err := fmt.Sscan(s, &n); err == nil {
|
|
return n
|
|
}
|
|
}
|
|
return o.opts.seed
|
|
}
|
|
|
|
// optStr dereferences an optional string, mapping a nil pointer to "".
func optStr(p *string) string {
	if p != nil {
		return *p
	}
	return ""
}
|
|
|
|
func (o *OmnivoiceCpp) TTS(req *pb.TTSRequest) error {
|
|
if req.Dst == "" {
|
|
return fmt.Errorf("omnivoice: TTS requires a destination path")
|
|
}
|
|
lang := normalizeLanguage(optStr(req.Language))
|
|
instruct := optStr(req.Instructions)
|
|
refText := reqParam(req, "ref_text")
|
|
seed := o.seedFor(req)
|
|
|
|
ref, err := o.refAudioFor(req)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
var refPtr unsafe.Pointer
|
|
if len(ref) > 0 {
|
|
refPtr = unsafe.Pointer(&ref[0])
|
|
}
|
|
|
|
var n int32
|
|
ptr := CppTTS(req.Text, lang, instruct, refPtr, len(ref), refText, seed,
|
|
boolToInt(o.opts.denoise), unsafe.Pointer(&n))
|
|
runtimeKeepAlive(ref)
|
|
if ptr == 0 || n <= 0 {
|
|
return fmt.Errorf("omnivoice: synthesis failed")
|
|
}
|
|
defer CppPCMFree(ptr)
|
|
src := unsafe.Slice((*float32)(unsafe.Pointer(ptr)), int(n)) //nolint:govet // C-allocated PCM, copied out before free
|
|
out := make([]float32, int(n))
|
|
copy(out, src)
|
|
return writeWAV24k(req.Dst, out)
|
|
}
|
|
|
|
// Shared streaming state. A single C callback serves every TTSStream call and
// finds its destination channel through these package-level variables.
// base.SingleThread serializes TTS/TTSStream, so one global slot is enough,
// and it avoids allocating a purego callback per request (purego callbacks
// cannot be freed and their number is capped).
var (
	// streamMu is held by TTSStream for the whole native streaming call.
	streamMu sync.Mutex
	// streamChan is the channel of the active stream; nil when idle.
	streamChan chan []byte
	// streamCbOnce guards the one-time creation of the callback.
	streamCbOnce sync.Once
	// streamCbPtr is the callback pointer passed to CppTTSStream.
	streamCbPtr uintptr
)
|
|
|
|
// streamCallback is registered once and forwards each PCM chunk to streamChan.
|
|
func streamCallback(samples *float32, nSamples int32, _ uintptr) uintptr {
|
|
if nSamples <= 0 || samples == nil || streamChan == nil {
|
|
return 1 // continue
|
|
}
|
|
src := unsafe.Slice(samples, int(nSamples))
|
|
cp := make([]float32, int(nSamples)) // copy out of C memory before returning
|
|
copy(cp, src)
|
|
streamChan <- floatToPCM16LE(cp)
|
|
return 1 // continue
|
|
}
|
|
|
|
func (o *OmnivoiceCpp) TTSStream(req *pb.TTSRequest, results chan []byte) error {
|
|
defer close(results)
|
|
if req.Text == "" {
|
|
return fmt.Errorf("omnivoice: TTSStream requires text")
|
|
}
|
|
|
|
streamCbOnce.Do(func() {
|
|
streamCbPtr = purego.NewCallback(streamCallback)
|
|
})
|
|
|
|
lang := normalizeLanguage(optStr(req.Language))
|
|
instruct := optStr(req.Instructions)
|
|
refText := reqParam(req, "ref_text")
|
|
seed := o.seedFor(req)
|
|
|
|
ref, err := o.refAudioFor(req)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
var refPtr unsafe.Pointer
|
|
if len(ref) > 0 {
|
|
refPtr = unsafe.Pointer(&ref[0])
|
|
}
|
|
|
|
// Emit the WAV header first so the HTTP layer gets a self-describing stream.
|
|
results <- wavHeader24k()
|
|
|
|
streamMu.Lock()
|
|
streamChan = results
|
|
rc := CppTTSStream(req.Text, lang, instruct, refPtr, len(ref), refText, seed,
|
|
boolToInt(o.opts.denoise), streamCbPtr, 0)
|
|
streamChan = nil
|
|
streamMu.Unlock()
|
|
runtimeKeepAlive(ref)
|
|
|
|
if rc != 0 {
|
|
return fmt.Errorf("omnivoice: streaming synthesis failed (rc=%d)", rc)
|
|
}
|
|
return nil
|
|
}
|