mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-14 11:49:33 -04:00
* feat(omnivoice-cpp): add C wrapper + CMake/Makefile build over OmniVoice ov_* ABI Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(omnivoice-cpp): add option/language parsing + WAV framing helpers with tests Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(omnivoice-cpp): wire purego binding with TTS + streaming TTSStream Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * build(omnivoice-cpp): wire backend into root Makefile Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * ci(omnivoice-cpp): add build matrix entries + dep-bump registration Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(omnivoice-cpp): register backend meta + image entries Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(omnivoice-cpp): expose as preference-only importable backend Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(gallery): add omnivoice-cpp TTS models (Q8_0 default + BF16 HQ) Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * docs(omnivoice-cpp): document the OmniVoice TTS backend Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * test(omnivoice-cpp): add env-gated e2e for TTS + streaming Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(omnivoice-cpp): honor tts.audio_path/tts.voice config as default cloning reference The model config tts.audio_path (ModelOptions.AudioPath) and tts.voice now provide a default voice-cloning reference used when a request omits Voice, so a cloned voice can be pinned in the model YAML instead of passed per request. A per-request voice still overrides. Paths resolve relative to the model dir. Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(omnivoice-cpp): add missing omnivoice-cpp-development backend meta Mirrors the whisper/vibevoice convention: a -development meta aggregating the master-tagged image variants (the production meta and per-variant prod+dev image entries already existed; only the development meta aggregator was missing). Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
130 lines
3.6 KiB
Go
130 lines
3.6 KiB
Go
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/binary"
|
|
"fmt"
|
|
"os"
|
|
"runtime"
|
|
|
|
"github.com/go-audio/audio"
|
|
"github.com/go-audio/wav"
|
|
)
|
|
|
|
const omnivoiceSampleRate = 24000
|
|
|
|
// wavHeader24k returns a 44-byte WAV header for a streaming 24 kHz mono 16-bit
|
|
// PCM stream, with placeholder (0xFFFFFFFF) sizes since the total length is
|
|
// unknown up front. Emitted as the first chunk of TTSStream so the HTTP layer
|
|
// receives a self-describing WAV (the gRPC TTSStream path never sets Message,
|
|
// so the backend owns the header - see core/backend/tts.go:ModelTTSStream).
|
|
func wavHeader24k() []byte {
|
|
var buf bytes.Buffer
|
|
w := func(v any) { _ = binary.Write(&buf, binary.LittleEndian, v) }
|
|
buf.WriteString("RIFF")
|
|
w(uint32(0xFFFFFFFF))
|
|
buf.WriteString("WAVE")
|
|
buf.WriteString("fmt ")
|
|
w(uint32(16)) // Subchunk1Size
|
|
w(uint16(1)) // PCM
|
|
w(uint16(1)) // mono
|
|
w(uint32(omnivoiceSampleRate)) // sample rate
|
|
w(uint32(omnivoiceSampleRate * 2)) // byte rate = SR * blockAlign
|
|
w(uint16(2)) // block align (16-bit mono)
|
|
w(uint16(16)) // bits per sample
|
|
buf.WriteString("data")
|
|
w(uint32(0xFFFFFFFF))
|
|
return buf.Bytes()
|
|
}
|
|
|
|
// floatToPCM16LE clamps each sample to [-1,1] and encodes it as little-endian
|
|
// signed 16-bit PCM.
|
|
func floatToPCM16LE(samples []float32) []byte {
|
|
out := make([]byte, len(samples)*2)
|
|
for i, s := range samples {
|
|
if s > 1 {
|
|
s = 1
|
|
} else if s < -1 {
|
|
s = -1
|
|
}
|
|
v := int16(s * 32767)
|
|
out[i*2] = byte(v)
|
|
out[i*2+1] = byte(v >> 8)
|
|
}
|
|
return out
|
|
}
|
|
|
|
// writeWAV24k writes samples as a finalized 24 kHz mono 16-bit WAV at dst.
|
|
func writeWAV24k(dst string, samples []float32) error {
|
|
f, err := os.Create(dst)
|
|
if err != nil {
|
|
return fmt.Errorf("omnivoice: create %q: %w", dst, err)
|
|
}
|
|
enc := wav.NewEncoder(f, omnivoiceSampleRate, 16, 1, 1)
|
|
ints := make([]int, len(samples))
|
|
for i, s := range samples {
|
|
if s > 1 {
|
|
s = 1
|
|
} else if s < -1 {
|
|
s = -1
|
|
}
|
|
ints[i] = int(s * 32767)
|
|
}
|
|
b := &audio.IntBuffer{
|
|
Format: &audio.Format{NumChannels: 1, SampleRate: omnivoiceSampleRate},
|
|
Data: ints,
|
|
SourceBitDepth: 16,
|
|
}
|
|
if err := enc.Write(b); err != nil {
|
|
_ = enc.Close()
|
|
_ = f.Close()
|
|
return fmt.Errorf("omnivoice: encode WAV: %w", err)
|
|
}
|
|
if err := enc.Close(); err != nil {
|
|
_ = f.Close()
|
|
return fmt.Errorf("omnivoice: finalize WAV: %w", err)
|
|
}
|
|
return f.Close()
|
|
}
|
|
|
|
// readWAVAsFloat decodes a WAV file (any sample rate/channels) to a mono
|
|
// float32 slice in [-1,1] for use as reference audio. OmniVoice expects 24 kHz;
|
|
// callers should supply 24 kHz reference clips.
|
|
func readWAVAsFloat(path string) ([]float32, error) {
|
|
f, err := os.Open(path)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("omnivoice: open ref %q: %w", path, err)
|
|
}
|
|
defer func() { _ = f.Close() }()
|
|
|
|
dec := wav.NewDecoder(f)
|
|
buf, err := dec.FullPCMBuffer()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("omnivoice: decode ref %q: %w", path, err)
|
|
}
|
|
ch := int(buf.Format.NumChannels)
|
|
if ch < 1 {
|
|
ch = 1
|
|
}
|
|
bitDepth := int(buf.SourceBitDepth)
|
|
if bitDepth == 0 {
|
|
bitDepth = 16
|
|
}
|
|
scale := float32(int64(1) << uint(bitDepth-1))
|
|
n := len(buf.Data) / ch
|
|
out := make([]float32, n)
|
|
for i := 0; i < n; i++ {
|
|
// Downmix to mono by averaging channels.
|
|
var acc int
|
|
for c := 0; c < ch; c++ {
|
|
acc += buf.Data[i*ch+c]
|
|
}
|
|
out[i] = float32(acc) / float32(ch) / scale
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
// runtimeKeepAlive prevents the GC from reclaiming the reference-audio slice
|
|
// while its backing pointer is in use across the C call.
|
|
func runtimeKeepAlive(v any) { runtime.KeepAlive(v) }
|