mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-14 11:49:33 -04:00
* feat(qwen3-tts-cpp): repoint upstream to ServeurpersoCom/qwentts.cpp Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * feat(qwen3-tts-cpp): flatten qt_* ABI into qt3_* purego shim Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * feat(qwen3-tts-cpp): build shim against upstream qwen-core static lib Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * feat(qwen3-tts-cpp): add option/language/voice/sampling parsing Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * feat(qwen3-tts-cpp): add 24kHz WAV encode/decode/stream-header helpers Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * feat(qwen3-tts-cpp): purego backend with streaming, speakers, voice design Map TTSRequest onto qwentts.cpp: instructions->instruct, voice->named speaker or clone-reference path, params map->ref_text + sampling. Add TTSStream over the qt chunk callback. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * test(qwen3-tts-cpp): unit specs + build-gated TTS/TTSStream e2e Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * fix(qwen3-tts-cpp): close defensive PCM-free gap on zero-sample result Register CppPCMFree before the n<=0 guard so a non-null buffer with zero samples cannot leak (the C contract returns NULL on failure, so this is defensive). Raised in code review. 
Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * feat(qwen3-tts-cpp): advertise TTSStream capability Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * chore(qwen3-tts-cpp): update backend index metadata for qwentts.cpp Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * feat(gallery): qwentts.cpp models - base/customvoice/voicedesign, Q8_0 & Q4_K_M Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * docs(qwen3-tts-cpp): release note for qwentts.cpp migration Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * test(qwen3-tts-cpp): cover audio_path voice-cloning fallback Add resolveRequest unit specs (config audio_path used as the clone reference when Voice is empty; per-request audio Voice overrides it; a named-speaker Voice does not trigger cloning) plus a real-inference e2e that clones from audio_path (confirmed ref_spk_emb=yes in the pipeline). Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * chore(qwen3-tts-cpp): drop the release-note doc Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
129 lines
3.5 KiB
Go
129 lines
3.5 KiB
Go
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/binary"
|
|
"fmt"
|
|
"os"
|
|
"runtime"
|
|
|
|
"github.com/go-audio/audio"
|
|
"github.com/go-audio/wav"
|
|
)
|
|
|
|
// qwen3ttsSampleRate is the sample rate, in Hz, that wavHeader24k writes into
// the streaming header and that writeWAV24k passes to the WAV encoder.
const qwen3ttsSampleRate = 24000
|
|
|
|
// wavHeader24k returns a 44-byte WAV header for a streaming 24 kHz mono 16-bit
|
|
// PCM stream, with placeholder (0xFFFFFFFF) sizes since the total length is
|
|
// unknown up front. Emitted as the first chunk of TTSStream so the HTTP layer
|
|
// receives a self-describing WAV (the gRPC TTSStream path never sets Message,
|
|
// so the backend owns the header - see core/backend/tts.go:ModelTTSStream).
|
|
func wavHeader24k() []byte {
|
|
var buf bytes.Buffer
|
|
w := func(v any) { _ = binary.Write(&buf, binary.LittleEndian, v) }
|
|
buf.WriteString("RIFF")
|
|
w(uint32(0xFFFFFFFF))
|
|
buf.WriteString("WAVE")
|
|
buf.WriteString("fmt ")
|
|
w(uint32(16)) // Subchunk1Size
|
|
w(uint16(1)) // PCM
|
|
w(uint16(1)) // mono
|
|
w(uint32(qwen3ttsSampleRate)) // sample rate
|
|
w(uint32(qwen3ttsSampleRate * 2)) // byte rate = SR * blockAlign
|
|
w(uint16(2)) // block align (16-bit mono)
|
|
w(uint16(16)) // bits per sample
|
|
buf.WriteString("data")
|
|
w(uint32(0xFFFFFFFF))
|
|
return buf.Bytes()
|
|
}
|
|
|
|
// floatToPCM16LE encodes samples as little-endian signed 16-bit PCM, two bytes
// per sample, in input order.
//
// Each sample is clamped to [-1,1] and scaled by 32767, truncating toward zero.
// NaN samples are emitted as silence (0): NaN passes both clamp comparisons,
// and converting NaN to an integer is implementation-defined in Go, so it is
// handled explicitly to keep the output deterministic across platforms.
func floatToPCM16LE(samples []float32) []byte {
	out := make([]byte, 2*len(samples))
	for i, s := range samples {
		switch {
		case s != s: // true only for NaN
			s = 0
		case s > 1:
			s = 1
		case s < -1:
			s = -1
		}
		binary.LittleEndian.PutUint16(out[2*i:], uint16(int16(s*32767)))
	}
	return out
}
|
|
|
|
// writeWAV24k writes samples as a finalized 24 kHz mono 16-bit WAV at dst.
|
|
func writeWAV24k(dst string, samples []float32) error {
|
|
f, err := os.Create(dst)
|
|
if err != nil {
|
|
return fmt.Errorf("qwen3-tts: create %q: %w", dst, err)
|
|
}
|
|
enc := wav.NewEncoder(f, qwen3ttsSampleRate, 16, 1, 1)
|
|
ints := make([]int, len(samples))
|
|
for i, s := range samples {
|
|
if s > 1 {
|
|
s = 1
|
|
} else if s < -1 {
|
|
s = -1
|
|
}
|
|
ints[i] = int(s * 32767)
|
|
}
|
|
b := &audio.IntBuffer{
|
|
Format: &audio.Format{NumChannels: 1, SampleRate: qwen3ttsSampleRate},
|
|
Data: ints,
|
|
SourceBitDepth: 16,
|
|
}
|
|
if err := enc.Write(b); err != nil {
|
|
_ = enc.Close()
|
|
_ = f.Close()
|
|
return fmt.Errorf("qwen3-tts: encode WAV: %w", err)
|
|
}
|
|
if err := enc.Close(); err != nil {
|
|
_ = f.Close()
|
|
return fmt.Errorf("qwen3-tts: finalize WAV: %w", err)
|
|
}
|
|
return f.Close()
|
|
}
|
|
|
|
// readWAVAsFloat decodes a WAV file (any sample rate/channels) to a mono
|
|
// float32 slice in [-1,1] for use as cloning reference audio. qwentts expects
|
|
// 24 kHz; callers should supply 24 kHz reference clips.
|
|
func readWAVAsFloat(path string) ([]float32, error) {
|
|
f, err := os.Open(path)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("qwen3-tts: open ref %q: %w", path, err)
|
|
}
|
|
defer func() { _ = f.Close() }()
|
|
|
|
dec := wav.NewDecoder(f)
|
|
buf, err := dec.FullPCMBuffer()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("qwen3-tts: decode ref %q: %w", path, err)
|
|
}
|
|
ch := int(buf.Format.NumChannels)
|
|
if ch < 1 {
|
|
ch = 1
|
|
}
|
|
bitDepth := int(buf.SourceBitDepth)
|
|
if bitDepth == 0 {
|
|
bitDepth = 16
|
|
}
|
|
scale := float32(int64(1) << uint(bitDepth-1))
|
|
n := len(buf.Data) / ch
|
|
out := make([]float32, n)
|
|
for i := 0; i < n; i++ {
|
|
var acc int
|
|
for c := 0; c < ch; c++ {
|
|
acc += buf.Data[i*ch+c]
|
|
}
|
|
out[i] = float32(acc) / float32(ch) / scale
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
// runtimeKeepAlive is a thin wrapper over runtime.KeepAlive. Calling it after
// a C call that was handed a pointer into v (the reference-audio slice) keeps
// the garbage collector from reclaiming v while that pointer is still in use.
func runtimeKeepAlive(v any) {
	runtime.KeepAlive(v)
}
|