Files
LocalAI/backend/go/qwen3-tts-cpp/options.go
LocalAI [bot] 4bb592cf91 feat(qwen3-tts-cpp): migrate to ServeurpersoCom/qwentts.cpp (streaming, speakers, voice design) (#10316)
* feat(qwen3-tts-cpp): repoint upstream to ServeurpersoCom/qwentts.cpp

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(qwen3-tts-cpp): flatten qt_* ABI into qt3_* purego shim

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(qwen3-tts-cpp): build shim against upstream qwen-core static lib

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(qwen3-tts-cpp): add option/language/voice/sampling parsing

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(qwen3-tts-cpp): add 24kHz WAV encode/decode/stream-header helpers

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(qwen3-tts-cpp): purego backend with streaming, speakers, voice design

Map TTSRequest onto qwentts.cpp: instructions->instruct, voice->named
speaker or clone-reference path, params map->ref_text + sampling. Add
TTSStream over the qt chunk callback.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* test(qwen3-tts-cpp): unit specs + build-gated TTS/TTSStream e2e

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* fix(qwen3-tts-cpp): close defensive PCM-free gap on zero-sample result

Register CppPCMFree before the n<=0 guard so a non-null buffer with zero
samples cannot leak (the C contract returns NULL on failure, so this is
defensive). Raised in code review.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(qwen3-tts-cpp): advertise TTSStream capability

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* chore(qwen3-tts-cpp): update backend index metadata for qwentts.cpp

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(gallery): qwentts.cpp models - base/customvoice/voicedesign, Q8_0 & Q4_K_M

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* docs(qwen3-tts-cpp): release note for qwentts.cpp migration

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* test(qwen3-tts-cpp): cover audio_path voice-cloning fallback

Add resolveRequest unit specs (config audio_path used as the clone
reference when Voice is empty; per-request audio Voice overrides it; a
named-speaker Voice does not trigger cloning) plus a real-inference e2e
that clones from audio_path (confirmed ref_spk_emb=yes in the pipeline).

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* chore(qwen3-tts-cpp): drop the release-note doc

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-13 23:09:59 +02:00

162 lines
4.1 KiB
Go

package main
import (
"strconv"
"strings"
)
// loadOptions holds the parsed model-level options. parseOptions fills it
// from the backend's "key:value" option strings.
type loadOptions struct {
// codecPath is the value of the "tokenizer" or "codec" option (both keys
// write this field). Presumably the path to the codec/tokenizer model file;
// confirm against the loader that consumes it.
codecPath string
// useFA mirrors the "use_fa" option; parseOptions defaults it to true.
// NOTE(review): presumably toggles flash attention in the engine; confirm.
useFA bool
// clampFP16 mirrors the "clamp_fp16" option; it stays false unless set.
clampFP16 bool
// seed is the model-level seed from the "seed" option. parseOptions defaults
// it to -1, which its own documentation describes as "engine random".
seed int64
}
// sampling holds per-request generation parameters with qt defaults applied.
// parseSampling builds it from the request params map, substituting the
// default for any key that is missing or cannot be parsed.
type sampling struct {
// temperature is read from the "temperature" param; default 0.9.
temperature float32
// topK is read from the "top_k" param; default 50.
topK int
// topP is read from the "top_p" param; default 1.0.
topP float32
// repPen is read from the "repetition_penalty" param; default 1.05.
repPen float32
// maxNew is read from the "max_new_tokens" param; default 2048.
maxNew int
// seed is read from the "seed" param; it defaults to the defaultSeed value
// handed to parseSampling.
seed int64
}
// splitOption breaks a single "key:value" option string at its first ':'.
// Both halves are returned with surrounding whitespace removed. When o has no
// ':' at all, ok is false and key and value are empty. Any further ':' stays
// inside value.
func splitOption(o string) (key, value string, ok bool) {
	before, after, found := strings.Cut(o, ":")
	if !found {
		return "", "", false
	}
	return strings.TrimSpace(before), strings.TrimSpace(after), true
}
// parseBool reports whether v spells a true boolean. Matching ignores case
// and surrounding whitespace and accepts the spellings strconv.ParseBool
// understands ("1", "t", "true"), so "True" and "TRUE" no longer silently
// read as false. Every other input, including the empty string and anything
// unparsable, yields false.
func parseBool(v string) bool {
	b, err := strconv.ParseBool(strings.ToLower(strings.TrimSpace(v)))
	return err == nil && b
}
// parseOptions builds a loadOptions from the backend "key:value" option
// slice. It starts from the defaults use_fa=true (qt default; CPU still uses
// the F32 chain) and seed=-1 (engine random), then applies each option in
// order, so a later entry overrides an earlier one for the same field.
//
// Entries without a ':' separator and entries with an unknown key are
// ignored. A "seed" value that is not a base-10 int64 leaves the current seed
// untouched. "tokenizer" and "codec" are interchangeable keys for codecPath.
func parseOptions(opts []string) loadOptions {
	cfg := loadOptions{
		useFA: true,
		seed:  -1,
	}
	for _, entry := range opts {
		name, val, valid := splitOption(entry)
		if !valid {
			// Not a key/value pair; nothing to apply.
			continue
		}
		switch name {
		case "seed":
			parsed, err := strconv.ParseInt(val, 10, 64)
			if err == nil {
				cfg.seed = parsed
			}
		case "clamp_fp16":
			cfg.clampFP16 = parseBool(val)
		case "use_fa":
			cfg.useFA = parseBool(val)
		case "codec", "tokenizer":
			cfg.codecPath = val
		}
	}
	return cfg
}
// languageAliases translates a lowercase language code or full name into the
// language name used by upstream qwentts. Each supported language is listed
// under both its two-letter code and its English name ("mandarin" is an extra
// alias for Chinese). "auto" resolves to the empty string so the engine
// auto-detects the language.
var languageAliases = map[string]string{
	"en":         "english",
	"english":    "english",
	"zh":         "chinese",
	"chinese":    "chinese",
	"mandarin":   "chinese",
	"ja":         "japanese",
	"japanese":   "japanese",
	"ko":         "korean",
	"korean":     "korean",
	"de":         "german",
	"german":     "german",
	"fr":         "french",
	"french":     "french",
	"es":         "spanish",
	"spanish":    "spanish",
	"it":         "italian",
	"italian":    "italian",
	"pt":         "portuguese",
	"portuguese": "portuguese",
	"ru":         "russian",
	"russian":    "russian",
	"auto":       "",
}
// normalizeLanguage converts a caller-supplied language tag into the qwentts
// language name. The input is trimmed and lowercased, and everything from the
// first '-', '_' or '.' onward is dropped (en-US -> en). The remainder is
// looked up in languageAliases. An empty input stays empty (engine
// auto-detects); a value with no alias is returned in its normalized form.
func normalizeLanguage(lang string) string {
	code := strings.ToLower(strings.TrimSpace(lang))
	if code == "" {
		return ""
	}
	// Keep only the primary subtag, discarding any region/locale suffix.
	if cut := strings.IndexAny(code, "-_."); cut != -1 {
		code = code[:cut]
	}
	name, known := languageAliases[code]
	if !known {
		return code
	}
	return name
}
// refAudioExts lists the lowercase file extensions (leading dot included)
// that mark a Voice value as a reference-audio path rather than a speaker
// name.
var refAudioExts = []string{
	".wav",
	".flac",
	".mp3",
	".ogg",
	".m4a",
}
// resolveVoice classifies the request Voice field. After trimming whitespace,
// a value whose lowercase form ends in one of refAudioExts is returned as
// refPath (a clone-reference audio path, original casing preserved) with no
// speaker. Any other non-empty value is returned as speaker (a named
// custom_voice speaker) with no refPath. Blank input yields two empty strings.
func resolveVoice(voice string) (speaker, refPath string) {
	name := strings.TrimSpace(voice)
	if name == "" {
		return "", ""
	}
	// Compare case-insensitively so "REF.WAV" is still treated as audio.
	folded := strings.ToLower(name)
	isAudio := false
	for i := 0; i < len(refAudioExts) && !isAudio; i++ {
		isAudio = strings.HasSuffix(folded, refAudioExts[i])
	}
	if isAudio {
		return "", name
	}
	return name, ""
}
// parseFloat32 parses v as a 32-bit float and returns def whenever v is not
// usable: empty, malformed, outside the float32 range, or non-finite.
//
// strconv.ParseFloat accepts spellings such as "NaN", "Inf" and "-Infinity"
// without reporting an error, so those are rejected here explicitly instead
// of being returned to the caller as a parameter value.
func parseFloat32(v string, def float32) float32 {
	// Largest finite float32 (same value as math.MaxFloat32), declared locally
	// so this helper needs no additional import.
	const maxFinite = 3.40282346638528859811704183484516925440e+38

	if v == "" {
		return def
	}
	f, err := strconv.ParseFloat(v, 32)
	if err != nil {
		return def
	}
	// NaN fails every ordered comparison and ±Inf falls outside the bounds,
	// so this one range test admits finite values only.
	if f >= -maxFinite && f <= maxFinite {
		return float32(f)
	}
	return def
}
// parseInt converts v to an int in base 10. It returns def when v is empty or
// is not a valid int; strconv.Atoi already rejects the empty string, so no
// separate emptiness check is needed.
func parseInt(v string, def int) int {
	if parsed, err := strconv.Atoi(v); err == nil {
		return parsed
	}
	return def
}
// parseInt64 converts v to a base-10 int64. It returns def when v is empty,
// malformed, or out of the int64 range; strconv.ParseInt reports an error for
// the empty string, so that case shares the error path.
func parseInt64(v string, def int64) int64 {
	if parsed, err := strconv.ParseInt(v, 10, 64); err == nil {
		return parsed
	}
	return def
}
// parseSampling assembles the per-request sampling settings from the
// TTSRequest params map. Each field falls back to its qt default (matching
// qt_tts_default_params) when its key is absent or its value does not parse;
// the seed falls back to defaultSeed.
//
// A nil params map needs no special handling: indexing it yields "" for every
// key, which each parse helper maps to the supplied default.
func parseSampling(params map[string]string, defaultSeed int64) sampling {
	return sampling{
		temperature: parseFloat32(params["temperature"], 0.9),
		topK:        parseInt(params["top_k"], 50),
		topP:        parseFloat32(params["top_p"], 1.0),
		repPen:      parseFloat32(params["repetition_penalty"], 1.05),
		maxNew:      parseInt(params["max_new_tokens"], 2048),
		seed:        parseInt64(params["seed"], defaultSeed),
	}
}