Files
LocalAI/backend/go/qwen3-tts-cpp/cpp/goqwen3ttscpp.cpp
LocalAI [bot] 4bb592cf91 feat(qwen3-tts-cpp): migrate to ServeurpersoCom/qwentts.cpp (streaming, speakers, voice design) (#10316)
* feat(qwen3-tts-cpp): repoint upstream to ServeurpersoCom/qwentts.cpp

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(qwen3-tts-cpp): flatten qt_* ABI into qt3_* purego shim

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(qwen3-tts-cpp): build shim against upstream qwen-core static lib

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(qwen3-tts-cpp): add option/language/voice/sampling parsing

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(qwen3-tts-cpp): add 24kHz WAV encode/decode/stream-header helpers

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(qwen3-tts-cpp): purego backend with streaming, speakers, voice design

Map TTSRequest onto qwentts.cpp: instructions->instruct, voice->named
speaker or clone-reference path, params map->ref_text + sampling. Add
TTSStream over the qt chunk callback.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* test(qwen3-tts-cpp): unit specs + build-gated TTS/TTSStream e2e

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* fix(qwen3-tts-cpp): close defensive PCM-free gap on zero-sample result

Register CppPCMFree before the n<=0 guard so a non-null buffer with zero
samples cannot leak (the C contract returns NULL on failure, so this is
defensive). Raised in code review.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(qwen3-tts-cpp): advertise TTSStream capability

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* chore(qwen3-tts-cpp): update backend index metadata for qwentts.cpp

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(gallery): qwentts.cpp models - base/customvoice/voicedesign, Q8_0 & Q4_K_M

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* docs(qwen3-tts-cpp): release note for qwentts.cpp migration

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* test(qwen3-tts-cpp): cover audio_path voice-cloning fallback

Add resolveRequest unit specs (config audio_path used as the clone
reference when Voice is empty; per-request audio Voice overrides it; a
named-speaker Voice does not trigger cloning) plus a real-inference e2e
that clones from audio_path (confirmed ref_spk_emb=yes in the pipeline).

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* chore(qwen3-tts-cpp): drop the release-note doc

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-13 23:09:59 +02:00

192 lines
6.5 KiB
C++

#include "goqwen3ttscpp.h"
#include "ggml-backend.h"
#include "qwen.h"
#include <cstdio>
#include <cstdlib>
#include <cstring>
// Single process-wide synthesis context: assigned by qt3_load, released and
// reset by qt3_unload, and read by qt3_tts, qt3_tts_stream, qt3_n_speakers
// and qt3_speaker_name. Null while no model is loaded.
// NOTE(review): nothing in this file synchronizes access to it; presumably
// callers serialize their calls - confirm.
static qt_context *g_ctx = nullptr;
// Log sink installed through ggml_log_set: writes every ggml message to
// stderr behind a fixed-width severity tag and flushes immediately. Null
// messages are dropped; a level without a dedicated tag prints as "?????".
// No newline is appended - the message is emitted exactly as received.
static void ggml_log_cb(enum ggml_log_level level, const char *log,
                        void * /*data*/) {
    if (!log) {
        return;
    }

    const char *tag;
    if (level == GGML_LOG_LEVEL_DEBUG) {
        tag = "DEBUG";
    } else if (level == GGML_LOG_LEVEL_INFO) {
        tag = "INFO";
    } else if (level == GGML_LOG_LEVEL_WARN) {
        tag = "WARN";
    } else if (level == GGML_LOG_LEVEL_ERROR) {
        tag = "ERROR";
    } else {
        tag = "?????";
    }

    // The message travels as an argument, never as the format string.
    fprintf(stderr, "[%-5s] %s", tag, log);
    fflush(stderr);
}
int qt3_load(const char *talker_path, const char *codec_path, int use_fa,
int clamp_fp16) {
ggml_log_set(ggml_log_cb, nullptr);
ggml_backend_load_all();
if (!talker_path || talker_path[0] == '\0') {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: talker_path is required\n");
return 1;
}
if (!codec_path || codec_path[0] == '\0') {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: codec_path is required\n");
return 2;
}
qt_init_params p;
qt_init_default_params(&p);
p.talker_path = talker_path;
p.codec_path = codec_path;
p.use_fa = use_fa != 0;
p.clamp_fp16 = clamp_fp16 != 0;
fprintf(stderr, "[qwen3-tts-cpp] Loading talker=%s codec=%s\n", talker_path,
codec_path);
g_ctx = qt_init(&p);
if (!g_ctx) {
fprintf(stderr, "[qwen3-tts-cpp] FATAL: qt_init failed: %s\n",
qt_last_error());
return 3;
}
fprintf(stderr, "[qwen3-tts-cpp] Model loaded (%s)\n", qt_version());
return 0;
}
// Non-zero when s points at a string with at least one character.
static int qt3_has_text(const char *s) {
    return s && s[0] != '\0';
}

// Populate *tp for one synthesis request from the flat wrapper arguments.
//
// Starts from qt_tts_default_params() and overrides a field only when the
// caller supplied a usable value: a non-empty string, a reference buffer with
// a positive sample count, a non-negative seed, or a strictly positive
// sampling scalar. Everything else keeps the qt default (temperature 0.9,
// top_k 50, top_p 1.0, rep 1.05, max 2048). A null text becomes "".
// String and sample pointers are stored as-is, not copied. ref_text is only
// considered when reference audio is present.
static void fill_params(qt_tts_params *tp, const char *text, const char *lang,
                        const char *instruct, const char *speaker,
                        const float *ref_samples, int ref_n,
                        const char *ref_text, long long seed, float temperature,
                        int top_k, float top_p, float repetition_penalty,
                        int max_new_tokens) {
    qt_tts_default_params(tp);

    // Text and voice selection.
    if (text) {
        tp->text = text;
    } else {
        tp->text = "";
    }
    if (qt3_has_text(lang)) {
        tp->lang = lang; // otherwise the default NULL stays -> auto
    }
    if (qt3_has_text(instruct)) {
        tp->instruct = instruct;
    }
    if (qt3_has_text(speaker)) {
        tp->speaker = speaker;
    }

    // Optional clone reference: audio first, transcript only alongside it.
    if (ref_samples && ref_n > 0) {
        tp->ref_audio_24k = ref_samples;
        tp->ref_n_samples = ref_n;
        if (qt3_has_text(ref_text)) {
            tp->ref_text = ref_text;
        }
    }

    // Sampling controls.
    if (seed >= 0) {
        tp->seed = (int64_t)seed; // otherwise the default -1 (random) stays
    }
    if (temperature > 0.0f) {
        tp->temperature = temperature;
    }
    if (top_k > 0) {
        tp->top_k = top_k;
    }
    if (top_p > 0.0f) {
        tp->top_p = top_p;
    }
    if (repetition_penalty > 0.0f) {
        tp->repetition_penalty = repetition_penalty;
    }
    if (max_new_tokens > 0) {
        tp->max_new_tokens = max_new_tokens;
    }
}
float *qt3_tts(const char *text, const char *lang, const char *instruct,
const char *speaker, const float *ref_samples, int ref_n,
const char *ref_text, long long seed, float temperature,
int top_k, float top_p, float repetition_penalty,
int max_new_tokens, int *out_n) {
if (out_n)
*out_n = 0;
if (!g_ctx) {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: model not loaded\n");
return nullptr;
}
if (!text || text[0] == '\0') {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: text is required\n");
return nullptr;
}
qt_tts_params tp;
fill_params(&tp, text, lang, instruct, speaker, ref_samples, ref_n,
ref_text, seed, temperature, top_k, top_p, repetition_penalty,
max_new_tokens);
qt_audio out = {0};
enum qt_status rc = qt_synthesize(g_ctx, &tp, &out);
if (rc != QT_STATUS_OK || out.n_samples <= 0 || !out.samples) {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: synthesize failed (rc=%d): %s\n",
(int)rc, qt_last_error());
qt_audio_free(&out);
return nullptr;
}
// Copy into a plain malloc buffer the Go side frees via qt3_pcm_free.
size_t bytes = (size_t)out.n_samples * sizeof(float);
float *buf = (float *)malloc(bytes);
if (!buf) {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: malloc(%zu) failed\n", bytes);
qt_audio_free(&out);
return nullptr;
}
memcpy(buf, out.samples, bytes);
if (out_n)
*out_n = out.n_samples;
qt_audio_free(&out);
return buf;
}
int qt3_tts_stream(const char *text, const char *lang, const char *instruct,
const char *speaker, const float *ref_samples, int ref_n,
const char *ref_text, long long seed, float temperature,
int top_k, float top_p, float repetition_penalty,
int max_new_tokens, qt3_chunk_cb cb, void *user_data) {
if (!g_ctx) {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: model not loaded\n");
return 1;
}
if (!cb) {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: stream callback is null\n");
return 2;
}
if (!text || text[0] == '\0') {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: text is required\n");
return 4;
}
qt_tts_params tp;
fill_params(&tp, text, lang, instruct, speaker, ref_samples, ref_n,
ref_text, seed, temperature, top_k, top_p, repetition_penalty,
max_new_tokens);
// qt_audio_chunk_cb has the identical signature to qt3_chunk_cb
// (bool vs int return are ABI-compatible; non-zero == true).
tp.on_chunk = (qt_audio_chunk_cb)cb;
tp.on_chunk_user_data = user_data;
qt_audio out = {0}; // stays empty in streaming mode
enum qt_status rc = qt_synthesize(g_ctx, &tp, &out);
qt_audio_free(&out);
if (rc != QT_STATUS_OK && rc != QT_STATUS_CANCELLED) {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: stream synth failed (rc=%d): %s\n",
(int)rc, qt_last_error());
return 3;
}
return 0;
}
// Release a PCM buffer that qt3_tts allocated with malloc. Passing a null
// pointer is harmless because free() ignores it.
void qt3_pcm_free(float *p) {
    free(p);
}
// Release the loaded model and reset g_ctx to null. Does nothing when no
// model is loaded, so repeated calls are safe.
void qt3_unload(void) {
    if (g_ctx == nullptr) {
        return;
    }
    qt_free(g_ctx);
    g_ctx = nullptr;
}
// Number of speakers reported by qt_n_speakers for the loaded model, or 0
// when no model is loaded.
int qt3_n_speakers(void) {
    if (!g_ctx) {
        return 0;
    }
    return qt_n_speakers(g_ctx);
}
// Name of speaker `i` as returned by qt_speaker_name for the loaded model, or
// nullptr when no model is loaded. The index is passed through unchecked.
const char *qt3_speaker_name(int i) {
    if (g_ctx == nullptr) {
        return nullptr;
    }
    return qt_speaker_name(g_ctx, i);
}