mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-14 11:49:33 -04:00
feat(qwen3-tts-cpp): migrate to ServeurpersoCom/qwentts.cpp (streaming, speakers, voice design) (#10316)
* feat(qwen3-tts-cpp): repoint upstream to ServeurpersoCom/qwentts.cpp Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * feat(qwen3-tts-cpp): flatten qt_* ABI into qt3_* purego shim Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * feat(qwen3-tts-cpp): build shim against upstream qwen-core static lib Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * feat(qwen3-tts-cpp): add option/language/voice/sampling parsing Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * feat(qwen3-tts-cpp): add 24kHz WAV encode/decode/stream-header helpers Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * feat(qwen3-tts-cpp): purego backend with streaming, speakers, voice design Map TTSRequest onto qwentts.cpp: instructions->instruct, voice->named speaker or clone-reference path, params map->ref_text + sampling. Add TTSStream over the qt chunk callback. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * test(qwen3-tts-cpp): unit specs + build-gated TTS/TTSStream e2e Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * fix(qwen3-tts-cpp): close defensive PCM-free gap on zero-sample result Register CppPCMFree before the n<=0 guard so a non-null buffer with zero samples cannot leak (the C contract returns NULL on failure, so this is defensive). Raised in code review. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * feat(qwen3-tts-cpp): advertise TTSStream capability Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * chore(qwen3-tts-cpp): update backend index metadata for qwentts.cpp Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * feat(gallery): qwentts.cpp models - base/customvoice/voicedesign, Q8_0 & Q4_K_M Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * docs(qwen3-tts-cpp): release note for qwentts.cpp migration Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * test(qwen3-tts-cpp): cover audio_path voice-cloning fallback Add resolveRequest unit specs (config audio_path used as the clone reference when Voice is empty; per-request audio Voice overrides it; a named-speaker Voice does not trigger cloning) plus a real-inference e2e that clones from audio_path (confirmed ref_spk_emb=yes in the pipeline). Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * chore(qwen3-tts-cpp): drop the release-note doc Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
@@ -1,161 +1,191 @@
|
||||
#include "goqwen3ttscpp.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "qwen3_tts.h"
|
||||
#include "qwen.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
|
||||
using namespace qwen3_tts;
|
||||
static qt_context *g_ctx = nullptr;
|
||||
|
||||
// Global engine (loaded once, reused across requests)
|
||||
static Qwen3TTS *g_engine = nullptr;
|
||||
static bool g_loaded = false;
|
||||
static int g_threads = 4;
|
||||
|
||||
static void ggml_log_cb(enum ggml_log_level level, const char *log, void *data) {
|
||||
const char *level_str;
|
||||
static void ggml_log_cb(enum ggml_log_level level, const char *log,
|
||||
void * /*data*/) {
|
||||
if (!log)
|
||||
return;
|
||||
const char *lvl = "?????";
|
||||
switch (level) {
|
||||
case GGML_LOG_LEVEL_DEBUG:
|
||||
level_str = "DEBUG";
|
||||
break;
|
||||
case GGML_LOG_LEVEL_INFO:
|
||||
level_str = "INFO";
|
||||
break;
|
||||
case GGML_LOG_LEVEL_WARN:
|
||||
level_str = "WARN";
|
||||
break;
|
||||
case GGML_LOG_LEVEL_ERROR:
|
||||
level_str = "ERROR";
|
||||
break;
|
||||
default:
|
||||
level_str = "?????";
|
||||
break;
|
||||
case GGML_LOG_LEVEL_DEBUG: lvl = "DEBUG"; break;
|
||||
case GGML_LOG_LEVEL_INFO: lvl = "INFO"; break;
|
||||
case GGML_LOG_LEVEL_WARN: lvl = "WARN"; break;
|
||||
case GGML_LOG_LEVEL_ERROR: lvl = "ERROR"; break;
|
||||
default: break;
|
||||
}
|
||||
fprintf(stderr, "[%-5s] ", level_str);
|
||||
fputs(log, stderr);
|
||||
fprintf(stderr, "[%-5s] %s", lvl, log);
|
||||
fflush(stderr);
|
||||
}
|
||||
|
||||
// Map language string to language_id token used by the model
|
||||
static int language_to_id(const char *lang) {
|
||||
if (!lang || lang[0] == '\0')
|
||||
return 2050; // default: English
|
||||
std::string l(lang);
|
||||
if (l == "en")
|
||||
return 2050;
|
||||
if (l == "ru")
|
||||
return 2069;
|
||||
if (l == "zh")
|
||||
return 2055;
|
||||
if (l == "ja")
|
||||
return 2058;
|
||||
if (l == "ko")
|
||||
return 2064;
|
||||
if (l == "de")
|
||||
return 2053;
|
||||
if (l == "fr")
|
||||
return 2061;
|
||||
if (l == "es")
|
||||
return 2054;
|
||||
if (l == "it")
|
||||
return 2056;
|
||||
if (l == "pt")
|
||||
return 2057;
|
||||
fprintf(stderr, "[qwen3-tts-cpp] Unknown language '%s', defaulting to English\n",
|
||||
lang);
|
||||
return 2050;
|
||||
}
|
||||
|
||||
int load_model(const char *model_dir, int n_threads) {
|
||||
int qt3_load(const char *talker_path, const char *codec_path, int use_fa,
|
||||
int clamp_fp16) {
|
||||
ggml_log_set(ggml_log_cb, nullptr);
|
||||
ggml_backend_load_all();
|
||||
|
||||
if (n_threads <= 0)
|
||||
n_threads = 4;
|
||||
g_threads = n_threads;
|
||||
|
||||
fprintf(stderr, "[qwen3-tts-cpp] Loading models from %s (threads=%d)\n",
|
||||
model_dir, n_threads);
|
||||
|
||||
g_engine = new Qwen3TTS();
|
||||
if (!g_engine->load_models(model_dir)) {
|
||||
fprintf(stderr, "[qwen3-tts-cpp] FATAL: failed to load models from %s\n",
|
||||
model_dir);
|
||||
delete g_engine;
|
||||
g_engine = nullptr;
|
||||
if (!talker_path || talker_path[0] == '\0') {
|
||||
fprintf(stderr, "[qwen3-tts-cpp] ERROR: talker_path is required\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
g_loaded = true;
|
||||
fprintf(stderr, "[qwen3-tts-cpp] Models loaded successfully\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
int synthesize(const char *text, const char *ref_audio_path, const char *dst,
|
||||
const char *language, float temperature, float top_p,
|
||||
int top_k, float repetition_penalty, int max_audio_tokens,
|
||||
int n_threads) {
|
||||
if (!g_loaded || !g_engine) {
|
||||
fprintf(stderr, "[qwen3-tts-cpp] ERROR: models not loaded\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!text || !dst) {
|
||||
fprintf(stderr, "[qwen3-tts-cpp] ERROR: text and dst are required\n");
|
||||
if (!codec_path || codec_path[0] == '\0') {
|
||||
fprintf(stderr, "[qwen3-tts-cpp] ERROR: codec_path is required\n");
|
||||
return 2;
|
||||
}
|
||||
|
||||
tts_params params;
|
||||
params.max_audio_tokens = max_audio_tokens > 0 ? max_audio_tokens : 4096;
|
||||
params.temperature = temperature;
|
||||
params.top_p = top_p;
|
||||
params.top_k = top_k;
|
||||
params.repetition_penalty = repetition_penalty;
|
||||
params.n_threads = n_threads > 0 ? n_threads : g_threads;
|
||||
params.language_id = language_to_id(language);
|
||||
qt_init_params p;
|
||||
qt_init_default_params(&p);
|
||||
p.talker_path = talker_path;
|
||||
p.codec_path = codec_path;
|
||||
p.use_fa = use_fa != 0;
|
||||
p.clamp_fp16 = clamp_fp16 != 0;
|
||||
|
||||
fprintf(stderr, "[qwen3-tts-cpp] Synthesizing: text='%.50s%s', lang_id=%d, "
|
||||
"temp=%.2f, threads=%d\n",
|
||||
text, (strlen(text) > 50 ? "..." : ""), params.language_id,
|
||||
temperature, params.n_threads);
|
||||
fprintf(stderr, "[qwen3-tts-cpp] Loading talker=%s codec=%s\n", talker_path,
|
||||
codec_path);
|
||||
|
||||
tts_result result;
|
||||
bool has_ref = ref_audio_path && ref_audio_path[0] != '\0';
|
||||
|
||||
if (has_ref) {
|
||||
fprintf(stderr, "[qwen3-tts-cpp] Voice cloning with ref: %s\n",
|
||||
ref_audio_path);
|
||||
result = g_engine->synthesize_with_voice(text, ref_audio_path, params);
|
||||
} else {
|
||||
result = g_engine->synthesize(text, params);
|
||||
}
|
||||
|
||||
if (!result.success) {
|
||||
fprintf(stderr, "[qwen3-tts-cpp] ERROR: synthesis failed: %s\n",
|
||||
result.error_msg.c_str());
|
||||
g_ctx = qt_init(&p);
|
||||
if (!g_ctx) {
|
||||
fprintf(stderr, "[qwen3-tts-cpp] FATAL: qt_init failed: %s\n",
|
||||
qt_last_error());
|
||||
return 3;
|
||||
}
|
||||
|
||||
int n_samples = (int)result.audio.size();
|
||||
if (n_samples == 0) {
|
||||
fprintf(stderr, "[qwen3-tts-cpp] ERROR: synthesis produced no samples\n");
|
||||
return 4;
|
||||
}
|
||||
|
||||
fprintf(stderr,
|
||||
"[qwen3-tts-cpp] Synthesis done: %d samples (%.2fs @ 24kHz)\n",
|
||||
n_samples, (float)n_samples / 24000.0f);
|
||||
|
||||
if (!save_audio_file(dst, result.audio, result.sample_rate)) {
|
||||
fprintf(stderr, "[qwen3-tts-cpp] ERROR: failed to write %s\n", dst);
|
||||
return 5;
|
||||
}
|
||||
|
||||
fprintf(stderr, "[qwen3-tts-cpp] Wrote %s\n", dst);
|
||||
fprintf(stderr, "[qwen3-tts-cpp] Model loaded (%s)\n", qt_version());
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Fill a qt_tts_params from the flat wrapper arguments. Unset/zero scalars keep
|
||||
// the qt defaults (temperature 0.9, top_k 50, top_p 1.0, rep 1.05, max 2048).
|
||||
static void fill_params(qt_tts_params *tp, const char *text, const char *lang,
|
||||
const char *instruct, const char *speaker,
|
||||
const float *ref_samples, int ref_n,
|
||||
const char *ref_text, long long seed, float temperature,
|
||||
int top_k, float top_p, float repetition_penalty,
|
||||
int max_new_tokens) {
|
||||
qt_tts_default_params(tp);
|
||||
tp->text = text ? text : "";
|
||||
if (lang && lang[0] != '\0')
|
||||
tp->lang = lang; // else keep default NULL -> auto
|
||||
if (instruct && instruct[0] != '\0')
|
||||
tp->instruct = instruct;
|
||||
if (speaker && speaker[0] != '\0')
|
||||
tp->speaker = speaker;
|
||||
if (ref_samples && ref_n > 0) {
|
||||
tp->ref_audio_24k = ref_samples;
|
||||
tp->ref_n_samples = ref_n;
|
||||
if (ref_text && ref_text[0] != '\0')
|
||||
tp->ref_text = ref_text;
|
||||
}
|
||||
if (seed >= 0)
|
||||
tp->seed = (int64_t)seed; // else default -1 (random)
|
||||
if (temperature > 0.0f)
|
||||
tp->temperature = temperature;
|
||||
if (top_k > 0)
|
||||
tp->top_k = top_k;
|
||||
if (top_p > 0.0f)
|
||||
tp->top_p = top_p;
|
||||
if (repetition_penalty > 0.0f)
|
||||
tp->repetition_penalty = repetition_penalty;
|
||||
if (max_new_tokens > 0)
|
||||
tp->max_new_tokens = max_new_tokens;
|
||||
}
|
||||
|
||||
float *qt3_tts(const char *text, const char *lang, const char *instruct,
|
||||
const char *speaker, const float *ref_samples, int ref_n,
|
||||
const char *ref_text, long long seed, float temperature,
|
||||
int top_k, float top_p, float repetition_penalty,
|
||||
int max_new_tokens, int *out_n) {
|
||||
if (out_n)
|
||||
*out_n = 0;
|
||||
if (!g_ctx) {
|
||||
fprintf(stderr, "[qwen3-tts-cpp] ERROR: model not loaded\n");
|
||||
return nullptr;
|
||||
}
|
||||
if (!text || text[0] == '\0') {
|
||||
fprintf(stderr, "[qwen3-tts-cpp] ERROR: text is required\n");
|
||||
return nullptr;
|
||||
}
|
||||
qt_tts_params tp;
|
||||
fill_params(&tp, text, lang, instruct, speaker, ref_samples, ref_n,
|
||||
ref_text, seed, temperature, top_k, top_p, repetition_penalty,
|
||||
max_new_tokens);
|
||||
|
||||
qt_audio out = {0};
|
||||
enum qt_status rc = qt_synthesize(g_ctx, &tp, &out);
|
||||
if (rc != QT_STATUS_OK || out.n_samples <= 0 || !out.samples) {
|
||||
fprintf(stderr, "[qwen3-tts-cpp] ERROR: synthesize failed (rc=%d): %s\n",
|
||||
(int)rc, qt_last_error());
|
||||
qt_audio_free(&out);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Copy into a plain malloc buffer the Go side frees via qt3_pcm_free.
|
||||
size_t bytes = (size_t)out.n_samples * sizeof(float);
|
||||
float *buf = (float *)malloc(bytes);
|
||||
if (!buf) {
|
||||
fprintf(stderr, "[qwen3-tts-cpp] ERROR: malloc(%zu) failed\n", bytes);
|
||||
qt_audio_free(&out);
|
||||
return nullptr;
|
||||
}
|
||||
memcpy(buf, out.samples, bytes);
|
||||
if (out_n)
|
||||
*out_n = out.n_samples;
|
||||
qt_audio_free(&out);
|
||||
return buf;
|
||||
}
|
||||
|
||||
int qt3_tts_stream(const char *text, const char *lang, const char *instruct,
|
||||
const char *speaker, const float *ref_samples, int ref_n,
|
||||
const char *ref_text, long long seed, float temperature,
|
||||
int top_k, float top_p, float repetition_penalty,
|
||||
int max_new_tokens, qt3_chunk_cb cb, void *user_data) {
|
||||
if (!g_ctx) {
|
||||
fprintf(stderr, "[qwen3-tts-cpp] ERROR: model not loaded\n");
|
||||
return 1;
|
||||
}
|
||||
if (!cb) {
|
||||
fprintf(stderr, "[qwen3-tts-cpp] ERROR: stream callback is null\n");
|
||||
return 2;
|
||||
}
|
||||
if (!text || text[0] == '\0') {
|
||||
fprintf(stderr, "[qwen3-tts-cpp] ERROR: text is required\n");
|
||||
return 4;
|
||||
}
|
||||
qt_tts_params tp;
|
||||
fill_params(&tp, text, lang, instruct, speaker, ref_samples, ref_n,
|
||||
ref_text, seed, temperature, top_k, top_p, repetition_penalty,
|
||||
max_new_tokens);
|
||||
// qt_audio_chunk_cb has the identical signature to qt3_chunk_cb
|
||||
// (bool vs int return are ABI-compatible; non-zero == true).
|
||||
tp.on_chunk = (qt_audio_chunk_cb)cb;
|
||||
tp.on_chunk_user_data = user_data;
|
||||
|
||||
qt_audio out = {0}; // stays empty in streaming mode
|
||||
enum qt_status rc = qt_synthesize(g_ctx, &tp, &out);
|
||||
qt_audio_free(&out);
|
||||
if (rc != QT_STATUS_OK && rc != QT_STATUS_CANCELLED) {
|
||||
fprintf(stderr, "[qwen3-tts-cpp] ERROR: stream synth failed (rc=%d): %s\n",
|
||||
(int)rc, qt_last_error());
|
||||
return 3;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void qt3_pcm_free(float *p) { free(p); }
|
||||
|
||||
void qt3_unload(void) {
|
||||
if (g_ctx) {
|
||||
qt_free(g_ctx);
|
||||
g_ctx = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
int qt3_n_speakers(void) { return g_ctx ? qt_n_speakers(g_ctx) : 0; }
|
||||
|
||||
const char *qt3_speaker_name(int i) {
|
||||
return g_ctx ? qt_speaker_name(g_ctx, i) : nullptr;
|
||||
}
|
||||
|
||||
@@ -1,12 +1,47 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
extern "C" {
|
||||
int load_model(const char *model_dir, int n_threads);
|
||||
int synthesize(const char *text, const char *ref_audio_path, const char *dst,
|
||||
const char *language, float temperature, float top_p,
|
||||
int top_k, float repetition_penalty, int max_audio_tokens,
|
||||
int n_threads);
|
||||
|
||||
// Streaming PCM chunk callback. samples is mono float PCM at 24 kHz, valid
|
||||
// only for the duration of the call. Return non-zero to continue, 0 to abort.
|
||||
typedef int (*qt3_chunk_cb)(const float *samples, int n_samples,
|
||||
void *user_data);
|
||||
|
||||
// Load the talker + codec/tokenizer GGUFs. use_fa / clamp_fp16 map to
|
||||
// qt_init_params (the qt ABI exposes no thread count; ggml uses its own
|
||||
// default). Returns 0 on success, non-zero on failure.
|
||||
int qt3_load(const char *talker_path, const char *codec_path, int use_fa,
|
||||
int clamp_fp16);
|
||||
|
||||
// Synthesize to a malloc'd float PCM buffer (caller frees via qt3_pcm_free).
|
||||
// The synthesis mode (base / custom_voice / voice_design) is auto-detected by
|
||||
// qt from the talker GGUF; speaker is honoured only for custom_voice, instruct
|
||||
// for voice_design / custom_voice, and ref_samples (+ optional ref_text) drive
|
||||
// base-mode cloning. qt enforces the rules and we surface qt_last_error() on
|
||||
// QT_STATUS_MODE_INVALID. Writes the sample count to *out_n. Returns NULL on
|
||||
// failure (out_n set to 0).
|
||||
float *qt3_tts(const char *text, const char *lang, const char *instruct,
|
||||
const char *speaker, const float *ref_samples, int ref_n,
|
||||
const char *ref_text, long long seed, float temperature,
|
||||
int top_k, float top_p, float repetition_penalty,
|
||||
int max_new_tokens, int *out_n);
|
||||
|
||||
// Streaming synthesis: cb is invoked per PCM chunk as audio is produced. Same
|
||||
// param semantics as qt3_tts. Returns 0 on success.
|
||||
int qt3_tts_stream(const char *text, const char *lang, const char *instruct,
|
||||
const char *speaker, const float *ref_samples, int ref_n,
|
||||
const char *ref_text, long long seed, float temperature,
|
||||
int top_k, float top_p, float repetition_penalty,
|
||||
int max_new_tokens, qt3_chunk_cb cb, void *user_data);
|
||||
|
||||
// Free a buffer returned by qt3_tts.
|
||||
void qt3_pcm_free(float *p);
|
||||
|
||||
// Release the qt context.
|
||||
void qt3_unload(void);
|
||||
|
||||
// Named-speaker introspection (custom_voice models). Returns 0 / NULL when no
|
||||
// model is loaded or the index is out of range.
|
||||
int qt3_n_speakers(void);
|
||||
const char *qt3_speaker_name(int i);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user