mirror of
https://github.com/mudler/LocalAI.git
synced 2026-04-16 12:59:33 -04:00
162 lines
4.5 KiB
C++
162 lines
4.5 KiB
C++
#include "goqwen3ttscpp.h"
|
|
#include "ggml-backend.h"
|
|
#include "qwen3_tts.h"
|
|
|
|
#include <cmath>
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
#include <cstring>
|
|
#include <string>
|
|
|
|
using namespace qwen3_tts;
|
|
|
|
// Global engine (loaded once, reused across requests)
|
|
static Qwen3TTS *g_engine = nullptr;
|
|
static bool g_loaded = false;
|
|
static int g_threads = 4;
|
|
|
|
static void ggml_log_cb(enum ggml_log_level level, const char *log, void *data) {
|
|
const char *level_str;
|
|
if (!log)
|
|
return;
|
|
switch (level) {
|
|
case GGML_LOG_LEVEL_DEBUG:
|
|
level_str = "DEBUG";
|
|
break;
|
|
case GGML_LOG_LEVEL_INFO:
|
|
level_str = "INFO";
|
|
break;
|
|
case GGML_LOG_LEVEL_WARN:
|
|
level_str = "WARN";
|
|
break;
|
|
case GGML_LOG_LEVEL_ERROR:
|
|
level_str = "ERROR";
|
|
break;
|
|
default:
|
|
level_str = "?????";
|
|
break;
|
|
}
|
|
fprintf(stderr, "[%-5s] ", level_str);
|
|
fputs(log, stderr);
|
|
fflush(stderr);
|
|
}
|
|
|
|
// Translate a language code ("en", "ru", "zh", ...) into the language_id token
// used by the model.
//
// A null or empty string selects English. Codes are matched exactly
// (case-sensitive); an unrecognised code is reported on stderr and also falls
// back to English.
static int language_to_id(const char *lang) {
    static const int kEnglishId = 2050;
    static const struct {
        const char *code;
        int id;
    } kLanguages[] = {
        {"en", 2050}, {"ru", 2069}, {"zh", 2055}, {"ja", 2058}, {"ko", 2064},
        {"de", 2053}, {"fr", 2061}, {"es", 2054}, {"it", 2056}, {"pt", 2057},
    };

    if (lang == nullptr || *lang == '\0') {
        return kEnglishId; // nothing requested: default language
    }

    for (const auto &entry : kLanguages) {
        if (strcmp(lang, entry.code) == 0) {
            return entry.id;
        }
    }

    fprintf(stderr, "[qwen3-tts-cpp] Unknown language '%s', defaulting to English\n",
            lang);
    return kEnglishId;
}
|
|
|
|
int load_model(const char *model_dir, int n_threads) {
|
|
ggml_log_set(ggml_log_cb, nullptr);
|
|
ggml_backend_load_all();
|
|
|
|
if (n_threads <= 0)
|
|
n_threads = 4;
|
|
g_threads = n_threads;
|
|
|
|
fprintf(stderr, "[qwen3-tts-cpp] Loading models from %s (threads=%d)\n",
|
|
model_dir, n_threads);
|
|
|
|
g_engine = new Qwen3TTS();
|
|
if (!g_engine->load_models(model_dir)) {
|
|
fprintf(stderr, "[qwen3-tts-cpp] FATAL: failed to load models from %s\n",
|
|
model_dir);
|
|
delete g_engine;
|
|
g_engine = nullptr;
|
|
return 1;
|
|
}
|
|
|
|
g_loaded = true;
|
|
fprintf(stderr, "[qwen3-tts-cpp] Models loaded successfully\n");
|
|
return 0;
|
|
}
|
|
|
|
int synthesize(const char *text, const char *ref_audio_path, const char *dst,
|
|
const char *language, float temperature, float top_p,
|
|
int top_k, float repetition_penalty, int max_audio_tokens,
|
|
int n_threads) {
|
|
if (!g_loaded || !g_engine) {
|
|
fprintf(stderr, "[qwen3-tts-cpp] ERROR: models not loaded\n");
|
|
return 1;
|
|
}
|
|
|
|
if (!text || !dst) {
|
|
fprintf(stderr, "[qwen3-tts-cpp] ERROR: text and dst are required\n");
|
|
return 2;
|
|
}
|
|
|
|
tts_params params;
|
|
params.max_audio_tokens = max_audio_tokens > 0 ? max_audio_tokens : 4096;
|
|
params.temperature = temperature;
|
|
params.top_p = top_p;
|
|
params.top_k = top_k;
|
|
params.repetition_penalty = repetition_penalty;
|
|
params.n_threads = n_threads > 0 ? n_threads : g_threads;
|
|
params.language_id = language_to_id(language);
|
|
|
|
fprintf(stderr, "[qwen3-tts-cpp] Synthesizing: text='%.50s%s', lang_id=%d, "
|
|
"temp=%.2f, threads=%d\n",
|
|
text, (strlen(text) > 50 ? "..." : ""), params.language_id,
|
|
temperature, params.n_threads);
|
|
|
|
tts_result result;
|
|
bool has_ref = ref_audio_path && ref_audio_path[0] != '\0';
|
|
|
|
if (has_ref) {
|
|
fprintf(stderr, "[qwen3-tts-cpp] Voice cloning with ref: %s\n",
|
|
ref_audio_path);
|
|
result = g_engine->synthesize_with_voice(text, ref_audio_path, params);
|
|
} else {
|
|
result = g_engine->synthesize(text, params);
|
|
}
|
|
|
|
if (!result.success) {
|
|
fprintf(stderr, "[qwen3-tts-cpp] ERROR: synthesis failed: %s\n",
|
|
result.error_msg.c_str());
|
|
return 3;
|
|
}
|
|
|
|
int n_samples = (int)result.audio.size();
|
|
if (n_samples == 0) {
|
|
fprintf(stderr, "[qwen3-tts-cpp] ERROR: synthesis produced no samples\n");
|
|
return 4;
|
|
}
|
|
|
|
fprintf(stderr,
|
|
"[qwen3-tts-cpp] Synthesis done: %d samples (%.2fs @ 24kHz)\n",
|
|
n_samples, (float)n_samples / 24000.0f);
|
|
|
|
if (!save_audio_file(dst, result.audio, result.sample_rate)) {
|
|
fprintf(stderr, "[qwen3-tts-cpp] ERROR: failed to write %s\n", dst);
|
|
return 5;
|
|
}
|
|
|
|
fprintf(stderr, "[qwen3-tts-cpp] Wrote %s\n", dst);
|
|
return 0;
|
|
}
|