mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-14 11:49:33 -04:00
feat(omnivoice-cpp): add OmniVoice TTS backend (file + streaming, voice cloning + voice design) (#10310)
* feat(omnivoice-cpp): add C wrapper + CMake/Makefile build over OmniVoice ov_* ABI Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(omnivoice-cpp): add option/language parsing + WAV framing helpers with tests Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(omnivoice-cpp): wire purego binding with TTS + streaming TTSStream Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * build(omnivoice-cpp): wire backend into root Makefile Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * ci(omnivoice-cpp): add build matrix entries + dep-bump registration Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(omnivoice-cpp): register backend meta + image entries Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(omnivoice-cpp): expose as preference-only importable backend Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(gallery): add omnivoice-cpp TTS models (Q8_0 default + BF16 HQ) Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * docs(omnivoice-cpp): document the OmniVoice TTS backend Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * test(omnivoice-cpp): add env-gated e2e for TTS + streaming Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(omnivoice-cpp): honor tts.audio_path/tts.voice config as default cloning reference The model config tts.audio_path (ModelOptions.AudioPath) and tts.voice now provide a default voice-cloning reference used when a request omits Voice, so a cloned voice can be pinned 
in the model YAML instead of passed per request. A per-request voice still overrides. Paths resolve relative to the model dir. Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(omnivoice-cpp): add missing omnivoice-cpp-development backend meta Mirrors the whisper/vibevoice convention: a -development meta aggregating the master-tagged image variants (the production meta and per-variant prod+dev image entries already existed; only the development meta aggregator was missing). Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
166
backend/go/omnivoice-cpp/cpp/gomnivoicecpp.cpp
Normal file
166
backend/go/omnivoice-cpp/cpp/gomnivoicecpp.cpp
Normal file
@@ -0,0 +1,166 @@
|
||||
#include "gomnivoicecpp.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "omnivoice.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
|
||||
// Single OmniVoice context shared by every entry point in this file:
// assigned by omni_load, read by omni_tts / omni_tts_stream, and released
// and reset to null by omni_unload. Null means no model is loaded.
// NOTE(review): nothing in this file serializes access to it — confirm that
// callers never load/unload concurrently with synthesis.
static ov_context *g_ctx = nullptr;
|
||||
|
||||
static void ggml_log_cb(enum ggml_log_level level, const char *log,
|
||||
void * /*data*/) {
|
||||
if (!log)
|
||||
return;
|
||||
const char *lvl = "?????";
|
||||
switch (level) {
|
||||
case GGML_LOG_LEVEL_DEBUG: lvl = "DEBUG"; break;
|
||||
case GGML_LOG_LEVEL_INFO: lvl = "INFO"; break;
|
||||
case GGML_LOG_LEVEL_WARN: lvl = "WARN"; break;
|
||||
case GGML_LOG_LEVEL_ERROR: lvl = "ERROR"; break;
|
||||
default: break;
|
||||
}
|
||||
fprintf(stderr, "[%-5s] %s", lvl, log);
|
||||
fflush(stderr);
|
||||
}
|
||||
|
||||
int omni_load(const char *model_path, const char *codec_path, int use_fa,
|
||||
int clamp_fp16) {
|
||||
ggml_log_set(ggml_log_cb, nullptr);
|
||||
ggml_backend_load_all();
|
||||
|
||||
if (!model_path || model_path[0] == '\0') {
|
||||
fprintf(stderr, "[omnivoice-cpp] ERROR: model_path is required\n");
|
||||
return 1;
|
||||
}
|
||||
if (!codec_path || codec_path[0] == '\0') {
|
||||
fprintf(stderr, "[omnivoice-cpp] ERROR: codec_path is required\n");
|
||||
return 2;
|
||||
}
|
||||
|
||||
ov_init_params p;
|
||||
ov_init_default_params(&p);
|
||||
p.model_path = model_path;
|
||||
p.codec_path = codec_path;
|
||||
p.use_fa = use_fa != 0;
|
||||
p.clamp_fp16 = clamp_fp16 != 0;
|
||||
|
||||
fprintf(stderr, "[omnivoice-cpp] Loading model=%s codec=%s\n", model_path,
|
||||
codec_path);
|
||||
|
||||
g_ctx = ov_init(&p);
|
||||
if (!g_ctx) {
|
||||
fprintf(stderr, "[omnivoice-cpp] FATAL: ov_init failed: %s\n",
|
||||
ov_last_error());
|
||||
return 3;
|
||||
}
|
||||
fprintf(stderr, "[omnivoice-cpp] Model loaded (%s)\n", ov_version());
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Fill an ov_tts_params from the flat wrapper arguments.
|
||||
static void fill_params(ov_tts_params *tp, const char *text, const char *lang,
|
||||
const char *instruct, const float *ref_samples,
|
||||
int ref_n, const char *ref_text, long long seed,
|
||||
int denoise) {
|
||||
ov_tts_default_params(tp);
|
||||
tp->text = text ? text : "";
|
||||
tp->lang = lang ? lang : "";
|
||||
if (instruct && instruct[0] != '\0')
|
||||
tp->instruct = instruct;
|
||||
if (ref_samples && ref_n > 0) {
|
||||
tp->ref_audio_24k = ref_samples;
|
||||
tp->ref_n_samples = ref_n;
|
||||
if (ref_text && ref_text[0] != '\0')
|
||||
tp->ref_text = ref_text;
|
||||
tp->denoise = denoise != 0;
|
||||
}
|
||||
if (seed >= 0)
|
||||
tp->mg_seed = (uint64_t)seed;
|
||||
}
|
||||
|
||||
float *omni_tts(const char *text, const char *lang, const char *instruct,
|
||||
const float *ref_samples, int ref_n, const char *ref_text,
|
||||
long long seed, int denoise, int *out_n) {
|
||||
if (out_n)
|
||||
*out_n = 0;
|
||||
if (!g_ctx) {
|
||||
fprintf(stderr, "[omnivoice-cpp] ERROR: model not loaded\n");
|
||||
return nullptr;
|
||||
}
|
||||
if (!text || text[0] == '\0') {
|
||||
fprintf(stderr, "[omnivoice-cpp] ERROR: text is required\n");
|
||||
return nullptr; // omni_tts: out_n already 0
|
||||
}
|
||||
ov_tts_params tp;
|
||||
fill_params(&tp, text, lang, instruct, ref_samples, ref_n, ref_text, seed,
|
||||
denoise);
|
||||
|
||||
ov_audio out = {0};
|
||||
enum ov_status rc = ov_synthesize(g_ctx, &tp, &out);
|
||||
if (rc != OV_STATUS_OK || out.n_samples <= 0 || !out.samples) {
|
||||
fprintf(stderr, "[omnivoice-cpp] ERROR: synthesize failed (rc=%d): %s\n",
|
||||
(int)rc, ov_last_error());
|
||||
ov_audio_free(&out);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Copy into a plain malloc buffer the Go side can free symmetrically via
|
||||
// omni_pcm_free; then release the ov_audio-owned buffer.
|
||||
size_t bytes = (size_t)out.n_samples * sizeof(float);
|
||||
float *buf = (float *)malloc(bytes);
|
||||
if (!buf) {
|
||||
fprintf(stderr, "[omnivoice-cpp] ERROR: malloc(%zu) failed\n", bytes);
|
||||
ov_audio_free(&out);
|
||||
return nullptr;
|
||||
}
|
||||
memcpy(buf, out.samples, bytes);
|
||||
if (out_n)
|
||||
*out_n = out.n_samples;
|
||||
ov_audio_free(&out);
|
||||
return buf;
|
||||
}
|
||||
|
||||
int omni_tts_stream(const char *text, const char *lang, const char *instruct,
|
||||
const float *ref_samples, int ref_n, const char *ref_text,
|
||||
long long seed, int denoise, omni_pcm_chunk_cb cb,
|
||||
void *user_data) {
|
||||
if (!g_ctx) {
|
||||
fprintf(stderr, "[omnivoice-cpp] ERROR: model not loaded\n");
|
||||
return 1;
|
||||
}
|
||||
if (!cb) {
|
||||
fprintf(stderr, "[omnivoice-cpp] ERROR: stream callback is null\n");
|
||||
return 2;
|
||||
}
|
||||
if (!text || text[0] == '\0') {
|
||||
fprintf(stderr, "[omnivoice-cpp] ERROR: text is required\n");
|
||||
return 4;
|
||||
}
|
||||
ov_tts_params tp;
|
||||
fill_params(&tp, text, lang, instruct, ref_samples, ref_n, ref_text, seed,
|
||||
denoise);
|
||||
// ov_audio_chunk_cb has the identical signature to omni_pcm_chunk_cb
|
||||
// (bool vs int return are ABI-compatible; non-zero == true).
|
||||
tp.on_chunk = (ov_audio_chunk_cb)cb;
|
||||
tp.on_chunk_user_data = user_data;
|
||||
|
||||
ov_audio out = {0}; // stays empty in streaming mode
|
||||
enum ov_status rc = ov_synthesize(g_ctx, &tp, &out);
|
||||
ov_audio_free(&out);
|
||||
if (rc != OV_STATUS_OK && rc != OV_STATUS_CANCELLED) {
|
||||
fprintf(stderr, "[omnivoice-cpp] ERROR: stream synth failed (rc=%d): %s\n",
|
||||
(int)rc, ov_last_error());
|
||||
return 3;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Release a sample buffer returned by omni_tts. The buffer comes from
// malloc, so it is handed straight to free(); a null pointer is accepted.
void omni_pcm_free(float *p) {
    free(p);
}
|
||||
|
||||
void omni_unload(void) {
|
||||
if (g_ctx) {
|
||||
ov_free(g_ctx);
|
||||
g_ctx = nullptr;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user