diff --git a/backend/go/qwen3-tts-cpp/CMakeLists.txt b/backend/go/qwen3-tts-cpp/CMakeLists.txt index abf10621b..1245d8db6 100644 --- a/backend/go/qwen3-tts-cpp/CMakeLists.txt +++ b/backend/go/qwen3-tts-cpp/CMakeLists.txt @@ -3,35 +3,36 @@ project(goqwen3ttscpp LANGUAGES C CXX) set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -set(QWEN3TTS_DIR ${CMAKE_CURRENT_SOURCE_DIR}/sources/qwen3-tts.cpp) +set(QWENTTS_DIR ${CMAKE_CURRENT_SOURCE_DIR}/sources/qwentts.cpp) # Override upstream's CMAKE_CUDA_ARCHITECTURES before add_subdirectory. if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real") endif() -# Build ggml from the upstream's submodule FIRST, so that ggml/ggml-base/ggml-cpu -# CMake targets exist when the upstream project references them by name. -# The upstream CMakeLists.txt uses target_link_libraries(... ggml ggml-base ggml-cpu) -# with target_link_directories pointing at a pre-built ggml/build/. By adding ggml -# as a subdirectory here, CMake resolves those names as targets instead. -add_subdirectory(${QWEN3TTS_DIR}/ggml ggml EXCLUDE_FROM_ALL) +# Add the upstream project. Its own CMakeLists adds ggml + cpp-httplib + yyjson +# and builds qwen-core (STATIC, the qt_* impl). EXCLUDE_FROM_ALL keeps its CLI +# tools / tts-server / tests from building unless referenced. +add_subdirectory(${QWENTTS_DIR} qwentts EXCLUDE_FROM_ALL) -# Now add the upstream project -add_subdirectory(${QWEN3TTS_DIR} qwen3tts EXCLUDE_FROM_ALL) +# Upstream generates version.h into its own CMAKE_CURRENT_BINARY_DIR and adds +# the top-level ${CMAKE_BINARY_DIR} to qwen-core's include path. Under +# add_subdirectory those two dirs differ (${CMAKE_BINARY_DIR}/qwentts vs ${CMAKE_BINARY_DIR}), so +# qwen.cpp cannot find version.h. Point qwen-core at the subproject binary dir +# where version.h is actually generated. (Fix lives here, never in the fetched +# upstream checkout.) 
+target_include_directories(qwen-core PRIVATE ${CMAKE_BINARY_DIR}/qwentts) add_library(goqwen3ttscpp MODULE cpp/goqwen3ttscpp.cpp) -target_link_libraries(goqwen3ttscpp PRIVATE qwen3_tts) +target_link_libraries(goqwen3ttscpp PRIVATE qwen-core) -target_include_directories(goqwen3ttscpp PRIVATE ${QWEN3TTS_DIR}/src) -target_include_directories(goqwen3ttscpp SYSTEM PRIVATE ${QWEN3TTS_DIR}/ggml/include) +target_include_directories(goqwen3ttscpp PRIVATE ${QWENTTS_DIR}/src) +target_include_directories(goqwen3ttscpp SYSTEM PRIVATE ${QWENTTS_DIR}/ggml/include) -# Link GPU backends if available -foreach(backend blas cuda metal vulkan) +# Link GPU backends if the upstream ggml created them. +foreach(backend blas cuda metal vulkan sycl) if(TARGET ggml-${backend}) target_link_libraries(goqwen3ttscpp PRIVATE ggml-${backend}) - string(TOUPPER ${backend} BACKEND_UPPER) - target_compile_definitions(goqwen3ttscpp PRIVATE QWEN3TTS_HAVE_${BACKEND_UPPER}) if(backend STREQUAL "cuda") find_package(CUDAToolkit QUIET) if(CUDAToolkit_FOUND) @@ -44,12 +45,8 @@ endforeach() if(MSVC) target_compile_options(goqwen3ttscpp PRIVATE /W4 /wd4100 /wd4505) else() - target_compile_options(goqwen3ttscpp PRIVATE -Wall -Wextra -Wshadow -Wconversion - -Wno-unused-parameter -Wno-unused-function -Wno-sign-conversion) -endif() - -if(CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.0) - target_link_libraries(goqwen3ttscpp PRIVATE stdc++fs) + target_compile_options(goqwen3ttscpp PRIVATE -Wall -Wextra + -Wno-unused-parameter -Wno-unused-function) endif() set_property(TARGET goqwen3ttscpp PROPERTY CXX_STANDARD 17) diff --git a/backend/go/qwen3-tts-cpp/Makefile b/backend/go/qwen3-tts-cpp/Makefile index e35b1c133..e5f6a838f 100644 --- a/backend/go/qwen3-tts-cpp/Makefile +++ b/backend/go/qwen3-tts-cpp/Makefile @@ -6,9 +6,9 @@ GOCMD?=go GO_TAGS?= JOBS?=$(shell nproc --ignore=1) -# qwen3-tts.cpp version -QWEN3TTS_REPO?=https://github.com/predict-woo/qwen3-tts.cpp 
-QWEN3TTS_CPP_VERSION?=136e5d36c17083da0321fd96512dc7b263f94a44 +# qwentts.cpp version +QWEN3TTS_REPO?=https://github.com/ServeurpersoCom/qwentts.cpp +QWEN3TTS_CPP_VERSION?=0bf4a18b22e8bb8718d95294e9f7f45c0d4270a4 SO_TARGET?=libgoqwen3ttscpp.so CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF @@ -49,9 +49,9 @@ ifeq ($(BUILD_TYPE),sycl_f32) -DCMAKE_CXX_COMPILER=icpx endif -sources/qwen3-tts.cpp: - mkdir -p sources/qwen3-tts.cpp - cd sources/qwen3-tts.cpp && \ +sources/qwentts.cpp: + mkdir -p sources/qwentts.cpp + cd sources/qwentts.cpp && \ git init && \ git remote add origin $(QWEN3TTS_REPO) && \ git fetch origin && \ @@ -78,7 +78,7 @@ package: qwen3-tts-cpp build: package clean: purge - rm -rf libgoqwen3ttscpp*.so package sources/qwen3-tts.cpp qwen3-tts-cpp + rm -rf libgoqwen3ttscpp*.so package sources/qwentts.cpp qwen3-tts-cpp purge: rm -rf build* @@ -88,24 +88,24 @@ purge: # Build all variants (Linux only) ifeq ($(UNAME_S),Linux) -libgoqwen3ttscpp-avx.so: sources/qwen3-tts.cpp +libgoqwen3ttscpp-avx.so: sources/qwentts.cpp $(info ${GREEN}I qwen3-tts-cpp build info:avx${RESET}) SO_TARGET=libgoqwen3ttscpp-avx.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgoqwen3ttscpp-custom rm -rf build-libgoqwen3ttscpp-avx.so -libgoqwen3ttscpp-avx2.so: sources/qwen3-tts.cpp +libgoqwen3ttscpp-avx2.so: sources/qwentts.cpp $(info ${GREEN}I qwen3-tts-cpp build info:avx2${RESET}) SO_TARGET=libgoqwen3ttscpp-avx2.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) libgoqwen3ttscpp-custom rm -rf build-libgoqwen3ttscpp-avx2.so -libgoqwen3ttscpp-avx512.so: sources/qwen3-tts.cpp +libgoqwen3ttscpp-avx512.so: sources/qwentts.cpp $(info ${GREEN}I qwen3-tts-cpp build info:avx512${RESET}) SO_TARGET=libgoqwen3ttscpp-avx512.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" 
$(MAKE) libgoqwen3ttscpp-custom rm -rf build-libgoqwen3ttscpp-avx512.so endif # Build fallback variant (all platforms) -libgoqwen3ttscpp-fallback.so: sources/qwen3-tts.cpp +libgoqwen3ttscpp-fallback.so: sources/qwentts.cpp $(info ${GREEN}I qwen3-tts-cpp build info:fallback${RESET}) SO_TARGET=libgoqwen3ttscpp-fallback.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgoqwen3ttscpp-custom rm -rf build-libgoqwen3ttscpp-fallback.so diff --git a/backend/go/qwen3-tts-cpp/audio.go b/backend/go/qwen3-tts-cpp/audio.go new file mode 100644 index 000000000..0aa13964e --- /dev/null +++ b/backend/go/qwen3-tts-cpp/audio.go @@ -0,0 +1,128 @@ +package main + +import ( + "bytes" + "encoding/binary" + "fmt" + "os" + "runtime" + + "github.com/go-audio/audio" + "github.com/go-audio/wav" +) + +const qwen3ttsSampleRate = 24000 + +// wavHeader24k returns a 44-byte WAV header for a streaming 24 kHz mono 16-bit +// PCM stream, with placeholder (0xFFFFFFFF) sizes since the total length is +// unknown up front. Emitted as the first chunk of TTSStream so the HTTP layer +// receives a self-describing WAV (the gRPC TTSStream path never sets Message, +// so the backend owns the header - see core/backend/tts.go:ModelTTSStream). +func wavHeader24k() []byte { + var buf bytes.Buffer + w := func(v any) { _ = binary.Write(&buf, binary.LittleEndian, v) } + buf.WriteString("RIFF") + w(uint32(0xFFFFFFFF)) + buf.WriteString("WAVE") + buf.WriteString("fmt ") + w(uint32(16)) // Subchunk1Size + w(uint16(1)) // PCM + w(uint16(1)) // mono + w(uint32(qwen3ttsSampleRate)) // sample rate + w(uint32(qwen3ttsSampleRate * 2)) // byte rate = SR * blockAlign + w(uint16(2)) // block align (16-bit mono) + w(uint16(16)) // bits per sample + buf.WriteString("data") + w(uint32(0xFFFFFFFF)) + return buf.Bytes() +} + +// floatToPCM16LE clamps each sample to [-1,1] and encodes it as little-endian +// signed 16-bit PCM. 
+func floatToPCM16LE(samples []float32) []byte { + out := make([]byte, len(samples)*2) + for i, s := range samples { + if s > 1 { + s = 1 + } else if s < -1 { + s = -1 + } + v := int16(s * 32767) + out[i*2] = byte(v) + out[i*2+1] = byte(v >> 8) + } + return out +} + +// writeWAV24k writes samples as a finalized 24 kHz mono 16-bit WAV at dst. +func writeWAV24k(dst string, samples []float32) error { + f, err := os.Create(dst) + if err != nil { + return fmt.Errorf("qwen3-tts: create %q: %w", dst, err) + } + enc := wav.NewEncoder(f, qwen3ttsSampleRate, 16, 1, 1) + ints := make([]int, len(samples)) + for i, s := range samples { + if s > 1 { + s = 1 + } else if s < -1 { + s = -1 + } + ints[i] = int(s * 32767) + } + b := &audio.IntBuffer{ + Format: &audio.Format{NumChannels: 1, SampleRate: qwen3ttsSampleRate}, + Data: ints, + SourceBitDepth: 16, + } + if err := enc.Write(b); err != nil { + _ = enc.Close() + _ = f.Close() + return fmt.Errorf("qwen3-tts: encode WAV: %w", err) + } + if err := enc.Close(); err != nil { + _ = f.Close() + return fmt.Errorf("qwen3-tts: finalize WAV: %w", err) + } + return f.Close() +} + +// readWAVAsFloat decodes a WAV file (any sample rate/channels) to a mono +// float32 slice in [-1,1] for use as cloning reference audio. qwentts expects +// 24 kHz; callers should supply 24 kHz reference clips. 
+func readWAVAsFloat(path string) ([]float32, error) { + f, err := os.Open(path) + if err != nil { + return nil, fmt.Errorf("qwen3-tts: open ref %q: %w", path, err) + } + defer func() { _ = f.Close() }() + + dec := wav.NewDecoder(f) + buf, err := dec.FullPCMBuffer() + if err != nil { + return nil, fmt.Errorf("qwen3-tts: decode ref %q: %w", path, err) + } + ch := int(buf.Format.NumChannels) + if ch < 1 { + ch = 1 + } + bitDepth := int(buf.SourceBitDepth) + if bitDepth == 0 { + bitDepth = 16 + } + scale := float32(int64(1) << uint(bitDepth-1)) + n := len(buf.Data) / ch + out := make([]float32, n) + for i := 0; i < n; i++ { + var acc int + for c := 0; c < ch; c++ { + acc += buf.Data[i*ch+c] + } + out[i] = float32(acc) / float32(ch) / scale + } + return out, nil +} + +// runtimeKeepAlive prevents the GC from reclaiming the reference-audio slice +// while its backing pointer is in use across the C call. +func runtimeKeepAlive(v any) { runtime.KeepAlive(v) } diff --git a/backend/go/qwen3-tts-cpp/audiopath_test.go b/backend/go/qwen3-tts-cpp/audiopath_test.go new file mode 100644 index 000000000..aae1b1aff --- /dev/null +++ b/backend/go/qwen3-tts-cpp/audiopath_test.go @@ -0,0 +1,54 @@ +package main + +import ( + "path/filepath" + + pb "github.com/mudler/LocalAI/pkg/grpc/proto" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// These specs pin the voice-selection logic in resolveRequest, in particular +// the config-level audio_path (tts.audio_path -> ModelOptions.AudioPath) being +// used as the default voice-cloning reference. No model/C library is needed: +// resolveRequest only reads the reference WAV via readWAVAsFloat (pure Go). +var _ = Describe("resolveRequest voice/clone selection", func() { + var dir, refWav string + + BeforeEach(func() { + dir = GinkgoT().TempDir() + refWav = filepath.Join(dir, "ref.wav") + // 0.5s of non-silent 24kHz mono audio as a clone reference. 
+ samples := make([]float32, qwen3ttsSampleRate/2) + for i := range samples { + samples[i] = 0.1 + } + Expect(writeWAV24k(refWav, samples)).To(Succeed()) + }) + + It("uses the config audio_path as the clone reference when Voice is empty", func() { + q := &Qwen3TtsCpp{audioPath: refWav} + _, _, speaker, _, ref, _, err := q.resolveRequest(&pb.TTSRequest{Text: "hi"}) + Expect(err).ToNot(HaveOccurred()) + Expect(speaker).To(BeEmpty()) + Expect(len(ref)).To(Equal(qwen3ttsSampleRate / 2)) + }) + + It("lets a per-request audio Voice override audio_path", func() { + other := filepath.Join(dir, "other.wav") + Expect(writeWAV24k(other, make([]float32, 100))).To(Succeed()) + q := &Qwen3TtsCpp{audioPath: refWav} + _, _, speaker, _, ref, _, err := q.resolveRequest(&pb.TTSRequest{Text: "hi", Voice: other}) + Expect(err).ToNot(HaveOccurred()) + Expect(speaker).To(BeEmpty()) + Expect(len(ref)).To(Equal(100)) + }) + + It("does not trigger audio_path cloning for a named-speaker Voice", func() { + q := &Qwen3TtsCpp{audioPath: refWav} + _, _, speaker, _, ref, _, err := q.resolveRequest(&pb.TTSRequest{Text: "hi", Voice: "serena"}) + Expect(err).ToNot(HaveOccurred()) + Expect(speaker).To(Equal("serena")) + Expect(ref).To(BeNil()) + }) +}) diff --git a/backend/go/qwen3-tts-cpp/cpp/goqwen3ttscpp.cpp b/backend/go/qwen3-tts-cpp/cpp/goqwen3ttscpp.cpp index b2de53620..0cd62293d 100644 --- a/backend/go/qwen3-tts-cpp/cpp/goqwen3ttscpp.cpp +++ b/backend/go/qwen3-tts-cpp/cpp/goqwen3ttscpp.cpp @@ -1,161 +1,191 @@ #include "goqwen3ttscpp.h" #include "ggml-backend.h" -#include "qwen3_tts.h" +#include "qwen.h" -#include #include #include #include -#include -using namespace qwen3_tts; +static qt_context *g_ctx = nullptr; -// Global engine (loaded once, reused across requests) -static Qwen3TTS *g_engine = nullptr; -static bool g_loaded = false; -static int g_threads = 4; - -static void ggml_log_cb(enum ggml_log_level level, const char *log, void *data) { - const char *level_str; +static void 
ggml_log_cb(enum ggml_log_level level, const char *log, + void * /*data*/) { if (!log) return; + const char *lvl = "?????"; switch (level) { - case GGML_LOG_LEVEL_DEBUG: - level_str = "DEBUG"; - break; - case GGML_LOG_LEVEL_INFO: - level_str = "INFO"; - break; - case GGML_LOG_LEVEL_WARN: - level_str = "WARN"; - break; - case GGML_LOG_LEVEL_ERROR: - level_str = "ERROR"; - break; - default: - level_str = "?????"; - break; + case GGML_LOG_LEVEL_DEBUG: lvl = "DEBUG"; break; + case GGML_LOG_LEVEL_INFO: lvl = "INFO"; break; + case GGML_LOG_LEVEL_WARN: lvl = "WARN"; break; + case GGML_LOG_LEVEL_ERROR: lvl = "ERROR"; break; + default: break; } - fprintf(stderr, "[%-5s] ", level_str); - fputs(log, stderr); + fprintf(stderr, "[%-5s] %s", lvl, log); fflush(stderr); } -// Map language string to language_id token used by the model -static int language_to_id(const char *lang) { - if (!lang || lang[0] == '\0') - return 2050; // default: English - std::string l(lang); - if (l == "en") - return 2050; - if (l == "ru") - return 2069; - if (l == "zh") - return 2055; - if (l == "ja") - return 2058; - if (l == "ko") - return 2064; - if (l == "de") - return 2053; - if (l == "fr") - return 2061; - if (l == "es") - return 2054; - if (l == "it") - return 2056; - if (l == "pt") - return 2057; - fprintf(stderr, "[qwen3-tts-cpp] Unknown language '%s', defaulting to English\n", - lang); - return 2050; -} - -int load_model(const char *model_dir, int n_threads) { +int qt3_load(const char *talker_path, const char *codec_path, int use_fa, + int clamp_fp16) { ggml_log_set(ggml_log_cb, nullptr); ggml_backend_load_all(); - if (n_threads <= 0) - n_threads = 4; - g_threads = n_threads; - - fprintf(stderr, "[qwen3-tts-cpp] Loading models from %s (threads=%d)\n", - model_dir, n_threads); - - g_engine = new Qwen3TTS(); - if (!g_engine->load_models(model_dir)) { - fprintf(stderr, "[qwen3-tts-cpp] FATAL: failed to load models from %s\n", - model_dir); - delete g_engine; - g_engine = nullptr; + if 
(!talker_path || talker_path[0] == '\0') { + fprintf(stderr, "[qwen3-tts-cpp] ERROR: talker_path is required\n"); return 1; } - - g_loaded = true; - fprintf(stderr, "[qwen3-tts-cpp] Models loaded successfully\n"); - return 0; -} - -int synthesize(const char *text, const char *ref_audio_path, const char *dst, - const char *language, float temperature, float top_p, - int top_k, float repetition_penalty, int max_audio_tokens, - int n_threads) { - if (!g_loaded || !g_engine) { - fprintf(stderr, "[qwen3-tts-cpp] ERROR: models not loaded\n"); - return 1; - } - - if (!text || !dst) { - fprintf(stderr, "[qwen3-tts-cpp] ERROR: text and dst are required\n"); + if (!codec_path || codec_path[0] == '\0') { + fprintf(stderr, "[qwen3-tts-cpp] ERROR: codec_path is required\n"); return 2; } - tts_params params; - params.max_audio_tokens = max_audio_tokens > 0 ? max_audio_tokens : 4096; - params.temperature = temperature; - params.top_p = top_p; - params.top_k = top_k; - params.repetition_penalty = repetition_penalty; - params.n_threads = n_threads > 0 ? n_threads : g_threads; - params.language_id = language_to_id(language); + qt_init_params p; + qt_init_default_params(&p); + p.talker_path = talker_path; + p.codec_path = codec_path; + p.use_fa = use_fa != 0; + p.clamp_fp16 = clamp_fp16 != 0; - fprintf(stderr, "[qwen3-tts-cpp] Synthesizing: text='%.50s%s', lang_id=%d, " - "temp=%.2f, threads=%d\n", - text, (strlen(text) > 50 ? "..." 
: ""), params.language_id, - temperature, params.n_threads); + fprintf(stderr, "[qwen3-tts-cpp] Loading talker=%s codec=%s\n", talker_path, + codec_path); - tts_result result; - bool has_ref = ref_audio_path && ref_audio_path[0] != '\0'; - - if (has_ref) { - fprintf(stderr, "[qwen3-tts-cpp] Voice cloning with ref: %s\n", - ref_audio_path); - result = g_engine->synthesize_with_voice(text, ref_audio_path, params); - } else { - result = g_engine->synthesize(text, params); - } - - if (!result.success) { - fprintf(stderr, "[qwen3-tts-cpp] ERROR: synthesis failed: %s\n", - result.error_msg.c_str()); + g_ctx = qt_init(&p); + if (!g_ctx) { + fprintf(stderr, "[qwen3-tts-cpp] FATAL: qt_init failed: %s\n", + qt_last_error()); return 3; } - - int n_samples = (int)result.audio.size(); - if (n_samples == 0) { - fprintf(stderr, "[qwen3-tts-cpp] ERROR: synthesis produced no samples\n"); - return 4; - } - - fprintf(stderr, - "[qwen3-tts-cpp] Synthesis done: %d samples (%.2fs @ 24kHz)\n", - n_samples, (float)n_samples / 24000.0f); - - if (!save_audio_file(dst, result.audio, result.sample_rate)) { - fprintf(stderr, "[qwen3-tts-cpp] ERROR: failed to write %s\n", dst); - return 5; - } - - fprintf(stderr, "[qwen3-tts-cpp] Wrote %s\n", dst); + fprintf(stderr, "[qwen3-tts-cpp] Model loaded (%s)\n", qt_version()); return 0; } + +// Fill a qt_tts_params from the flat wrapper arguments. Unset/zero scalars keep +// the qt defaults (temperature 0.9, top_k 50, top_p 1.0, rep 1.05, max 2048). +static void fill_params(qt_tts_params *tp, const char *text, const char *lang, + const char *instruct, const char *speaker, + const float *ref_samples, int ref_n, + const char *ref_text, long long seed, float temperature, + int top_k, float top_p, float repetition_penalty, + int max_new_tokens) { + qt_tts_default_params(tp); + tp->text = text ? 
text : ""; + if (lang && lang[0] != '\0') + tp->lang = lang; // else keep default NULL -> auto + if (instruct && instruct[0] != '\0') + tp->instruct = instruct; + if (speaker && speaker[0] != '\0') + tp->speaker = speaker; + if (ref_samples && ref_n > 0) { + tp->ref_audio_24k = ref_samples; + tp->ref_n_samples = ref_n; + if (ref_text && ref_text[0] != '\0') + tp->ref_text = ref_text; + } + if (seed >= 0) + tp->seed = (int64_t)seed; // else default -1 (random) + if (temperature > 0.0f) + tp->temperature = temperature; + if (top_k > 0) + tp->top_k = top_k; + if (top_p > 0.0f) + tp->top_p = top_p; + if (repetition_penalty > 0.0f) + tp->repetition_penalty = repetition_penalty; + if (max_new_tokens > 0) + tp->max_new_tokens = max_new_tokens; +} + +float *qt3_tts(const char *text, const char *lang, const char *instruct, + const char *speaker, const float *ref_samples, int ref_n, + const char *ref_text, long long seed, float temperature, + int top_k, float top_p, float repetition_penalty, + int max_new_tokens, int *out_n) { + if (out_n) + *out_n = 0; + if (!g_ctx) { + fprintf(stderr, "[qwen3-tts-cpp] ERROR: model not loaded\n"); + return nullptr; + } + if (!text || text[0] == '\0') { + fprintf(stderr, "[qwen3-tts-cpp] ERROR: text is required\n"); + return nullptr; + } + qt_tts_params tp; + fill_params(&tp, text, lang, instruct, speaker, ref_samples, ref_n, + ref_text, seed, temperature, top_k, top_p, repetition_penalty, + max_new_tokens); + + qt_audio out = {0}; + enum qt_status rc = qt_synthesize(g_ctx, &tp, &out); + if (rc != QT_STATUS_OK || out.n_samples <= 0 || !out.samples) { + fprintf(stderr, "[qwen3-tts-cpp] ERROR: synthesize failed (rc=%d): %s\n", + (int)rc, qt_last_error()); + qt_audio_free(&out); + return nullptr; + } + + // Copy into a plain malloc buffer the Go side frees via qt3_pcm_free. 
+ size_t bytes = (size_t)out.n_samples * sizeof(float); + float *buf = (float *)malloc(bytes); + if (!buf) { + fprintf(stderr, "[qwen3-tts-cpp] ERROR: malloc(%zu) failed\n", bytes); + qt_audio_free(&out); + return nullptr; + } + memcpy(buf, out.samples, bytes); + if (out_n) + *out_n = out.n_samples; + qt_audio_free(&out); + return buf; +} + +int qt3_tts_stream(const char *text, const char *lang, const char *instruct, + const char *speaker, const float *ref_samples, int ref_n, + const char *ref_text, long long seed, float temperature, + int top_k, float top_p, float repetition_penalty, + int max_new_tokens, qt3_chunk_cb cb, void *user_data) { + if (!g_ctx) { + fprintf(stderr, "[qwen3-tts-cpp] ERROR: model not loaded\n"); + return 1; + } + if (!cb) { + fprintf(stderr, "[qwen3-tts-cpp] ERROR: stream callback is null\n"); + return 2; + } + if (!text || text[0] == '\0') { + fprintf(stderr, "[qwen3-tts-cpp] ERROR: text is required\n"); + return 4; + } + qt_tts_params tp; + fill_params(&tp, text, lang, instruct, speaker, ref_samples, ref_n, + ref_text, seed, temperature, top_k, top_p, repetition_penalty, + max_new_tokens); + // qt_audio_chunk_cb has the identical signature to qt3_chunk_cb + // (bool vs int return are ABI-compatible; non-zero == true). + tp.on_chunk = (qt_audio_chunk_cb)cb; + tp.on_chunk_user_data = user_data; + + qt_audio out = {0}; // stays empty in streaming mode + enum qt_status rc = qt_synthesize(g_ctx, &tp, &out); + qt_audio_free(&out); + if (rc != QT_STATUS_OK && rc != QT_STATUS_CANCELLED) { + fprintf(stderr, "[qwen3-tts-cpp] ERROR: stream synth failed (rc=%d): %s\n", + (int)rc, qt_last_error()); + return 3; + } + return 0; +} + +void qt3_pcm_free(float *p) { free(p); } + +void qt3_unload(void) { + if (g_ctx) { + qt_free(g_ctx); + g_ctx = nullptr; + } +} + +int qt3_n_speakers(void) { return g_ctx ? qt_n_speakers(g_ctx) : 0; } + +const char *qt3_speaker_name(int i) { + return g_ctx ? 
qt_speaker_name(g_ctx, i) : nullptr; +} diff --git a/backend/go/qwen3-tts-cpp/cpp/goqwen3ttscpp.h b/backend/go/qwen3-tts-cpp/cpp/goqwen3ttscpp.h index 51b1c216d..5e7dd4823 100644 --- a/backend/go/qwen3-tts-cpp/cpp/goqwen3ttscpp.h +++ b/backend/go/qwen3-tts-cpp/cpp/goqwen3ttscpp.h @@ -1,12 +1,47 @@ #pragma once -#include -#include - extern "C" { -int load_model(const char *model_dir, int n_threads); -int synthesize(const char *text, const char *ref_audio_path, const char *dst, - const char *language, float temperature, float top_p, - int top_k, float repetition_penalty, int max_audio_tokens, - int n_threads); + +// Streaming PCM chunk callback. samples is mono float PCM at 24 kHz, valid +// only for the duration of the call. Return non-zero to continue, 0 to abort. +typedef int (*qt3_chunk_cb)(const float *samples, int n_samples, + void *user_data); + +// Load the talker + codec/tokenizer GGUFs. use_fa / clamp_fp16 map to +// qt_init_params (the qt ABI exposes no thread count; ggml uses its own +// default). Returns 0 on success, non-zero on failure. +int qt3_load(const char *talker_path, const char *codec_path, int use_fa, + int clamp_fp16); + +// Synthesize to a malloc'd float PCM buffer (caller frees via qt3_pcm_free). +// The synthesis mode (base / custom_voice / voice_design) is auto-detected by +// qt from the talker GGUF; speaker is honoured only for custom_voice, instruct +// for voice_design / custom_voice, and ref_samples (+ optional ref_text) drive +// base-mode cloning. qt enforces the rules and we surface qt_last_error() on +// QT_STATUS_MODE_INVALID. Writes the sample count to *out_n. Returns NULL on +// failure (out_n set to 0). 
+float *qt3_tts(const char *text, const char *lang, const char *instruct, + const char *speaker, const float *ref_samples, int ref_n, + const char *ref_text, long long seed, float temperature, + int top_k, float top_p, float repetition_penalty, + int max_new_tokens, int *out_n); + +// Streaming synthesis: cb is invoked per PCM chunk as audio is produced. Same +// param semantics as qt3_tts. Returns 0 on success. +int qt3_tts_stream(const char *text, const char *lang, const char *instruct, + const char *speaker, const float *ref_samples, int ref_n, + const char *ref_text, long long seed, float temperature, + int top_k, float top_p, float repetition_penalty, + int max_new_tokens, qt3_chunk_cb cb, void *user_data); + +// Free a buffer returned by qt3_tts. +void qt3_pcm_free(float *p); + +// Release the qt context. +void qt3_unload(void); + +// Named-speaker introspection (custom_voice models). Returns 0 / NULL when no +// model is loaded or the index is out of range. +int qt3_n_speakers(void); +const char *qt3_speaker_name(int i); } diff --git a/backend/go/qwen3-tts-cpp/e2e_test.go b/backend/go/qwen3-tts-cpp/e2e_test.go new file mode 100644 index 000000000..eea64d18c --- /dev/null +++ b/backend/go/qwen3-tts-cpp/e2e_test.go @@ -0,0 +1,95 @@ +package main + +import ( + "math" + "os" + "strings" + + "github.com/ebitengine/purego" + pb "github.com/mudler/LocalAI/pkg/grpc/proto" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +func ttsReq(text, voice string, lang *string, dst string) *pb.TTSRequest { + return &pb.TTSRequest{Text: text, Voice: voice, Language: lang, Dst: dst} +} + +var _ = Describe("qwen3-tts-cpp e2e", Label("e2e"), func() { + var loaded bool + + BeforeEach(func() { + modelPath := os.Getenv("QWEN3TTS_MODEL") + codecPath := os.Getenv("QWEN3TTS_CODEC") + if modelPath == "" || codecPath == "" { + Skip("QWEN3TTS_MODEL / QWEN3TTS_CODEC not set; skipping e2e") + } + if !loaded { + lib := os.Getenv("QWEN3TTS_LIBRARY") + if lib == "" { + lib = "./libgoqwen3ttscpp-fallback.so" + } + h, err := purego.Dlopen(lib, purego.RTLD_NOW|purego.RTLD_GLOBAL) + Expect(err).ToNot(HaveOccurred()) + purego.RegisterLibFunc(&CppLoad, h, "qt3_load") + purego.RegisterLibFunc(&CppTTS, h, "qt3_tts") + purego.RegisterLibFunc(&CppTTSStream, h, "qt3_tts_stream") + purego.RegisterLibFunc(&CppPCMFree, h, "qt3_pcm_free") + purego.RegisterLibFunc(&CppUnload, h, "qt3_unload") + Expect(CppLoad(modelPath, codecPath, 1, 0)).To(Equal(0)) + loaded = true + } + }) + + It("synthesizes a WAV file via TTS", func() { + b := &Qwen3TtsCpp{opts: loadOptions{seed: 42, useFA: true}} + dst := GinkgoT().TempDir() + "/out.wav" + lang := "english" + err := b.TTS(ttsReq("Hello world.", "", &lang, dst)) + Expect(err).ToNot(HaveOccurred()) + fi, err := os.Stat(dst) + Expect(err).ToNot(HaveOccurred()) + Expect(fi.Size()).To(BeNumerically(">", int64(44))) + }) + + It("streams audio chunks via TTSStream", func() { + b := &Qwen3TtsCpp{opts: loadOptions{seed: 42, useFA: true}} + results := make(chan []byte, 1024) + lang := "english" + done := make(chan error, 1) + go func() { done <- b.TTSStream(ttsReq("Hello there, streaming test.", "", &lang, ""), results) }() + + var chunks int + var first []byte + for c := range results { + if chunks == 0 { + first = c + } + chunks++ + } + Expect(<-done).ToNot(HaveOccurred()) + Expect(chunks).To(BeNumerically(">=", 2)) + Expect(string(first[0:4])).To(Equal("RIFF")) + 
Expect(strings.HasPrefix(string(first[8:12]), "WAVE")).To(BeTrue()) + }) + + It("clones a voice from the config audio_path reference", func() { + // 1s of 24kHz mono audio as a clone reference; the base model carries + // a speaker encoder, so audio_path drives x-vector voice cloning. + ref := GinkgoT().TempDir() + "/ref.wav" + samples := make([]float32, qwen3ttsSampleRate) + for i := range samples { + samples[i] = float32(0.05 * math.Sin(float64(i)*0.06)) + } + Expect(writeWAV24k(ref, samples)).To(Succeed()) + + b := &Qwen3TtsCpp{opts: loadOptions{seed: 42, useFA: true}, audioPath: ref} + dst := GinkgoT().TempDir() + "/clone.wav" + lang := "english" + // Empty Voice -> the config audio_path is used as the clone reference. + Expect(b.TTS(ttsReq("Cloned voice test.", "", &lang, dst))).To(Succeed()) + fi, err := os.Stat(dst) + Expect(err).ToNot(HaveOccurred()) + Expect(fi.Size()).To(BeNumerically(">", int64(44))) + }) +}) diff --git a/backend/go/qwen3-tts-cpp/goqwen3ttscpp.go b/backend/go/qwen3-tts-cpp/goqwen3ttscpp.go index d34c2b746..e8b3b0feb 100644 --- a/backend/go/qwen3-tts-cpp/goqwen3ttscpp.go +++ b/backend/go/qwen3-tts-cpp/goqwen3ttscpp.go @@ -5,108 +5,225 @@ import ( "os" "path/filepath" "strings" + "sync" + "unsafe" + "github.com/ebitengine/purego" "github.com/mudler/LocalAI/pkg/grpc/base" pb "github.com/mudler/LocalAI/pkg/grpc/proto" ) var ( - CppLoadModel func(modelDir string, nThreads int) int - CppSynthesize func(text, refAudioPath, dst, language string, - temperature, topP float32, topK int, - repetitionPenalty float32, maxAudioTokens, nThreads int) int + // qt3_load(talker_path, codec_path, use_fa, clamp_fp16) int + CppLoad func(talkerPath, codecPath string, useFA, clampFP16 int) int + // qt3_tts(text, lang, instruct, speaker, ref_samples, ref_n, ref_text, + // seed, temperature, top_k, top_p, rep_pen, max_new, out_n) -> float* + CppTTS func(text, lang, instruct, speaker string, refSamples unsafe.Pointer, + refN int, refText string, seed int64, 
temperature float32, topK int, + topP, repPen float32, maxNew int, outN unsafe.Pointer) uintptr + // qt3_tts_stream(..., cb, user) int + CppTTSStream func(text, lang, instruct, speaker string, refSamples unsafe.Pointer, + refN int, refText string, seed int64, temperature float32, topK int, + topP, repPen float32, maxNew int, cb uintptr, user uintptr) int + CppPCMFree func(ptr uintptr) + CppUnload func() ) type Qwen3TtsCpp struct { base.SingleThread - threads int -} - -// languageNameAliases maps common full language names to the canonical -// two-letter code understood by the C++ language_to_id table. -var languageNameAliases = map[string]string{ - "english": "en", - "russian": "ru", - "chinese": "zh", - "japanese": "ja", - "korean": "ko", - "german": "de", - "french": "fr", - "spanish": "es", - "italian": "it", - "portuguese": "pt", -} - -// normalizeLanguage coerces a caller-supplied language into the canonical code -// the model expects. It lowercases, trims, strips any region/locale suffix -// (en-US, en_US, ja.JP -> en/ja), and resolves common full names (english -> en). -// An empty input stays empty so the C++ side applies its English default; an -// unrecognized value is returned normalized so C++ can log it and default. -func normalizeLanguage(lang string) string { - lang = strings.ToLower(strings.TrimSpace(lang)) - if lang == "" { - return "" - } - - // Strip region/locale suffix: keep the segment before the first separator. - if i := strings.IndexAny(lang, "-_."); i >= 0 { - lang = lang[:i] - } - - if code, ok := languageNameAliases[lang]; ok { - return code - } - return lang + opts loadOptions + // audioPath is the model-config reference voice (tts.audio_path), the + // default clone reference when a request omits an audio Voice. 
+ audioPath string } func (q *Qwen3TtsCpp) Load(opts *pb.ModelOptions) error { - // ModelFile is the model directory path (containing GGUF files) - modelDir := opts.ModelFile - if modelDir == "" { - modelDir = opts.ModelPath + model := opts.ModelFile + if model == "" { + model = opts.ModelPath + } + if !filepath.IsAbs(model) && opts.ModelPath != "" { + model = filepath.Join(opts.ModelPath, model) } - // Resolve relative paths - if !filepath.IsAbs(modelDir) && opts.ModelPath != "" { - modelDir = filepath.Join(opts.ModelPath, modelDir) + q.opts = parseOptions(opts.Options) + + // Resolve the codec/tokenizer GGUF: explicit option, else auto-discover a + // *tokenizer*.gguf sibling of the talker model. + codec := q.opts.codecPath + if codec != "" && !filepath.IsAbs(codec) { + codec = filepath.Join(filepath.Dir(model), codec) + } + if codec == "" { + codec = discoverTokenizer(filepath.Dir(model)) + } + if codec == "" { + return fmt.Errorf("qwen3-tts: no codec/tokenizer GGUF found; set option 'tokenizer:'") + } + q.opts.codecPath = codec + + q.audioPath = opts.AudioPath + if q.audioPath != "" && !filepath.IsAbs(q.audioPath) { + q.audioPath = filepath.Join(filepath.Dir(model), q.audioPath) } - threads := int(opts.Threads) - if threads <= 0 { - threads = 4 + useFA := boolToInt(q.opts.useFA) + clamp := boolToInt(q.opts.clampFP16) + + fmt.Fprintf(os.Stderr, "[qwen3-tts-cpp] Load talker=%s codec=%s use_fa=%d clamp_fp16=%d\n", + model, codec, useFA, clamp) + + if rc := CppLoad(model, codec, useFA, clamp); rc != 0 { + return fmt.Errorf("qwen3-tts: failed to load model (rc=%d)", rc) } - q.threads = threads - - fmt.Fprintf(os.Stderr, "[qwen3-tts-cpp] Loading models from: %s (threads=%d)\n", modelDir, threads) - - if ret := CppLoadModel(modelDir, threads); ret != 0 { - return fmt.Errorf("failed to load qwen3-tts model (error code: %d)", ret) - } - return nil } +// discoverTokenizer returns the first *tokenizer*.gguf in dir, or "". 
+func discoverTokenizer(dir string) string { + entries, err := os.ReadDir(dir) + if err != nil { + return "" + } + for _, e := range entries { + name := strings.ToLower(e.Name()) + if strings.Contains(name, "tokenizer") && strings.HasSuffix(name, ".gguf") { + return filepath.Join(dir, e.Name()) + } + } + return "" +} + +func boolToInt(b bool) int { + if b { + return 1 + } + return 0 +} + +func optStr(p *string) string { + if p == nil { + return "" + } + return *p +} + +// resolveRequest derives the synthesis inputs from a TTSRequest: +// language, instruct, speaker, ref-audio samples, ref-text and sampling. +func (q *Qwen3TtsCpp) resolveRequest(req *pb.TTSRequest) (lang, instruct, speaker, refText string, ref []float32, s sampling, err error) { + lang = normalizeLanguage(optStr(req.Language)) + instruct = optStr(req.Instructions) + + var refPath string + speaker, refPath = resolveVoice(req.Voice) + if refPath == "" && speaker == "" && q.audioPath != "" { + // No per-request voice: fall back to the config clone reference. 
+ refPath = q.audioPath + } + if refPath != "" { + ref, err = readWAVAsFloat(refPath) + if err != nil { + return + } + } + + if req.Params != nil { + refText = req.Params["ref_text"] + } + s = parseSampling(req.Params, q.opts.seed) + return +} + func (q *Qwen3TtsCpp) TTS(req *pb.TTSRequest) error { - text := req.Text - voice := req.Voice // reference audio path for voice cloning (empty = no cloning) - dst := req.Dst - language := "" - if req.Language != nil { - language = normalizeLanguage(*req.Language) + if req.Dst == "" { + return fmt.Errorf("qwen3-tts: TTS requires a destination path") + } + if req.Text == "" { + return fmt.Errorf("qwen3-tts: TTS requires text") + } + lang, instruct, speaker, refText, ref, s, err := q.resolveRequest(req) + if err != nil { + return err + } + var refPtr unsafe.Pointer + if len(ref) > 0 { + refPtr = unsafe.Pointer(&ref[0]) } - // Synthesis parameters with sensible defaults - temperature := float32(0.9) - topP := float32(0.8) - topK := 50 - repetitionPenalty := float32(1.05) - maxAudioTokens := 4096 + var n int32 + ptr := CppTTS(req.Text, lang, instruct, speaker, refPtr, len(ref), refText, + s.seed, s.temperature, s.topK, s.topP, s.repPen, s.maxNew, unsafe.Pointer(&n)) + runtimeKeepAlive(ref) + if ptr == 0 { + return fmt.Errorf("qwen3-tts: synthesis failed") + } + // Register the free as soon as we own a non-null buffer, so the n<=0 guard + // below cannot leak it (defensive: the C contract returns NULL on failure). 
+ defer CppPCMFree(ptr) + if n <= 0 { + return fmt.Errorf("qwen3-tts: synthesis produced no samples") + } + src := unsafe.Slice((*float32)(unsafe.Pointer(ptr)), int(n)) //nolint:govet // C-allocated PCM, copied out before free + out := make([]float32, int(n)) + copy(out, src) + return writeWAV24k(req.Dst, out) +} - if ret := CppSynthesize(text, voice, dst, language, - temperature, topP, topK, repetitionPenalty, - maxAudioTokens, q.threads); ret != 0 { - return fmt.Errorf("failed to synthesize audio (error code: %d)", ret) +// streamState carries the active TTSStream channel to the single shared C +// callback. base.SingleThread serializes TTS/TTSStream, so one global slot is +// safe and avoids leaking a purego callback per request (purego callbacks +// cannot be freed and are capped). +var ( + streamMu sync.Mutex + streamChan chan []byte + streamCbOnce sync.Once + streamCbPtr uintptr +) + +// streamCallback is registered once and forwards each PCM chunk to streamChan. +func streamCallback(samples *float32, nSamples int32, _ uintptr) uintptr { + if nSamples <= 0 || samples == nil || streamChan == nil { + return 1 // continue + } + src := unsafe.Slice(samples, int(nSamples)) + cp := make([]float32, int(nSamples)) // copy out of C memory before returning + copy(cp, src) + streamChan <- floatToPCM16LE(cp) + return 1 // continue +} + +func (q *Qwen3TtsCpp) TTSStream(req *pb.TTSRequest, results chan []byte) error { + defer close(results) + if req.Text == "" { + return fmt.Errorf("qwen3-tts: TTSStream requires text") } + streamCbOnce.Do(func() { + streamCbPtr = purego.NewCallback(streamCallback) + }) + + lang, instruct, speaker, refText, ref, s, err := q.resolveRequest(req) + if err != nil { + return err + } + var refPtr unsafe.Pointer + if len(ref) > 0 { + refPtr = unsafe.Pointer(&ref[0]) + } + + // Emit the WAV header first so the HTTP layer gets a self-describing stream. 
+ results <- wavHeader24k() + + streamMu.Lock() + streamChan = results + rc := CppTTSStream(req.Text, lang, instruct, speaker, refPtr, len(ref), refText, + s.seed, s.temperature, s.topK, s.topP, s.repPen, s.maxNew, streamCbPtr, 0) + streamChan = nil + streamMu.Unlock() + runtimeKeepAlive(ref) + + if rc != 0 { + return fmt.Errorf("qwen3-tts: streaming synthesis failed (rc=%d)", rc) + } return nil } diff --git a/backend/go/qwen3-tts-cpp/language_test.go b/backend/go/qwen3-tts-cpp/language_test.go deleted file mode 100644 index 9c3526669..000000000 --- a/backend/go/qwen3-tts-cpp/language_test.go +++ /dev/null @@ -1,53 +0,0 @@ -package main - -import ( - "testing" - - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" -) - -func TestLanguageNormalization(t *testing.T) { - RegisterFailHandler(Fail) - RunSpecs(t, "qwen3-tts-cpp language normalization") -} - -var _ = Describe("normalizeLanguage", func() { - DescribeTable("maps caller input to the canonical model language code", - func(input, expected string) { - Expect(normalizeLanguage(input)).To(Equal(expected)) - }, - // Canonical codes pass through unchanged - Entry("canonical en", "en", "en"), - Entry("canonical zh", "zh", "zh"), - Entry("canonical pt", "pt", "pt"), - - // Case-insensitive - Entry("uppercase", "EN", "en"), - Entry("mixed case", "Ja", "ja"), - - // Surrounding whitespace - Entry("trims whitespace", " en ", "en"), - - // Region/locale stripping - Entry("BCP-47 region", "en-US", "en"), - Entry("underscore region", "en_US", "en"), - Entry("dotted locale", "ja.JP", "ja"), - Entry("region + case", "ZH-CN", "zh"), - - // Full-name aliases - Entry("english name", "english", "en"), - Entry("chinese name cased", "Chinese", "zh"), - Entry("japanese name", "japanese", "ja"), - Entry("russian name", "russian", "ru"), - Entry("portuguese name", "portuguese", "pt"), - - // Empty stays empty (C++ applies the English default) - Entry("empty", "", ""), - Entry("whitespace only", " ", ""), - - // Unknown values 
pass through normalized so C++ can log + default - Entry("unknown code", "klingon", "klingon"), - Entry("unknown with region", "xx-YY", "xx"), - ) -}) diff --git a/backend/go/qwen3-tts-cpp/main.go b/backend/go/qwen3-tts-cpp/main.go index d10239ccc..b788229cd 100644 --- a/backend/go/qwen3-tts-cpp/main.go +++ b/backend/go/qwen3-tts-cpp/main.go @@ -19,24 +19,25 @@ type LibFuncs struct { } func main() { - // Get library name from environment variable, default to fallback libName := os.Getenv("QWEN3TTS_LIBRARY") if libName == "" { libName = "./libgoqwen3ttscpp-fallback.so" } - gosd, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL) + lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL) if err != nil { panic(err) } libFuncs := []LibFuncs{ - {&CppLoadModel, "load_model"}, - {&CppSynthesize, "synthesize"}, + {&CppLoad, "qt3_load"}, + {&CppTTS, "qt3_tts"}, + {&CppTTSStream, "qt3_tts_stream"}, + {&CppPCMFree, "qt3_pcm_free"}, + {&CppUnload, "qt3_unload"}, } - for _, lf := range libFuncs { - purego.RegisterLibFunc(lf.FuncPtr, gosd, lf.Name) + purego.RegisterLibFunc(lf.FuncPtr, lib, lf.Name) } flag.Parse() diff --git a/backend/go/qwen3-tts-cpp/options.go b/backend/go/qwen3-tts-cpp/options.go new file mode 100644 index 000000000..67837db0e --- /dev/null +++ b/backend/go/qwen3-tts-cpp/options.go @@ -0,0 +1,161 @@ +package main + +import ( + "strconv" + "strings" +) + +// loadOptions holds the parsed model-level options. +type loadOptions struct { + codecPath string + useFA bool + clampFP16 bool + seed int64 +} + +// sampling holds per-request generation parameters with qt defaults applied. 
+type sampling struct { + temperature float32 + topK int + topP float32 + repPen float32 + maxNew int + seed int64 +} + +func splitOption(o string) (key, value string, ok bool) { + i := strings.Index(o, ":") + if i < 0 { + return "", "", false + } + return strings.TrimSpace(o[:i]), strings.TrimSpace(o[i+1:]), true +} + +func parseBool(v string) bool { return v == "true" || v == "1" } + +// parseOptions reads the backend "key:value" option slice. Unknown keys are +// ignored. Defaults: use_fa true (qt default; CPU still uses the F32 chain), +// seed -1 (engine random). +func parseOptions(opts []string) loadOptions { + o := loadOptions{useFA: true, seed: -1} + for _, oo := range opts { + key, value, ok := splitOption(oo) + if !ok { + continue + } + switch key { + case "tokenizer", "codec": + o.codecPath = value + case "use_fa": + o.useFA = parseBool(value) + case "clamp_fp16": + o.clampFP16 = parseBool(value) + case "seed": + if n, err := strconv.ParseInt(value, 10, 64); err == nil { + o.seed = n + } + } + } + return o +} + +// languageAliases maps codes / locales / full names to the upstream qwentts +// language names. "auto" (and empty) map to "" so the engine auto-detects. +var languageAliases = map[string]string{ + "en": "english", "english": "english", + "zh": "chinese", "chinese": "chinese", "mandarin": "chinese", + "ja": "japanese", "japanese": "japanese", + "ko": "korean", "korean": "korean", + "de": "german", "german": "german", + "fr": "french", "french": "french", + "es": "spanish", "spanish": "spanish", + "it": "italian", "italian": "italian", + "pt": "portuguese", "portuguese": "portuguese", + "ru": "russian", "russian": "russian", + "auto": "", +} + +// normalizeLanguage lowercases, trims, strips a region/locale suffix +// (en-US -> en), and resolves to the qwentts language name. Empty stays empty +// (engine auto-detects); an unknown value passes through normalized. 
+func normalizeLanguage(lang string) string { + lang = strings.ToLower(strings.TrimSpace(lang)) + if lang == "" { + return "" + } + if i := strings.IndexAny(lang, "-_."); i >= 0 { + lang = lang[:i] + } + if v, ok := languageAliases[lang]; ok { + return v + } + return lang +} + +var refAudioExts = []string{".wav", ".flac", ".mp3", ".ogg", ".m4a"} + +// resolveVoice interprets the request Voice field: a value ending in a known +// audio extension is a clone-reference path; anything else is a named speaker +// (custom_voice). Empty input yields no speaker and no reference. +func resolveVoice(voice string) (speaker, refPath string) { + v := strings.TrimSpace(voice) + if v == "" { + return "", "" + } + lower := strings.ToLower(v) + for _, ext := range refAudioExts { + if strings.HasSuffix(lower, ext) { + return "", v + } + } + return v, "" +} + +func parseFloat32(v string, def float32) float32 { + if v == "" { + return def + } + f, err := strconv.ParseFloat(v, 32) + if err != nil { + return def + } + return float32(f) +} + +func parseInt(v string, def int) int { + if v == "" { + return def + } + n, err := strconv.Atoi(v) + if err != nil { + return def + } + return n +} + +func parseInt64(v string, def int64) int64 { + if v == "" { + return def + } + n, err := strconv.ParseInt(v, 10, 64) + if err != nil { + return def + } + return n +} + +// parseSampling reads per-request sampling params from the TTSRequest params +// map, applying qt defaults (matching qt_tts_default_params). 
+func parseSampling(params map[string]string, defaultSeed int64) sampling { + s := sampling{temperature: 0.9, topK: 50, topP: 1.0, repPen: 1.05, maxNew: 2048, seed: defaultSeed} + if params == nil { + return s + } + s.temperature = parseFloat32(params["temperature"], s.temperature) + s.topK = parseInt(params["top_k"], s.topK) + s.topP = parseFloat32(params["top_p"], s.topP) + s.repPen = parseFloat32(params["repetition_penalty"], s.repPen) + s.maxNew = parseInt(params["max_new_tokens"], s.maxNew) + s.seed = parseInt64(params["seed"], s.seed) + return s +} diff --git a/backend/go/qwen3-tts-cpp/qwen3ttscpp_test.go b/backend/go/qwen3-tts-cpp/qwen3ttscpp_test.go index 8e6b38610..5da2b01a0 100644 --- a/backend/go/qwen3-tts-cpp/qwen3ttscpp_test.go +++ b/backend/go/qwen3-tts-cpp/qwen3ttscpp_test.go @@ -1,173 +1,136 @@ package main import ( - "context" - "os" - "os/exec" - "path/filepath" + "bytes" + "encoding/binary" "testing" - "time" - pb "github.com/mudler/LocalAI/pkg/grpc/proto" - "google.golang.org/grpc" - "google.golang.org/grpc/credentials/insecure" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" ) -const ( - testAddr = "localhost:50051" - startupWait = 5 * time.Second -) - -func skipIfNoModel(t *testing.T) string { - t.Helper() - modelDir := os.Getenv("QWEN3TTS_MODEL_DIR") - if modelDir == "" { - t.Skip("QWEN3TTS_MODEL_DIR not set, skipping test (set to directory with GGUF models)") - } - if _, err := os.Stat(filepath.Join(modelDir, "qwen3-tts-0.6b-f16.gguf")); os.IsNotExist(err) { - t.Skipf("TTS model file not found in %s, skipping", modelDir) - } - if _, err := os.Stat(filepath.Join(modelDir, "qwen3-tts-tokenizer-f16.gguf")); os.IsNotExist(err) { - t.Skipf("Tokenizer model file not found in %s, skipping", modelDir) - } - return modelDir +func TestQwen3TtsCpp(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "qwen3-tts-cpp suite") } -func startServer(t *testing.T) *exec.Cmd { - t.Helper() - binary := os.Getenv("QWEN3TTS_BINARY") - if binary == "" { - binary = "./qwen3-tts-cpp" - } - if _, err := os.Stat(binary); os.IsNotExist(err) { - t.Skipf("Backend binary not found at %s, skipping", binary) - } - cmd := exec.Command(binary, "--addr", testAddr) - cmd.Stdout = os.Stderr - cmd.Stderr = os.Stderr - if err := cmd.Start(); err != nil { - t.Fatalf("Failed to start server: %v", err) - } - time.Sleep(startupWait) - return cmd -} - -func stopServer(cmd *exec.Cmd) { - if cmd != nil && cmd.Process != nil { - cmd.Process.Kill() - cmd.Wait() - } -} - -func dialGRPC(t *testing.T) *grpc.ClientConn { - t.Helper() - conn, err := grpc.Dial(testAddr, - grpc.WithTransportCredentials(insecure.NewCredentials()), - grpc.WithDefaultCallOptions( - grpc.MaxCallRecvMsgSize(50*1024*1024), - grpc.MaxCallSendMsgSize(50*1024*1024), - ), +var _ = Describe("normalizeLanguage", func() { + DescribeTable("maps caller language to qwentts language names", + func(in, want string) { + Expect(normalizeLanguage(in)).To(Equal(want)) + }, + Entry("empty stays empty", "", ""), + Entry("auto maps to empty", "auto", ""), + Entry("english full name", "English", 
"english"), + Entry("english code", "en", "english"), + Entry("locale suffix stripped", "en-US", "english"), + Entry("underscore locale", "zh_CN", "chinese"), + Entry("mandarin alias", "mandarin", "chinese"), + Entry("japanese already full", "japanese", "japanese"), + Entry("unknown passes through normalized", "xx", "xx"), ) - if err != nil { - t.Fatalf("Failed to dial gRPC: %v", err) - } - return conn -} +}) -func TestServerHealth(t *testing.T) { - cmd := startServer(t) - defer stopServer(cmd) - - conn := dialGRPC(t) - defer conn.Close() - - client := pb.NewBackendClient(conn) - resp, err := client.Health(context.Background(), &pb.HealthMessage{}) - if err != nil { - t.Fatalf("Health check failed: %v", err) - } - if string(resp.Message) != "OK" { - t.Fatalf("Expected OK, got %s", string(resp.Message)) - } -} - -func TestLoadModel(t *testing.T) { - modelDir := skipIfNoModel(t) - cmd := startServer(t) - defer stopServer(cmd) - - conn := dialGRPC(t) - defer conn.Close() - - client := pb.NewBackendClient(conn) - - resp, err := client.LoadModel(context.Background(), &pb.ModelOptions{ - ModelFile: modelDir, - Threads: 4, +var _ = Describe("resolveVoice", func() { + It("treats a bare token as a named speaker", func() { + sp, ref := resolveVoice("serena") + Expect(sp).To(Equal("serena")) + Expect(ref).To(BeEmpty()) }) - if err != nil { - t.Fatalf("LoadModel failed: %v", err) - } - if !resp.Success { - t.Fatalf("LoadModel returned failure: %s", resp.Message) - } -} - -func TestTTS(t *testing.T) { - modelDir := skipIfNoModel(t) - - tmpDir, err := os.MkdirTemp("", "qwen3tts-test") - if err != nil { - t.Fatal(err) - } - t.Cleanup(func() { os.RemoveAll(tmpDir) }) - - outputFile := filepath.Join(tmpDir, "output.wav") - - cmd := startServer(t) - defer stopServer(cmd) - - conn := dialGRPC(t) - defer conn.Close() - - client := pb.NewBackendClient(conn) - - // Load models - loadResp, err := client.LoadModel(context.Background(), &pb.ModelOptions{ - ModelFile: modelDir, - Threads: 
4, + It("treats an audio path as a clone reference (case-insensitive ext)", func() { + sp, ref := resolveVoice("/x/ref.WAV") + Expect(sp).To(BeEmpty()) + Expect(ref).To(Equal("/x/ref.WAV")) }) - if err != nil { - t.Fatalf("LoadModel failed: %v", err) - } - if !loadResp.Success { - t.Fatalf("LoadModel returned failure: %s", loadResp.Message) - } - - // Synthesize speech - language := "en" - _, err = client.TTS(context.Background(), &pb.TTSRequest{ - Text: "Hello, this is a test of the Qwen3 text to speech system.", - Dst: outputFile, - Language: &language, + It("recognizes mp3/flac/ogg/m4a", func() { + for _, p := range []string{"a.mp3", "b.flac", "c.ogg", "d.m4a"} { + sp, ref := resolveVoice(p) + Expect(sp).To(BeEmpty()) + Expect(ref).To(Equal(p)) + } }) - if err != nil { - t.Fatalf("TTS failed: %v", err) - } + It("returns empty for empty input", func() { + sp, ref := resolveVoice(" ") + Expect(sp).To(BeEmpty()) + Expect(ref).To(BeEmpty()) + }) +}) - // Verify output file exists and has content - info, err := os.Stat(outputFile) - if os.IsNotExist(err) { - t.Fatal("Output audio file was not created") - } - if err != nil { - t.Fatalf("Failed to stat output file: %v", err) - } +var _ = Describe("parseOptions", func() { + It("extracts codec, use_fa, clamp_fp16, seed", func() { + o := parseOptions([]string{ + "tokenizer:tok.gguf", "use_fa:false", "clamp_fp16:true", + "seed:7", "unknown:ignored", + }) + Expect(o.codecPath).To(Equal("tok.gguf")) + Expect(o.useFA).To(BeFalse()) + Expect(o.clampFP16).To(BeTrue()) + Expect(o.seed).To(Equal(int64(7))) + }) + It("accepts codec: as an alias for tokenizer:", func() { + Expect(parseOptions([]string{"codec:c.gguf"}).codecPath).To(Equal("c.gguf")) + }) + It("defaults use_fa true and seed -1", func() { + o := parseOptions(nil) + Expect(o.useFA).To(BeTrue()) + Expect(o.seed).To(Equal(int64(-1))) + }) +}) - t.Logf("Output file size: %d bytes", info.Size()) +var _ = Describe("parseSampling", func() { + It("applies qt defaults when 
params are absent", func() { + s := parseSampling(nil, -1) + Expect(s.temperature).To(BeNumerically("~", 0.9, 1e-6)) + Expect(s.topK).To(Equal(50)) + Expect(s.topP).To(BeNumerically("~", 1.0, 1e-6)) + Expect(s.repPen).To(BeNumerically("~", 1.05, 1e-6)) + Expect(s.maxNew).To(Equal(2048)) + Expect(s.seed).To(Equal(int64(-1))) + }) + It("reads overrides and falls back to default seed", func() { + s := parseSampling(map[string]string{ + "temperature": "0.5", "top_k": "10", "top_p": "0.8", + "repetition_penalty": "1.2", "max_new_tokens": "512", + }, 99) + Expect(s.temperature).To(BeNumerically("~", 0.5, 1e-6)) + Expect(s.topK).To(Equal(10)) + Expect(s.topP).To(BeNumerically("~", 0.8, 1e-6)) + Expect(s.repPen).To(BeNumerically("~", 1.2, 1e-6)) + Expect(s.maxNew).To(Equal(512)) + Expect(s.seed).To(Equal(int64(99))) + }) + It("reads an explicit seed override", func() { + Expect(parseSampling(map[string]string{"seed": "123"}, -1).seed).To(Equal(int64(123))) + }) +}) - // WAV header is 44 bytes minimum; any real audio should be much larger - if info.Size() < 1000 { - t.Errorf("Output file too small (%d bytes), expected real audio data", info.Size()) - } -} +var _ = Describe("wavHeader24k", func() { + It("emits a 44-byte streaming WAV header at 24 kHz mono 16-bit", func() { + h := wavHeader24k() + Expect(h).To(HaveLen(44)) + Expect(string(h[0:4])).To(Equal("RIFF")) + Expect(string(h[8:12])).To(Equal("WAVE")) + Expect(string(h[12:16])).To(Equal("fmt ")) + Expect(string(h[36:40])).To(Equal("data")) + var sampleRate uint32 + Expect(binary.Read(bytes.NewReader(h[24:28]), binary.LittleEndian, &sampleRate)).To(Succeed()) + Expect(sampleRate).To(Equal(uint32(24000))) + }) +}) + +var _ = Describe("floatToPCM16LE", func() { + It("clamps and converts float PCM to little-endian int16 bytes", func() { + b := floatToPCM16LE([]float32{0, 1.0, -1.0, 2.0, -2.0}) + Expect(b).To(HaveLen(10)) + read := func(off int) int16 { + var v int16 + _ = binary.Read(bytes.NewReader(b[off:off+2]), 
binary.LittleEndian, &v) + return v + } + Expect(read(0)).To(Equal(int16(0))) + Expect(read(2)).To(Equal(int16(32767))) + Expect(read(4)).To(Equal(int16(-32767))) + Expect(read(6)).To(Equal(int16(32767))) // clamped from 2.0 + Expect(read(8)).To(Equal(int16(-32767))) // clamped from -2.0 + }) +}) diff --git a/backend/go/qwen3-tts-cpp/test.sh b/backend/go/qwen3-tts-cpp/test.sh index aaebfc42a..dedad82c8 100755 --- a/backend/go/qwen3-tts-cpp/test.sh +++ b/backend/go/qwen3-tts-cpp/test.sh @@ -2,51 +2,30 @@ set -e CURDIR=$(dirname "$(realpath $0)") +cd "$CURDIR" echo "Running qwen3-tts-cpp backend tests..." -# The test requires: -# - QWEN3TTS_MODEL_DIR: path to directory containing GGUF model files -# - QWEN3TTS_BINARY: path to the qwen3-tts-cpp binary (defaults to ./qwen3-tts-cpp) -# -# Tests that require the model will be skipped if QWEN3TTS_MODEL_DIR is not set -# or the directory does not contain the required model files. - -cd "$CURDIR" - -# Only auto-download models when QWEN3TTS_MODEL_DIR is not explicitly set -if [ -z "$QWEN3TTS_MODEL_DIR" ]; then - export QWEN3TTS_MODEL_DIR="./qwen3-tts-models" - - if [ ! -d "$QWEN3TTS_MODEL_DIR" ]; then - echo "Creating qwen3-tts-models directory for tests..." - mkdir -p "$QWEN3TTS_MODEL_DIR" - REPO_ID="endo5501/qwen3-tts.cpp" - echo "Repository: ${REPO_ID}" - echo "" - - # Files to download (smallest model for testing) - FILES=( - "qwen3-tts-0.6b-f16.gguf" - "qwen3-tts-tokenizer-f16.gguf" - ) - - BASE_URL="https://huggingface.co/${REPO_ID}/resolve/main" - - for file in "${FILES[@]}"; do - dest="${QWEN3TTS_MODEL_DIR}/${file}" - if [ -f "${dest}" ]; then - echo " [skip] ${file} (already exists)" - else - echo " [download] ${file}..." - curl -L -o "${dest}" "${BASE_URL}/${file}" --progress-bar - echo " [done] ${file}" - fi - done - fi +# Auto-download a small model pair only when QWEN3TTS_MODEL is not set. 
+if [ -z "$QWEN3TTS_MODEL" ]; then + MODEL_DIR="./qwen3-tts-models" + mkdir -p "$MODEL_DIR" + REPO_ID="Serveurperso/Qwen3-TTS-GGUF" + BASE_URL="https://huggingface.co/${REPO_ID}/resolve/main" + FILES=( "qwen-talker-0.6b-base-Q4_K_M.gguf" "qwen-tokenizer-12hz-Q4_K_M.gguf" ) + for file in "${FILES[@]}"; do + dest="${MODEL_DIR}/${file}" + if [ -f "${dest}" ]; then + echo " [skip] ${file}" + else + echo " [download] ${file}..." + curl -L -o "${dest}" "${BASE_URL}/${file}" --progress-bar + fi + done + export QWEN3TTS_MODEL="${MODEL_DIR}/qwen-talker-0.6b-base-Q4_K_M.gguf" + export QWEN3TTS_CODEC="${MODEL_DIR}/qwen-tokenizer-12hz-Q4_K_M.gguf" fi -# Run Go tests -go test -v -timeout 600s . +go test -v -timeout 1200s . echo "All qwen3-tts-cpp tests passed." diff --git a/backend/index.yaml b/backend/index.yaml index 43716a7d1..19483ab03 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -768,14 +768,17 @@ - &qwen3ttscpp name: "qwen3-tts-cpp" description: | - Qwen3-TTS C++ backend using GGML. Native C++ text-to-speech with voice cloning support. - Generates 24kHz mono audio from text with optional reference audio for voice cloning via ECAPA-TDNN speaker embeddings. + Qwen3-TTS C++ backend using GGML (qwentts.cpp). Native C++ text-to-speech + with streaming output, named speakers, voice design, and zero-shot voice + cloning. 24kHz mono, 11 languages with Mandarin dialects. 0.6B and 1.7B + models in Q8_0 / Q4_K_M. 
urls: - - https://github.com/predict-woo/qwen3-tts.cpp + - https://github.com/ServeurpersoCom/qwentts.cpp tags: - text-to-speech - tts - voice-cloning + - streaming alias: "qwen3-tts-cpp" capabilities: default: "cpu-qwen3-tts-cpp" diff --git a/core/config/backend_capabilities.go b/core/config/backend_capabilities.go index d714cf58d..234873ffa 100644 --- a/core/config/backend_capabilities.go +++ b/core/config/backend_capabilities.go @@ -396,10 +396,10 @@ var BackendCapabilities = map[string]BackendCapability{ Description: "Qwen TTS", }, "qwen3-tts-cpp": { - GRPCMethods: []GRPCMethod{MethodTTS}, + GRPCMethods: []GRPCMethod{MethodTTS, MethodTTSStream}, PossibleUsecases: []string{UsecaseTTS}, DefaultUsecases: []string{UsecaseTTS}, - Description: "Qwen3 TTS C++ — text-to-speech, C++ backend", + Description: "Qwen3 TTS C++ - text-to-speech with streaming, named speakers, voice design and cloning (qwentts.cpp / GGML)", }, "faster-qwen3-tts": { GRPCMethods: []GRPCMethod{MethodTTS}, diff --git a/gallery/index.yaml b/gallery/index.yaml index 1a3bf8db2..7f47ee1b2 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -3304,38 +3304,267 @@ - filename: vibevoice-cpp-asr/tokenizer.gguf sha256: 37dc3b722d5677e37e29a57df55aa05c485116eeb5459e57ff8dde616b4986f6 uri: huggingface://mudler/vibevoice.cpp-models/tokenizer.gguf -- name: qwen3-tts-cpp +- &qwenttscpp_gallery + name: qwen3-tts-cpp url: github:mudler/LocalAI/gallery/virtual.yaml@master urls: - - https://huggingface.co/endo5501/qwen3-tts.cpp - - https://github.com/predict-woo/qwen3-tts.cpp + - https://huggingface.co/Serveurperso/Qwen3-TTS-GGUF + - https://github.com/ServeurpersoCom/qwentts.cpp description: | - Qwen3-TTS 0.6B (C++ / GGML) — native C++ text-to-speech from text input. - Generates 24kHz mono audio. Supports 10 languages (en, zh, ja, ko, de, fr, es, it, pt, ru). - Uses F16 GGUF models (~2 GB total). - license: apache-2.0 + Qwen3-TTS 0.6B Base (C++ / GGML, qwentts.cpp). 
Native C++ text-to-speech with + streaming output and zero-shot voice cloning (set `voice` to a 24kHz reference + .wav). 24kHz mono, 11 languages with Mandarin dialects. Q8_0 (~0.95 GB talker). + license: mit icon: https://huggingface.co/avatars/c299494fd1e72375832499c75b3425d6.svg tags: - tts - text-to-speech + - voice-cloning + - streaming - qwen3-tts - qwen3-tts-cpp - gguf - last_checked: "2026-04-30" + last_checked: "2026-06-13" overrides: backend: qwen3-tts-cpp known_usecases: - tts name: qwen3-tts-cpp parameters: - model: qwen3-tts-cpp + model: qwen3-tts-cpp/qwen-talker-0.6b-base-Q8_0.gguf files: - - filename: qwen3-tts-cpp/qwen3-tts-0.6b-f16.gguf - sha256: 0b89770118463af8f2467d824a8de57d96df6a09f927a9769a3f7b7fffa7087d - uri: huggingface://endo5501/qwen3-tts.cpp/qwen3-tts-0.6b-f16.gguf - - filename: qwen3-tts-cpp/qwen3-tts-tokenizer-f16.gguf - sha256: d1ad9660bd99343f4851d5a4b17e31f65648feb3559f6ea062ae6575e5cd9d90 - uri: huggingface://endo5501/qwen3-tts.cpp/qwen3-tts-tokenizer-f16.gguf + - filename: qwen3-tts-cpp/qwen-talker-0.6b-base-Q8_0.gguf + sha256: d54dbaf10591421fa764ed630d764efa717ae40cd959bd48c66d4eb1af226426 + uri: huggingface://Serveurperso/Qwen3-TTS-GGUF/qwen-talker-0.6b-base-Q8_0.gguf + - filename: qwen3-tts-cpp/qwen-tokenizer-12hz-Q8_0.gguf + sha256: 1883beeed99348fc35e23dd225e9082f93f6f8c109330a33d935baa8acdbfd94 + uri: huggingface://Serveurperso/Qwen3-TTS-GGUF/qwen-tokenizer-12hz-Q8_0.gguf +- !!merge <<: *qwenttscpp_gallery + name: qwen3-tts-cpp-0.6b-base-q4 + description: | + Qwen3-TTS 0.6B Base (C++ / GGML, qwentts.cpp), Q4_K_M (~0.6 GB talker). + Streaming + voice cloning, 24kHz mono, 11 languages. 
+ overrides: + backend: qwen3-tts-cpp + known_usecases: + - tts + name: qwen3-tts-cpp-0.6b-base-q4 + parameters: + model: qwen3-tts-cpp-0.6b-base-q4/qwen-talker-0.6b-base-Q4_K_M.gguf + files: + - filename: qwen3-tts-cpp-0.6b-base-q4/qwen-talker-0.6b-base-Q4_K_M.gguf + sha256: 4b468ec7b1f62b90ef4ca316c0aa57deadfd54b2cf9651703ea753cedaf04226 + uri: huggingface://Serveurperso/Qwen3-TTS-GGUF/qwen-talker-0.6b-base-Q4_K_M.gguf + - filename: qwen3-tts-cpp-0.6b-base-q4/qwen-tokenizer-12hz-Q4_K_M.gguf + sha256: cf3788b4d50aaa665fb6e57c170396aae03a3555fea52d2b5d0cda902d658039 + uri: huggingface://Serveurperso/Qwen3-TTS-GGUF/qwen-tokenizer-12hz-Q4_K_M.gguf +- !!merge <<: *qwenttscpp_gallery + name: qwen3-tts-cpp-1.7b-base + description: | + Qwen3-TTS 1.7B Base (C++ / GGML, qwentts.cpp), Q8_0 (~2.0 GB talker). + Higher-quality streaming + voice cloning, 24kHz mono, 11 languages. + overrides: + backend: qwen3-tts-cpp + known_usecases: + - tts + name: qwen3-tts-cpp-1.7b-base + parameters: + model: qwen3-tts-cpp-1.7b-base/qwen-talker-1.7b-base-Q8_0.gguf + files: + - filename: qwen3-tts-cpp-1.7b-base/qwen-talker-1.7b-base-Q8_0.gguf + sha256: 4b9a33a236908dd9435a42f7a396e38038329d053b704342a6413c08544c4fda + uri: huggingface://Serveurperso/Qwen3-TTS-GGUF/qwen-talker-1.7b-base-Q8_0.gguf + - filename: qwen3-tts-cpp-1.7b-base/qwen-tokenizer-12hz-Q8_0.gguf + sha256: 1883beeed99348fc35e23dd225e9082f93f6f8c109330a33d935baa8acdbfd94 + uri: huggingface://Serveurperso/Qwen3-TTS-GGUF/qwen-tokenizer-12hz-Q8_0.gguf +- !!merge <<: *qwenttscpp_gallery + name: qwen3-tts-cpp-1.7b-base-q4 + description: | + Qwen3-TTS 1.7B Base (C++ / GGML, qwentts.cpp), Q4_K_M (~1.2 GB talker). + Streaming + voice cloning, 24kHz mono, 11 languages. 
+ overrides: + backend: qwen3-tts-cpp + known_usecases: + - tts + name: qwen3-tts-cpp-1.7b-base-q4 + parameters: + model: qwen3-tts-cpp-1.7b-base-q4/qwen-talker-1.7b-base-Q4_K_M.gguf + files: + - filename: qwen3-tts-cpp-1.7b-base-q4/qwen-talker-1.7b-base-Q4_K_M.gguf + sha256: ea393ebaf2167ea23ce9fc18b093822851358a950d7075cd47ab4f6ce23e887d + uri: huggingface://Serveurperso/Qwen3-TTS-GGUF/qwen-talker-1.7b-base-Q4_K_M.gguf + - filename: qwen3-tts-cpp-1.7b-base-q4/qwen-tokenizer-12hz-Q4_K_M.gguf + sha256: cf3788b4d50aaa665fb6e57c170396aae03a3555fea52d2b5d0cda902d658039 + uri: huggingface://Serveurperso/Qwen3-TTS-GGUF/qwen-tokenizer-12hz-Q4_K_M.gguf +- !!merge <<: *qwenttscpp_gallery + name: qwen3-tts-cpp-customvoice + description: | + Qwen3-TTS 0.6B CustomVoice (C++ / GGML, qwentts.cpp), Q8_0. Named speakers + selected via the `voice` field: serena, vivian, uncle_fu, ryan, aiden, + ono_anna, sohee, eric (sichuan dialect), dylan (beijing dialect). Streaming, + 24kHz mono, 11 languages. + tags: + - tts + - text-to-speech + - named-speakers + - streaming + - qwen3-tts + - qwen3-tts-cpp + - gguf + overrides: + backend: qwen3-tts-cpp + known_usecases: + - tts + name: qwen3-tts-cpp-customvoice + parameters: + model: qwen3-tts-cpp-customvoice/qwen-talker-0.6b-customvoice-Q8_0.gguf + files: + - filename: qwen3-tts-cpp-customvoice/qwen-talker-0.6b-customvoice-Q8_0.gguf + sha256: 4eb38675c736ed6ac72012846ac8d6ef80e5af8bc05726870f0b3a6569588519 + uri: huggingface://Serveurperso/Qwen3-TTS-GGUF/qwen-talker-0.6b-customvoice-Q8_0.gguf + - filename: qwen3-tts-cpp-customvoice/qwen-tokenizer-12hz-Q8_0.gguf + sha256: 1883beeed99348fc35e23dd225e9082f93f6f8c109330a33d935baa8acdbfd94 + uri: huggingface://Serveurperso/Qwen3-TTS-GGUF/qwen-tokenizer-12hz-Q8_0.gguf +- !!merge <<: *qwenttscpp_gallery + name: qwen3-tts-cpp-customvoice-q4 + description: | + Qwen3-TTS 0.6B CustomVoice (C++ / GGML, qwentts.cpp), Q4_K_M. 
Named speakers + via the `voice` field (serena, vivian, ryan, aiden, eric, dylan, ...). + Streaming, 24kHz mono, 11 languages. + tags: + - tts + - text-to-speech + - named-speakers + - streaming + - qwen3-tts + - qwen3-tts-cpp + - gguf + overrides: + backend: qwen3-tts-cpp + known_usecases: + - tts + name: qwen3-tts-cpp-customvoice-q4 + parameters: + model: qwen3-tts-cpp-customvoice-q4/qwen-talker-0.6b-customvoice-Q4_K_M.gguf + files: + - filename: qwen3-tts-cpp-customvoice-q4/qwen-talker-0.6b-customvoice-Q4_K_M.gguf + sha256: b3a7e6613d80f8a703c06267fc1e94d48ce91932ab82ab6e31c50f4ca4868e1e + uri: huggingface://Serveurperso/Qwen3-TTS-GGUF/qwen-talker-0.6b-customvoice-Q4_K_M.gguf + - filename: qwen3-tts-cpp-customvoice-q4/qwen-tokenizer-12hz-Q4_K_M.gguf + sha256: cf3788b4d50aaa665fb6e57c170396aae03a3555fea52d2b5d0cda902d658039 + uri: huggingface://Serveurperso/Qwen3-TTS-GGUF/qwen-tokenizer-12hz-Q4_K_M.gguf +- !!merge <<: *qwenttscpp_gallery + name: qwen3-tts-cpp-1.7b-customvoice + description: | + Qwen3-TTS 1.7B CustomVoice (C++ / GGML, qwentts.cpp), Q8_0. Named speakers via + the `voice` field (serena, vivian, ryan, aiden, eric, dylan, ...). Streaming, + 24kHz mono, 11 languages. 
+ tags: + - tts + - text-to-speech + - named-speakers + - streaming + - qwen3-tts + - qwen3-tts-cpp + - gguf + overrides: + backend: qwen3-tts-cpp + known_usecases: + - tts + name: qwen3-tts-cpp-1.7b-customvoice + parameters: + model: qwen3-tts-cpp-1.7b-customvoice/qwen-talker-1.7b-customvoice-Q8_0.gguf + files: + - filename: qwen3-tts-cpp-1.7b-customvoice/qwen-talker-1.7b-customvoice-Q8_0.gguf + sha256: cab2cff67a0a557310febe558dc83076b28ed790e491867eb2751759f4cd89fa + uri: huggingface://Serveurperso/Qwen3-TTS-GGUF/qwen-talker-1.7b-customvoice-Q8_0.gguf + - filename: qwen3-tts-cpp-1.7b-customvoice/qwen-tokenizer-12hz-Q8_0.gguf + sha256: 1883beeed99348fc35e23dd225e9082f93f6f8c109330a33d935baa8acdbfd94 + uri: huggingface://Serveurperso/Qwen3-TTS-GGUF/qwen-tokenizer-12hz-Q8_0.gguf +- !!merge <<: *qwenttscpp_gallery + name: qwen3-tts-cpp-1.7b-customvoice-q4 + description: | + Qwen3-TTS 1.7B CustomVoice (C++ / GGML, qwentts.cpp), Q4_K_M. Named speakers + via the `voice` field. Streaming, 24kHz mono, 11 languages. + tags: + - tts + - text-to-speech + - named-speakers + - streaming + - qwen3-tts + - qwen3-tts-cpp + - gguf + overrides: + backend: qwen3-tts-cpp + known_usecases: + - tts + name: qwen3-tts-cpp-1.7b-customvoice-q4 + parameters: + model: qwen3-tts-cpp-1.7b-customvoice-q4/qwen-talker-1.7b-customvoice-Q4_K_M.gguf + files: + - filename: qwen3-tts-cpp-1.7b-customvoice-q4/qwen-talker-1.7b-customvoice-Q4_K_M.gguf + sha256: cc328834a631bc08bf9f43e62fa23f8a1383d9b429864ce6690cfb172077fc4a + uri: huggingface://Serveurperso/Qwen3-TTS-GGUF/qwen-talker-1.7b-customvoice-Q4_K_M.gguf + - filename: qwen3-tts-cpp-1.7b-customvoice-q4/qwen-tokenizer-12hz-Q4_K_M.gguf + sha256: cf3788b4d50aaa665fb6e57c170396aae03a3555fea52d2b5d0cda902d658039 + uri: huggingface://Serveurperso/Qwen3-TTS-GGUF/qwen-tokenizer-12hz-Q4_K_M.gguf +- !!merge <<: *qwenttscpp_gallery + name: qwen3-tts-cpp-1.7b-voicedesign + description: | + Qwen3-TTS 1.7B VoiceDesign (C++ / GGML, qwentts.cpp), Q8_0. 
Synthesises a + speaker from a free-text attribute instruction - REQUIRES the OpenAI + `instructions` field (e.g. "male, young adult, moderate pitch"); requests + without it are rejected. Streaming, 24kHz mono, 11 languages. + tags: + - tts + - text-to-speech + - voice-design + - streaming + - qwen3-tts + - qwen3-tts-cpp + - gguf + overrides: + backend: qwen3-tts-cpp + known_usecases: + - tts + name: qwen3-tts-cpp-1.7b-voicedesign + parameters: + model: qwen3-tts-cpp-1.7b-voicedesign/qwen-talker-1.7b-voicedesign-Q8_0.gguf + files: + - filename: qwen3-tts-cpp-1.7b-voicedesign/qwen-talker-1.7b-voicedesign-Q8_0.gguf + sha256: 575610ab1ddcca4dca6bd9a64bcd859d93bbad8764f9cab24e1dbc0c51f62276 + uri: huggingface://Serveurperso/Qwen3-TTS-GGUF/qwen-talker-1.7b-voicedesign-Q8_0.gguf + - filename: qwen3-tts-cpp-1.7b-voicedesign/qwen-tokenizer-12hz-Q8_0.gguf + sha256: 1883beeed99348fc35e23dd225e9082f93f6f8c109330a33d935baa8acdbfd94 + uri: huggingface://Serveurperso/Qwen3-TTS-GGUF/qwen-tokenizer-12hz-Q8_0.gguf +- !!merge <<: *qwenttscpp_gallery + name: qwen3-tts-cpp-1.7b-voicedesign-q4 + description: | + Qwen3-TTS 1.7B VoiceDesign (C++ / GGML, qwentts.cpp), Q4_K_M. Synthesises a + speaker from a free-text attribute instruction - REQUIRES the `instructions` + field. Streaming, 24kHz mono, 11 languages. 
+ tags: + - tts + - text-to-speech + - voice-design + - streaming + - qwen3-tts + - qwen3-tts-cpp + - gguf + overrides: + backend: qwen3-tts-cpp + known_usecases: + - tts + name: qwen3-tts-cpp-1.7b-voicedesign-q4 + parameters: + model: qwen3-tts-cpp-1.7b-voicedesign-q4/qwen-talker-1.7b-voicedesign-Q4_K_M.gguf + files: + - filename: qwen3-tts-cpp-1.7b-voicedesign-q4/qwen-talker-1.7b-voicedesign-Q4_K_M.gguf + sha256: 7605ed0cc5e72059f27468c27f70c070e05d1cc0c7b1c76bfb9cba717a59eee3 + uri: huggingface://Serveurperso/Qwen3-TTS-GGUF/qwen-talker-1.7b-voicedesign-Q4_K_M.gguf + - filename: qwen3-tts-cpp-1.7b-voicedesign-q4/qwen-tokenizer-12hz-Q4_K_M.gguf + sha256: cf3788b4d50aaa665fb6e57c170396aae03a3555fea52d2b5d0cda902d658039 + uri: huggingface://Serveurperso/Qwen3-TTS-GGUF/qwen-tokenizer-12hz-Q4_K_M.gguf - name: omnivoice-cpp url: github:mudler/LocalAI/gallery/virtual.yaml@master urls: @@ -3402,39 +3631,6 @@ - filename: omnivoice-cpp-hq/omnivoice-tokenizer-BF16.gguf sha256: c2179e4cf528b19fea22a5be94c34c083877bb5fc28ac0245d2b4299a262dcec uri: huggingface://Serveurperso/OmniVoice-GGUF/omnivoice-tokenizer-BF16.gguf -- name: qwen3-tts-cpp-customvoice - url: github:mudler/LocalAI/gallery/virtual.yaml@master - urls: - - https://huggingface.co/endo5501/qwen3-tts.cpp - - https://github.com/predict-woo/qwen3-tts.cpp - description: | - Qwen3-TTS 0.6B Custom Voice (C++ / GGML) — text-to-speech with voice cloning support. - Generates 24kHz mono audio with optional reference audio for voice cloning via ECAPA-TDNN speaker embeddings. - Supports 10 languages (en, zh, ja, ko, de, fr, es, it, pt, ru). 
- license: apache-2.0 - icon: https://huggingface.co/avatars/c299494fd1e72375832499c75b3425d6.svg - tags: - - tts - - text-to-speech - - voice-cloning - - qwen3-tts - - qwen3-tts-cpp - - gguf - last_checked: "2026-04-30" - overrides: - backend: qwen3-tts-cpp - known_usecases: - - tts - name: qwen3-tts-cpp-customvoice - parameters: - model: qwen3-tts-cpp-customvoice - files: - - filename: qwen3-tts-cpp-customvoice/qwen3-tts-0.6b-customvoice-f16.gguf - sha256: 40b985b71be0970d41eb042488766db556cf17290aa1cff631cabfa0bd3b0431 - uri: huggingface://endo5501/qwen3-tts.cpp/qwen3-tts-0.6b-customvoice-f16.gguf - - filename: qwen3-tts-cpp-customvoice/qwen3-tts-tokenizer-f16.gguf - sha256: d1ad9660bd99343f4851d5a4b17e31f65648feb3559f6ea062ae6575e5cd9d90 - uri: huggingface://endo5501/qwen3-tts.cpp/qwen3-tts-tokenizer-f16.gguf - name: qwen3-coder-next-mxfp4_moe url: github:mudler/LocalAI/gallery/virtual.yaml@master urls: