From a8d7d37a3c609302429a93daa2a44a55cd0aa2b6 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Wed, 6 May 2026 10:36:59 +0200 Subject: [PATCH] fix: unbreak master CI (docs, kokoros, vibevoice-cpp ABI) (#9682) * fix(docs): correct broken Hugo relrefs The Hugo build has been failing on master since the relevant pages landed: - text-generation.md:720 referenced `/docs/features/distributed-mode`, but Hugo `relref` paths are relative to the content root, not the rendered URL. Drop the `/docs/` prefix so the lookup matches the existing `features/...` form used elsewhere in the file. - audio-transform.md:144 referenced `tts.md`; the actual page is `text-to-audio.md`. Assisted-by: Claude:claude-opus-4-7[1m] Signed-off-by: Ettore Di Giacinto * fix(kokoros): stub Diarize and AudioTransform Backend trait methods The recent backend.proto additions (Diarize, AudioTransform, AudioTransformStream) extended the gRPC Backend trait, breaking kokoros-grpc compilation with E0046 because the Rust implementation hadn't picked up the new methods. Add Unimplemented stubs matching the existing pattern for non-applicable RPCs in this TTS-only backend. Assisted-by: Claude:claude-opus-4-7[1m] Signed-off-by: Ettore Di Giacinto * fix(vibevoice-cpp): track upstream ABI + wire 1.5B voice cloning Two recent commits in mudler/vibevoice.cpp reshaped the vv_capi_tts signature without a corresponding bump on the LocalAI side: 3bd759c "1.5b: unify into a single tts entry point" inserted a ref_audio_path parameter between voice_path and dst_wav_path. ad856bd "1.5b: multi-speaker dialog support" promoted that to a (const char* const* ref_audio_paths, int n_ref_audio_paths) pair for per-speaker conditioning. Because purego resolves symbols by name and not by signature, the build kept linking; at runtime the misaligned arguments turned the TTS->ASR closed-loop test into a SIGSEGV inside cgo. 
Track HEAD explicitly and bring the bridge in line with it: * Update the CppTTS purego binding to the 9-arg form. purego marshals []*byte as a **char by handing the C side the underlying array address; nil/empty maps to NULL, which matches the C contract for "no reference audio" on the realtime-0.5B path. * Add a `ref_audio` gallery option (comma-separated, repeatable) that the 1.5B path consumes for runtime voice cloning. Multiple entries are interpreted as one WAV per speaker (Speaker 0..n-1). * TTSRequest.Voice now routes by extension/shape: `.wav` or a comma-separated list goes to ref_audio_paths; anything else stays on voice_path (realtime-0.5B's pre-baked voice gguf). * Pin VIBEVOICE_CPP_VERSION to ad856bd and wire the Makefile into the existing bump_deps matrix so future upstream rolls land as reviewable PRs instead of a silent CI break. Assisted-by: Claude:claude-opus-4-7[1m] Signed-off-by: Ettore Di Giacinto * refactor(vibevoice-cpp): use ModelOptions.AudioPath for 1.5B ref audio Use the existing audio_path field from ModelOptions (already plumbed through config_file's `audio_path:` YAML and consumed by other audio backends like kokoros) instead of inventing a custom `ref_audio:` Options[] string. Multi-speaker setups stay on a single comma- separated value. No behavior change beyond the gallery key name; per-call routing via TTSRequest.Voice is unchanged. 
Assisted-by: Claude:claude-opus-4-7[1m] Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Ettore Di Giacinto Co-authored-by: Ettore Di Giacinto --- .github/workflows/bump_deps.yaml | 4 + backend/go/vibevoice-cpp/Makefile | 7 +- backend/go/vibevoice-cpp/govibevoicecpp.go | 124 +++++++++++++++++++-- backend/rust/kokoros/src/service.rs | 24 ++++ docs/content/features/audio-transform.md | 2 +- docs/content/features/text-generation.md | 2 +- 6 files changed, 150 insertions(+), 13 deletions(-) diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml index 676af410b..3be3a22dd 100644 --- a/.github/workflows/bump_deps.yaml +++ b/.github/workflows/bump_deps.yaml @@ -50,6 +50,10 @@ jobs: variable: "QWEN3TTS_CPP_VERSION" branch: "main" file: "backend/go/qwen3-tts-cpp/Makefile" + - repository: "mudler/vibevoice.cpp" + variable: "VIBEVOICE_CPP_VERSION" + branch: "master" + file: "backend/go/vibevoice-cpp/Makefile" runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 diff --git a/backend/go/vibevoice-cpp/Makefile b/backend/go/vibevoice-cpp/Makefile index 67eeebbca..6b061a2ea 100644 --- a/backend/go/vibevoice-cpp/Makefile +++ b/backend/go/vibevoice-cpp/Makefile @@ -6,9 +6,12 @@ GOCMD?=go GO_TAGS?= JOBS?=$(shell nproc --ignore=1) -# vibevoice.cpp version +# vibevoice.cpp version. Pinned to a commit hash and auto-bumped by +# .github/workflows/bump_deps.yaml (the matrix entry mirrors what we +# already do for ik_llama.cpp / llama.cpp / whisper.cpp). Floating on +# `master` led to silent ABI breaks reaching CI — pin it. 
VIBEVOICE_REPO?=https://github.com/mudler/vibevoice.cpp -VIBEVOICE_CPP_VERSION?=master +VIBEVOICE_CPP_VERSION?=ad856bda6b1311b7f3d7c4a667be43eeb8a8249a SO_TARGET?=libgovibevoicecpp.so CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF diff --git a/backend/go/vibevoice-cpp/govibevoicecpp.go b/backend/go/vibevoice-cpp/govibevoicecpp.go index 7067c162d..242f00c31 100644 --- a/backend/go/vibevoice-cpp/govibevoicecpp.go +++ b/backend/go/vibevoice-cpp/govibevoicecpp.go @@ -7,6 +7,7 @@ import ( "os" "os/exec" "path/filepath" + "runtime" "strings" laudio "github.com/mudler/LocalAI/pkg/audio" @@ -98,9 +99,18 @@ const asrMaxNewTokens = 16384 const vibevoiceSampleRate = uint32(24000) // purego-bound entry points from libgovibevoicecpp. +// +// vv_capi_tts takes a `const char* const* ref_audio_paths` array (used +// by the 1.5B variant for runtime voice cloning; the realtime-0.5B +// path leaves it NULL and uses voice_path instead). purego marshals a +// Go []*byte slice as **char by passing the underlying array's address. +// A nil/empty slice marshals to NULL, which matches the C contract for +// "no reference audio". var ( CppLoad func(ttsModel, asrModel, tokenizer, voice string, threads int32) int32 - CppTTS func(text, voicePath, dstWav string, + CppTTS func(text, voicePath string, + refAudioPaths []*byte, nRefAudioPaths int32, + dstWav string, nSteps int32, cfgScale float32, maxSpeechFrames int32, seed uint32) int32 CppASR func(srcWav string, outJSON []byte, capacity uint64, maxNewTokens int32) int32 @@ -124,6 +134,14 @@ type VibevoiceCpp struct { asrModel string tokenizer string voice string + + // refAudio is the load-time default list of reference WAVs used by + // the 1.5B model (one per speaker). Sourced from + // ModelOptions.AudioPath (config_file's `audio_path:`) — comma- + // separated for multi-speaker. Per-call TTSRequest.Voice can + // override it. Empty for the realtime-0.5B path, which conditions + // on a pre-baked voice gguf via `voice` instead. 
+ refAudio []string } // resolvePath joins a relative path onto `relTo`. The gallery @@ -169,6 +187,25 @@ func (v *VibevoiceCpp) parseOptions(opts []string, relTo string) string { return role } +// parseRefAudio splits a comma-separated audio_path value into a +// resolved list of WAVs. The 1.5B model uses one WAV per speaker; +// callers that only need a single reference set audio_path to a single +// path. Empty / whitespace-only entries are skipped. +func parseRefAudio(audioPath, relTo string) []string { + if audioPath == "" { + return nil + } + var out []string + for _, p := range strings.Split(audioPath, ",") { + p = strings.TrimSpace(p) + if p == "" { + continue + } + out = append(out, resolvePath(p, relTo)) + } + return out +} + func (v *VibevoiceCpp) Load(opts *pb.ModelOptions) error { if opts.ModelFile == "" { return fmt.Errorf("vibevoice-cpp: ModelFile is required") @@ -189,6 +226,12 @@ func (v *VibevoiceCpp) Load(opts *pb.ModelOptions) error { } role := v.parseOptions(opts.Options, v.modelRoot) + // 1.5B reference WAVs ride on ModelOptions.AudioPath (config_file's + // `audio_path:` key) — same convention other audio backends already + // follow. Single-speaker = single path; multi-speaker = comma list, + // one WAV per Speaker N: tag in TTSRequest.text. + v.refAudio = parseRefAudio(opts.AudioPath, v.modelRoot) + // ModelFile fills the "primary" role-slot determined by `type=` // in Options (defaults to tts). 
The other slot stays exactly as // Options set it - so a closed-loop config with ModelFile=tts.gguf @@ -222,8 +265,8 @@ func (v *VibevoiceCpp) Load(opts *pb.ModelOptions) error { v.threads = threads fmt.Fprintf(os.Stderr, - "[vibevoice-cpp] Loading: tts=%q asr=%q tokenizer=%q voice=%q threads=%d\n", - v.ttsModel, v.asrModel, v.tokenizer, v.voice, threads) + "[vibevoice-cpp] Loading: tts=%q asr=%q tokenizer=%q voice=%q ref_audio=%v threads=%d\n", + v.ttsModel, v.asrModel, v.tokenizer, v.voice, v.refAudio, threads) if rc := CppLoad(v.ttsModel, v.asrModel, v.tokenizer, v.voice, int32(threads)); rc != 0 { return fmt.Errorf("vibevoice-cpp: vv_capi_load failed (rc=%d)", rc) @@ -241,10 +284,35 @@ func (v *VibevoiceCpp) TTS(req *pb.TTSRequest) error { return fmt.Errorf("vibevoice-cpp: TTS requires both text and dst") } - // req.Voice may be a bare filename (e.g. "voice-en-Emma.gguf") or an - // absolute path. Resolve via the same modelRoot Load() used for - // Options[] so a swap-voice request mirrors the gallery's layout. - voice := resolvePath(req.Voice, v.modelRoot) + // TTSRequest.Voice carries the per-call override. Routing depends + // on the loaded model variant: + // * realtime-0.5B → expects a baked voice .gguf (single path). + // * 1.5B → expects one or more raw 24 kHz mono .wav + // reference clips for runtime voice cloning; + // comma-separated to address multi-speaker + // dialogs (Speaker 0..n-1 follow the order). + // We pick the branch by extension / shape of the override; if no + // override is given, fall back to the load-time defaults. + voice := "" + var refAudio []string + if reqVoice := strings.TrimSpace(req.Voice); reqVoice != "" { + if isRefAudioOverride(reqVoice) { + for _, p := range strings.Split(reqVoice, ",") { + p = strings.TrimSpace(p) + if p == "" { + continue + } + refAudio = append(refAudio, resolvePath(p, v.modelRoot)) + } + } else { + voice = resolvePath(reqVoice, v.modelRoot) + } + } else { + // No per-call override. 
v.voice already went to vv_capi_load + // for realtime-0.5B; ref_audio is per-call only on the C ABI, + // so the gallery's `audio_path:` defaults are re-passed here. + refAudio = append(refAudio, v.refAudio...) + } if req.Language != nil && *req.Language != "" { fmt.Fprintf(os.Stderr, @@ -257,13 +325,51 @@ func (v *VibevoiceCpp) TTS(req *pb.TTSRequest) error { defaultMaxFrames = 200 ) defaultCfg := float32(1.3) - if rc := CppTTS(text, voice, dst, - int32(defaultSteps), defaultCfg, int32(defaultMaxFrames), 0); rc != 0 { + + refPtrs, refKeep := newCStringArray(refAudio) + rc := CppTTS(text, voice, refPtrs, int32(len(refPtrs)), dst, + int32(defaultSteps), defaultCfg, int32(defaultMaxFrames), 0) + // Hold the backing buffers past the cgo call. purego marshals + // []*byte by handing the C side the underlying array address; the + // pointed-to NUL-terminated bytes must outlive the call. + runtime.KeepAlive(refKeep) + runtime.KeepAlive(refPtrs) + if rc != 0 { return fmt.Errorf("vibevoice-cpp: vv_capi_tts failed (rc=%d)", rc) } return nil } +// isRefAudioOverride decides whether a TTSRequest.Voice override should +// be routed to ref_audio_paths (1.5B path) instead of voice_path +// (realtime-0.5B). Either a comma-separated list (multi-speaker) or a +// single .wav clip qualifies; a bare voice .gguf falls through. +func isRefAudioOverride(s string) bool { + if strings.Contains(s, ",") { + return true + } + return strings.HasSuffix(strings.ToLower(s), ".wav") +} + +// newCStringArray builds the **char array vv_capi_tts expects, plus the +// keep-alive slice the caller must runtime.KeepAlive across the cgo +// call. A nil/empty input returns (nil, nil) which purego marshals to +// the C NULL pointer. 
+func newCStringArray(in []string) ([]*byte, [][]byte) { + if len(in) == 0 { + return nil, nil + } + keep := make([][]byte, len(in)) + ptrs := make([]*byte, len(in)) + for i, s := range in { + b := make([]byte, len(s)+1) + copy(b, s) + keep[i] = b + ptrs[i] = &b[0] + } + return ptrs, keep +} + // asrSegment matches vibevoice's JSON output: // // [{"Start":0.0,"End":2.8,"Speaker":0,"Content":"…"}, ...] diff --git a/backend/rust/kokoros/src/service.rs b/backend/rust/kokoros/src/service.rs index ddf3576bd..0bd7cc1e2 100644 --- a/backend/rust/kokoros/src/service.rs +++ b/backend/rust/kokoros/src/service.rs @@ -351,6 +351,30 @@ impl Backend for KokorosService { Err(Status::unimplemented("Not supported")) } + async fn diarize( + &self, + _: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Not supported")) + } + + async fn audio_transform( + &self, + _: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Not supported")) + } + + type AudioTransformStreamStream = + ReceiverStream>; + + async fn audio_transform_stream( + &self, + _: Request>, + ) -> Result, Status> { + Err(Status::unimplemented("Not supported")) + } + async fn sound_generation( &self, _: Request, diff --git a/docs/content/features/audio-transform.md b/docs/content/features/audio-transform.md index 76c51558f..61269b409 100644 --- a/docs/content/features/audio-transform.md +++ b/docs/content/features/audio-transform.md @@ -141,7 +141,7 @@ options: ## See also -- [Text to Audio (TTS)]({{< relref "tts.md" >}}) +- [Text to Audio (TTS)]({{< relref "text-to-audio.md" >}}) - [Audio to Text]({{< relref "audio-to-text.md" >}}) - [LocalVQE upstream](https://github.com/localai-org/LocalVQE) - [DeepVQE paper (Indenbom et al., Interspeech 2023)](https://arxiv.org/abs/2306.03177) diff --git a/docs/content/features/text-generation.md b/docs/content/features/text-generation.md index 055dd0d73..b6e073b56 100644 --- a/docs/content/features/text-generation.md +++ 
b/docs/content/features/text-generation.md @@ -718,7 +718,7 @@ engine_args: `engine_args.data_parallel_size > 1` combined with the `local-ai p2p-worker vllm` follower lets a single model span multiple GPU nodes. See [vLLM Multi-Node (Data-Parallel)]({{% relref -"/docs/features/distributed-mode#vllm-multi-node-data-parallel" %}}) +"features/distributed-mode#vllm-multi-node-data-parallel" %}}) for the head/follower configuration and a worked Kimi-K2.6 example. ### Transformers