mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-16 12:38:01 -04:00
fix: unbreak master CI (docs, kokoros, vibevoice-cpp ABI) (#9682)
* fix(docs): correct broken Hugo relrefs The Hugo build has been failing on master since the relevant pages landed: - text-generation.md:720 referenced `/docs/features/distributed-mode`, but Hugo `relref` paths are relative to the content root, not the rendered URL. Drop the `/docs/` prefix so the lookup matches the existing `features/...` form used elsewhere in the file. - audio-transform.md:144 referenced `tts.md`; the actual page is `text-to-audio.md`. Assisted-by: Claude:claude-opus-4-7[1m] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(kokoros): stub Diarize and AudioTransform Backend trait methods The recent backend.proto additions (Diarize, AudioTransform, AudioTransformStream) extended the gRPC Backend trait, breaking kokoros-grpc compilation with E0046 because the Rust implementation hadn't picked up the new methods. Add Unimplemented stubs matching the existing pattern for non-applicable RPCs in this TTS-only backend. Assisted-by: Claude:claude-opus-4-7[1m] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(vibevoice-cpp): track upstream ABI + wire 1.5B voice cloning Two recent commits in mudler/vibevoice.cpp reshaped the vv_capi_tts signature without a corresponding bump on the LocalAI side: 3bd759c "1.5b: unify into a single tts entry point" inserted a ref_audio_path parameter between voice_path and dst_wav_path. ad856bd "1.5b: multi-speaker dialog support" promoted that to a (const char* const* ref_audio_paths, int n_ref_audio_paths) pair for per-speaker conditioning. Because purego resolves symbols by name and not by signature, the build kept linking; at runtime the misaligned arguments turned the TTS->ASR closed-loop test into a SIGSEGV inside cgo. Track HEAD explicitly and bring the bridge in line with it: * Update the CppTTS purego binding to the 9-arg form. 
purego marshals []*byte as a **char by handing the C side the underlying array address; nil/empty maps to NULL, which matches the C contract for "no reference audio" on the realtime-0.5B path. * Add a `ref_audio` gallery option (comma-separated, repeatable) that the 1.5B path consumes for runtime voice cloning. Multiple entries are interpreted as one WAV per speaker (Speaker 0..n-1). * TTSRequest.Voice now routes by extension/shape: `.wav` or a comma-separated list goes to ref_audio_paths; anything else stays on voice_path (realtime-0.5B's pre-baked voice gguf). * Pin VIBEVOICE_CPP_VERSION to ad856bd and wire the Makefile into the existing bump_deps matrix so future upstream rolls land as reviewable PRs instead of a silent CI break. Assisted-by: Claude:claude-opus-4-7[1m] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * refactor(vibevoice-cpp): use ModelOptions.AudioPath for 1.5B ref audio Use the existing audio_path field from ModelOptions (already plumbed through config_file's `audio_path:` YAML and consumed by other audio backends like kokoros) instead of inventing a custom `ref_audio:` Options[] string. Multi-speaker setups stay on a single comma- separated value. No behavior change beyond the gallery key name; per-call routing via TTSRequest.Voice is unchanged. Assisted-by: Claude:claude-opus-4-7[1m] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
4
.github/workflows/bump_deps.yaml
vendored
4
.github/workflows/bump_deps.yaml
vendored
@@ -50,6 +50,10 @@ jobs:
|
||||
variable: "QWEN3TTS_CPP_VERSION"
|
||||
branch: "main"
|
||||
file: "backend/go/qwen3-tts-cpp/Makefile"
|
||||
- repository: "mudler/vibevoice.cpp"
|
||||
variable: "VIBEVOICE_CPP_VERSION"
|
||||
branch: "master"
|
||||
file: "backend/go/vibevoice-cpp/Makefile"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
@@ -6,9 +6,12 @@ GOCMD?=go
|
||||
GO_TAGS?=
|
||||
JOBS?=$(shell nproc --ignore=1)
|
||||
|
||||
# vibevoice.cpp version
|
||||
# vibevoice.cpp version. Pinned to a commit hash and auto-bumped by
|
||||
# .github/workflows/bump_deps.yaml (the matrix entry mirrors what we
|
||||
# already do for ik_llama.cpp / llama.cpp / whisper.cpp). Floating on
|
||||
# `master` led to silent ABI breaks reaching CI — pin it.
|
||||
VIBEVOICE_REPO?=https://github.com/mudler/vibevoice.cpp
|
||||
VIBEVOICE_CPP_VERSION?=master
|
||||
VIBEVOICE_CPP_VERSION?=ad856bda6b1311b7f3d7c4a667be43eeb8a8249a
|
||||
SO_TARGET?=libgovibevoicecpp.so
|
||||
|
||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strings"
|
||||
|
||||
laudio "github.com/mudler/LocalAI/pkg/audio"
|
||||
@@ -98,9 +99,18 @@ const asrMaxNewTokens = 16384
|
||||
const vibevoiceSampleRate = uint32(24000)
|
||||
|
||||
// purego-bound entry points from libgovibevoicecpp.
|
||||
//
|
||||
// vv_capi_tts takes a `const char* const* ref_audio_paths` array (used
|
||||
// by the 1.5B variant for runtime voice cloning; the realtime-0.5B
|
||||
// path leaves it NULL and uses voice_path instead). purego marshals a
|
||||
// Go []*byte slice as **char by passing the underlying array's address.
|
||||
// A nil/empty slice marshals to NULL, which matches the C contract for
|
||||
// "no reference audio".
|
||||
var (
|
||||
CppLoad func(ttsModel, asrModel, tokenizer, voice string, threads int32) int32
|
||||
CppTTS func(text, voicePath, dstWav string,
|
||||
CppTTS func(text, voicePath string,
|
||||
refAudioPaths []*byte, nRefAudioPaths int32,
|
||||
dstWav string,
|
||||
nSteps int32, cfgScale float32, maxSpeechFrames int32, seed uint32) int32
|
||||
CppASR func(srcWav string, outJSON []byte, capacity uint64,
|
||||
maxNewTokens int32) int32
|
||||
@@ -124,6 +134,14 @@ type VibevoiceCpp struct {
|
||||
asrModel string
|
||||
tokenizer string
|
||||
voice string
|
||||
|
||||
// refAudio is the load-time default list of reference WAVs used by
|
||||
// the 1.5B model (one per speaker). Sourced from
|
||||
// ModelOptions.AudioPath (config_file's `audio_path:`) — comma-
|
||||
// separated for multi-speaker. Per-call TTSRequest.Voice can
|
||||
// override it. Empty for the realtime-0.5B path, which conditions
|
||||
// on a pre-baked voice gguf via `voice` instead.
|
||||
refAudio []string
|
||||
}
|
||||
|
||||
// resolvePath joins a relative path onto `relTo`. The gallery
|
||||
@@ -169,6 +187,25 @@ func (v *VibevoiceCpp) parseOptions(opts []string, relTo string) string {
|
||||
return role
|
||||
}
|
||||
|
||||
// parseRefAudio splits a comma-separated audio_path value into a
|
||||
// resolved list of WAVs. The 1.5B model uses one WAV per speaker;
|
||||
// callers that only need a single reference set audio_path to a single
|
||||
// path. Empty / whitespace-only entries are skipped.
|
||||
func parseRefAudio(audioPath, relTo string) []string {
|
||||
if audioPath == "" {
|
||||
return nil
|
||||
}
|
||||
var out []string
|
||||
for _, p := range strings.Split(audioPath, ",") {
|
||||
p = strings.TrimSpace(p)
|
||||
if p == "" {
|
||||
continue
|
||||
}
|
||||
out = append(out, resolvePath(p, relTo))
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func (v *VibevoiceCpp) Load(opts *pb.ModelOptions) error {
|
||||
if opts.ModelFile == "" {
|
||||
return fmt.Errorf("vibevoice-cpp: ModelFile is required")
|
||||
@@ -189,6 +226,12 @@ func (v *VibevoiceCpp) Load(opts *pb.ModelOptions) error {
|
||||
}
|
||||
role := v.parseOptions(opts.Options, v.modelRoot)
|
||||
|
||||
// 1.5B reference WAVs ride on ModelOptions.AudioPath (config_file's
|
||||
// `audio_path:` key) — same convention other audio backends already
|
||||
// follow. Single-speaker = single path; multi-speaker = comma list,
|
||||
// one WAV per Speaker N: tag in TTSRequest.text.
|
||||
v.refAudio = parseRefAudio(opts.AudioPath, v.modelRoot)
|
||||
|
||||
// ModelFile fills the "primary" role-slot determined by `type=`
|
||||
// in Options (defaults to tts). The other slot stays exactly as
|
||||
// Options set it - so a closed-loop config with ModelFile=tts.gguf
|
||||
@@ -222,8 +265,8 @@ func (v *VibevoiceCpp) Load(opts *pb.ModelOptions) error {
|
||||
v.threads = threads
|
||||
|
||||
fmt.Fprintf(os.Stderr,
|
||||
"[vibevoice-cpp] Loading: tts=%q asr=%q tokenizer=%q voice=%q threads=%d\n",
|
||||
v.ttsModel, v.asrModel, v.tokenizer, v.voice, threads)
|
||||
"[vibevoice-cpp] Loading: tts=%q asr=%q tokenizer=%q voice=%q ref_audio=%v threads=%d\n",
|
||||
v.ttsModel, v.asrModel, v.tokenizer, v.voice, v.refAudio, threads)
|
||||
|
||||
if rc := CppLoad(v.ttsModel, v.asrModel, v.tokenizer, v.voice, int32(threads)); rc != 0 {
|
||||
return fmt.Errorf("vibevoice-cpp: vv_capi_load failed (rc=%d)", rc)
|
||||
@@ -241,10 +284,35 @@ func (v *VibevoiceCpp) TTS(req *pb.TTSRequest) error {
|
||||
return fmt.Errorf("vibevoice-cpp: TTS requires both text and dst")
|
||||
}
|
||||
|
||||
// req.Voice may be a bare filename (e.g. "voice-en-Emma.gguf") or an
|
||||
// absolute path. Resolve via the same modelRoot Load() used for
|
||||
// Options[] so a swap-voice request mirrors the gallery's layout.
|
||||
voice := resolvePath(req.Voice, v.modelRoot)
|
||||
// TTSRequest.Voice carries the per-call override. Routing depends
|
||||
// on the loaded model variant:
|
||||
// * realtime-0.5B → expects a baked voice .gguf (single path).
|
||||
// * 1.5B → expects one or more raw 24 kHz mono .wav
|
||||
// reference clips for runtime voice cloning;
|
||||
// comma-separated to address multi-speaker
|
||||
// dialogs (Speaker 0..n-1 follow the order).
|
||||
// We pick the branch by extension / shape of the override; if no
|
||||
// override is given, fall back to the load-time defaults.
|
||||
voice := ""
|
||||
var refAudio []string
|
||||
if reqVoice := strings.TrimSpace(req.Voice); reqVoice != "" {
|
||||
if isRefAudioOverride(reqVoice) {
|
||||
for _, p := range strings.Split(reqVoice, ",") {
|
||||
p = strings.TrimSpace(p)
|
||||
if p == "" {
|
||||
continue
|
||||
}
|
||||
refAudio = append(refAudio, resolvePath(p, v.modelRoot))
|
||||
}
|
||||
} else {
|
||||
voice = resolvePath(reqVoice, v.modelRoot)
|
||||
}
|
||||
} else {
|
||||
// No per-call override. v.voice already went to vv_capi_load
|
||||
// for realtime-0.5B; ref_audio is per-call only on the C ABI,
|
||||
// so the load-time `audio_path:` defaults (v.refAudio) are re-passed here.
|
||||
refAudio = append(refAudio, v.refAudio...)
|
||||
}
|
||||
|
||||
if req.Language != nil && *req.Language != "" {
|
||||
fmt.Fprintf(os.Stderr,
|
||||
@@ -257,13 +325,51 @@ func (v *VibevoiceCpp) TTS(req *pb.TTSRequest) error {
|
||||
defaultMaxFrames = 200
|
||||
)
|
||||
defaultCfg := float32(1.3)
|
||||
if rc := CppTTS(text, voice, dst,
|
||||
int32(defaultSteps), defaultCfg, int32(defaultMaxFrames), 0); rc != 0 {
|
||||
|
||||
refPtrs, refKeep := newCStringArray(refAudio)
|
||||
rc := CppTTS(text, voice, refPtrs, int32(len(refPtrs)), dst,
|
||||
int32(defaultSteps), defaultCfg, int32(defaultMaxFrames), 0)
|
||||
// Hold the backing buffers past the cgo call. purego marshals
|
||||
// []*byte by handing the C side the underlying array address; the
|
||||
// pointed-to NUL-terminated bytes must outlive the call.
|
||||
runtime.KeepAlive(refKeep)
|
||||
runtime.KeepAlive(refPtrs)
|
||||
if rc != 0 {
|
||||
return fmt.Errorf("vibevoice-cpp: vv_capi_tts failed (rc=%d)", rc)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// isRefAudioOverride reports whether a TTSRequest.Voice override should
// be treated as reference audio (ref_audio_paths) rather than as a
// voice_path. It is true when s contains a comma (a multi-entry list)
// or when s ends in ".wav", compared case-insensitively. Anything
// else, such as a bare voice .gguf, reports false.
func isRefAudioOverride(s string) bool {
	return strings.ContainsRune(s, ',') ||
		strings.HasSuffix(strings.ToLower(s), ".wav")
}
|
||||
|
||||
// newCStringArray converts a list of Go strings into two parallel
// slices: one of pointers to the first byte of each string's
// NUL-terminated copy, and one holding those copies themselves. The
// second slice exists so the caller can keep the buffers reachable
// (runtime.KeepAlive) for as long as the pointers are in use. An empty
// or nil input returns (nil, nil).
func newCStringArray(in []string) ([]*byte, [][]byte) {
	n := len(in)
	if n == 0 {
		return nil, nil
	}
	ptrs := make([]*byte, 0, n)
	keep := make([][]byte, 0, n)
	for _, s := range in {
		// Copy the string bytes and add the trailing NUL in one step.
		buf := append([]byte(s), 0)
		keep = append(keep, buf)
		ptrs = append(ptrs, &buf[0])
	}
	return ptrs, keep
}
|
||||
|
||||
// asrSegment matches vibevoice's JSON output:
|
||||
//
|
||||
// [{"Start":0.0,"End":2.8,"Speaker":0,"Content":"…"}, ...]
|
||||
|
||||
@@ -351,6 +351,30 @@ impl Backend for KokorosService {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn diarize(
|
||||
&self,
|
||||
_: Request<backend::DiarizeRequest>,
|
||||
) -> Result<Response<backend::DiarizeResponse>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn audio_transform(
|
||||
&self,
|
||||
_: Request<backend::AudioTransformRequest>,
|
||||
) -> Result<Response<backend::AudioTransformResult>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
type AudioTransformStreamStream =
|
||||
ReceiverStream<Result<backend::AudioTransformFrameResponse, Status>>;
|
||||
|
||||
async fn audio_transform_stream(
|
||||
&self,
|
||||
_: Request<tonic::Streaming<backend::AudioTransformFrameRequest>>,
|
||||
) -> Result<Response<Self::AudioTransformStreamStream>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn sound_generation(
|
||||
&self,
|
||||
_: Request<backend::SoundGenerationRequest>,
|
||||
|
||||
@@ -141,7 +141,7 @@ options:
|
||||
|
||||
## See also
|
||||
|
||||
- [Text to Audio (TTS)]({{< relref "tts.md" >}})
|
||||
- [Text to Audio (TTS)]({{< relref "text-to-audio.md" >}})
|
||||
- [Audio to Text]({{< relref "audio-to-text.md" >}})
|
||||
- [LocalVQE upstream](https://github.com/localai-org/LocalVQE)
|
||||
- [DeepVQE paper (Indenbom et al., Interspeech 2023)](https://arxiv.org/abs/2306.03177)
|
||||
|
||||
@@ -718,7 +718,7 @@ engine_args:
|
||||
`engine_args.data_parallel_size > 1` combined with the
|
||||
`local-ai p2p-worker vllm` follower lets a single model span multiple
|
||||
GPU nodes. See [vLLM Multi-Node (Data-Parallel)]({{% relref
|
||||
"/docs/features/distributed-mode#vllm-multi-node-data-parallel" %}})
|
||||
"features/distributed-mode#vllm-multi-node-data-parallel" %}})
|
||||
for the head/follower configuration and a worked Kimi-K2.6 example.
|
||||
|
||||
### Transformers
|
||||
|
||||
Reference in New Issue
Block a user