From 7dc691c171b23632938cb68ab031af6716014978 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 12 Mar 2026 07:48:23 +0100 Subject: [PATCH] feat: add fish-speech backend (#8962) * feat: add fish-speech backend Signed-off-by: Ettore Di Giacinto * drop portaudio Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Ettore Di Giacinto --- .github/workflows/backend.yml | 94 ++++ .github/workflows/test-extra.yml | 22 + Makefile | 8 +- backend/Dockerfile.python | 5 + backend/index.yaml | 118 +++++ backend/python/fish-speech/Makefile | 23 + backend/python/fish-speech/backend.py | 457 ++++++++++++++++++ backend/python/fish-speech/install.sh | 51 ++ backend/python/fish-speech/package.sh | 15 + .../python/fish-speech/requirements-cpu.txt | 3 + .../fish-speech/requirements-cublas12.txt | 3 + .../fish-speech/requirements-cublas13.txt | 3 + .../fish-speech/requirements-hipblas.txt | 3 + .../python/fish-speech/requirements-intel.txt | 3 + .../python/fish-speech/requirements-l4t12.txt | 3 + .../python/fish-speech/requirements-l4t13.txt | 3 + .../python/fish-speech/requirements-mps.txt | 2 + backend/python/fish-speech/requirements.txt | 9 + backend/python/fish-speech/run.sh | 9 + backend/python/fish-speech/test.py | 175 +++++++ backend/python/fish-speech/test.sh | 11 + gallery/index.yaml | 19 + 22 files changed, 1037 insertions(+), 2 deletions(-) create mode 100644 backend/python/fish-speech/Makefile create mode 100644 backend/python/fish-speech/backend.py create mode 100644 backend/python/fish-speech/install.sh create mode 100755 backend/python/fish-speech/package.sh create mode 100644 backend/python/fish-speech/requirements-cpu.txt create mode 100644 backend/python/fish-speech/requirements-cublas12.txt create mode 100644 backend/python/fish-speech/requirements-cublas13.txt create mode 100644 backend/python/fish-speech/requirements-hipblas.txt create mode 100644 backend/python/fish-speech/requirements-intel.txt create mode 100644 
backend/python/fish-speech/requirements-l4t12.txt create mode 100644 backend/python/fish-speech/requirements-l4t13.txt create mode 100644 backend/python/fish-speech/requirements-mps.txt create mode 100644 backend/python/fish-speech/requirements.txt create mode 100644 backend/python/fish-speech/run.sh create mode 100644 backend/python/fish-speech/test.py create mode 100644 backend/python/fish-speech/test.sh diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index ee40ab3a1..030b02ef8 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -223,6 +223,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "8" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-12-fish-speech' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "fish-speech" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "12" cuda-minor-version: "8" @@ -614,6 +627,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-13-fish-speech' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "fish-speech" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "13" cuda-minor-version: "0" @@ -757,6 +783,19 @@ jobs: backend: "qwen-tts" dockerfile: "./backend/Dockerfile.python" context: "./" + - build-type: 'l4t' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/arm64' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-cuda-13-arm64-fish-speech' + runs-on: 'ubuntu-24.04-arm' + base-image: 
"ubuntu:24.04" + skip-drivers: 'false' + ubuntu-version: '2404' + backend: "fish-speech" + dockerfile: "./backend/Dockerfile.python" + context: "./" - build-type: 'l4t' cuda-major-version: "13" cuda-minor-version: "0" @@ -1201,6 +1240,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'hipblas' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-rocm-hipblas-fish-speech' + runs-on: 'arc-runner-set' + base-image: "rocm/dev-ubuntu-24.04:6.4.4" + skip-drivers: 'false' + backend: "fish-speech" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'hipblas' cuda-major-version: "" cuda-minor-version: "" @@ -1397,6 +1449,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2204' + - build-type: 'l4t' + cuda-major-version: "12" + cuda-minor-version: "0" + platforms: 'linux/arm64' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-fish-speech' + runs-on: 'ubuntu-24.04-arm' + base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0" + skip-drivers: 'true' + backend: "fish-speech" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2204' - build-type: 'l4t' cuda-major-version: "12" cuda-minor-version: "0" @@ -1567,6 +1632,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'intel' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-intel-fish-speech' + runs-on: 'arc-runner-set' + base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04" + skip-drivers: 'false' + backend: "fish-speech" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'intel' cuda-major-version: "" cuda-minor-version: "" @@ -2019,6 +2097,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' 
+ - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64,linux/arm64' + tag-latest: 'auto' + tag-suffix: '-cpu-fish-speech' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "fish-speech" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: '' cuda-major-version: "" cuda-minor-version: "" @@ -2108,6 +2199,9 @@ jobs: - backend: "qwen-tts" tag-suffix: "-metal-darwin-arm64-qwen-tts" build-type: "mps" + - backend: "fish-speech" + tag-suffix: "-metal-darwin-arm64-fish-speech" + build-type: "mps" - backend: "voxcpm" tag-suffix: "-metal-darwin-arm64-voxcpm" build-type: "mps" diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml index fee41fe7f..a254cafa5 100644 --- a/.github/workflows/test-extra.yml +++ b/.github/workflows/test-extra.yml @@ -304,6 +304,28 @@ jobs: run: | make --jobs=5 --output-sync=target -C backend/python/qwen-tts make --jobs=5 --output-sync=target -C backend/python/qwen-tts test + # TODO: s2-pro model is too large to load on CPU-only CI runners — re-enable + # when we have GPU runners or a smaller test model. 
+ # tests-fish-speech: + # runs-on: ubuntu-latest + # timeout-minutes: 45 + # steps: + # - name: Clone + # uses: actions/checkout@v6 + # with: + # submodules: true + # - name: Dependencies + # run: | + # sudo apt-get update + # sudo apt-get install -y build-essential ffmpeg portaudio19-dev + # sudo apt-get install -y ca-certificates cmake curl patch python3-pip + # # Install UV + # curl -LsSf https://astral.sh/uv/install.sh | sh + # pip install --user --no-cache-dir grpcio-tools==1.64.1 + # - name: Test fish-speech + # run: | + # make --jobs=5 --output-sync=target -C backend/python/fish-speech + # make --jobs=5 --output-sync=target -C backend/python/fish-speech test tests-qwen-asr: runs-on: ubuntu-latest steps: diff --git a/Makefile b/Makefile index 15af1d39a..fe1f94fdd 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Disable parallel execution for backend builds -.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/voxtral +.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend 
build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/fish-speech backends/voxtral GOCMD=go GOTEST=$(GOCMD) test @@ -331,6 +331,7 @@ prepare-test-extra: protogen-python $(MAKE) -C backend/python/moonshine $(MAKE) -C backend/python/pocket-tts $(MAKE) -C backend/python/qwen-tts + $(MAKE) -C backend/python/fish-speech $(MAKE) -C backend/python/faster-qwen3-tts $(MAKE) -C backend/python/qwen-asr $(MAKE) -C backend/python/nemo @@ -349,6 +350,7 @@ test-extra: prepare-test-extra $(MAKE) -C backend/python/moonshine test $(MAKE) -C backend/python/pocket-tts test $(MAKE) -C backend/python/qwen-tts test + $(MAKE) -C backend/python/fish-speech test $(MAKE) -C backend/python/faster-qwen3-tts test $(MAKE) -C backend/python/qwen-asr test $(MAKE) -C backend/python/nemo test @@ -493,6 +495,7 @@ BACKEND_VIBEVOICE = vibevoice|python|.|--progress=plain|true BACKEND_MOONSHINE = moonshine|python|.|false|true BACKEND_POCKET_TTS = pocket-tts|python|.|false|true BACKEND_QWEN_TTS = qwen-tts|python|.|false|true +BACKEND_FISH_SPEECH = fish-speech|python|.|false|true BACKEND_FASTER_QWEN3_TTS = faster-qwen3-tts|python|.|false|true BACKEND_QWEN_ASR = qwen-asr|python|.|false|true BACKEND_NEMO = nemo|python|.|false|true @@ -547,6 +550,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_VIBEVOICE))) $(eval $(call generate-docker-build-target,$(BACKEND_MOONSHINE))) $(eval $(call generate-docker-build-target,$(BACKEND_POCKET_TTS))) $(eval $(call generate-docker-build-target,$(BACKEND_QWEN_TTS))) +$(eval $(call generate-docker-build-target,$(BACKEND_FISH_SPEECH))) $(eval $(call generate-docker-build-target,$(BACKEND_FASTER_QWEN3_TTS))) $(eval $(call 
generate-docker-build-target,$(BACKEND_QWEN_ASR))) $(eval $(call generate-docker-build-target,$(BACKEND_NEMO))) @@ -559,7 +563,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_MLX_DISTRIBUTED))) docker-save-%: backend-images docker save local-ai-backend:$* -o backend-images/$*.tar -docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-voxtral docker-build-mlx-distributed +docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-voxtral docker-build-mlx-distributed ######################################################## ### Mock Backend for E2E Tests diff --git a/backend/Dockerfile.python b/backend/Dockerfile.python index 3067f670f..5d2e6171e 100644 --- a/backend/Dockerfile.python +++ b/backend/Dockerfile.python @@ -202,6 +202,11 @@ RUN mkdir -p /${BACKEND}/lib && \ TARGET_LIB_DIR="/${BACKEND}/lib" BUILD_TYPE="${BUILD_TYPE}" CUDA_MAJOR_VERSION="${CUDA_MAJOR_VERSION}" \ bash /package-gpu-libs.sh "/${BACKEND}/lib" +# Run backend-specific packaging if a package.sh exists +RUN if [ -f "/${BACKEND}/package.sh" ]; then \ + cd /${BACKEND} && bash 
package.sh; \ + fi + FROM scratch ARG BACKEND=rerankers COPY --from=builder /${BACKEND}/ / \ No newline at end of file diff --git a/backend/index.yaml b/backend/index.yaml index 392afa735..2271ad1b1 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -553,6 +553,30 @@ nvidia-l4t-cuda-12: "nvidia-l4t-qwen-tts" nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-qwen-tts" icon: https://cdn-avatars.huggingface.co/v1/production/uploads/620760a26e3b7210c2ff1943/-s1gyJfvbE1RgO5iBeNOi.png +- &fish-speech + urls: + - https://github.com/fishaudio/fish-speech + description: | + Fish Speech is a high-quality text-to-speech model supporting voice cloning via reference audio. + tags: + - text-to-speech + - TTS + - voice-cloning + license: apache-2.0 + name: "fish-speech" + alias: "fish-speech" + capabilities: + nvidia: "cuda12-fish-speech" + intel: "intel-fish-speech" + amd: "rocm-fish-speech" + nvidia-l4t: "nvidia-l4t-fish-speech" + metal: "metal-fish-speech" + default: "cpu-fish-speech" + nvidia-cuda-13: "cuda13-fish-speech" + nvidia-cuda-12: "cuda12-fish-speech" + nvidia-l4t-cuda-12: "nvidia-l4t-fish-speech" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-fish-speech" + icon: https://avatars.githubusercontent.com/u/148526220?s=200&v=4 - &faster-qwen3-tts urls: - https://github.com/andimarafioti/faster-qwen3-tts @@ -2382,6 +2406,100 @@ uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-qwen-tts" mirrors: - localai/localai-backends:master-metal-darwin-arm64-qwen-tts +## fish-speech +- !!merge <<: *fish-speech + name: "fish-speech-development" + capabilities: + nvidia: "cuda12-fish-speech-development" + intel: "intel-fish-speech-development" + amd: "rocm-fish-speech-development" + nvidia-l4t: "nvidia-l4t-fish-speech-development" + metal: "metal-fish-speech-development" + default: "cpu-fish-speech-development" + nvidia-cuda-13: "cuda13-fish-speech-development" + nvidia-cuda-12: "cuda12-fish-speech-development" + nvidia-l4t-cuda-12: 
"nvidia-l4t-fish-speech-development" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-fish-speech-development" +- !!merge <<: *fish-speech + name: "cpu-fish-speech" + uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-fish-speech" + mirrors: + - localai/localai-backends:latest-cpu-fish-speech +- !!merge <<: *fish-speech + name: "cpu-fish-speech-development" + uri: "quay.io/go-skynet/local-ai-backends:master-cpu-fish-speech" + mirrors: + - localai/localai-backends:master-cpu-fish-speech +- !!merge <<: *fish-speech + name: "cuda12-fish-speech" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-fish-speech" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-12-fish-speech +- !!merge <<: *fish-speech + name: "cuda12-fish-speech-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-fish-speech" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-12-fish-speech +- !!merge <<: *fish-speech + name: "cuda13-fish-speech" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-fish-speech" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-13-fish-speech +- !!merge <<: *fish-speech + name: "cuda13-fish-speech-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-fish-speech" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-13-fish-speech +- !!merge <<: *fish-speech + name: "intel-fish-speech" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-fish-speech" + mirrors: + - localai/localai-backends:latest-gpu-intel-fish-speech +- !!merge <<: *fish-speech + name: "intel-fish-speech-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-fish-speech" + mirrors: + - localai/localai-backends:master-gpu-intel-fish-speech +- !!merge <<: *fish-speech + name: "rocm-fish-speech" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-fish-speech" + mirrors: + - 
localai/localai-backends:latest-gpu-rocm-hipblas-fish-speech +- !!merge <<: *fish-speech + name: "rocm-fish-speech-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-fish-speech" + mirrors: + - localai/localai-backends:master-gpu-rocm-hipblas-fish-speech +- !!merge <<: *fish-speech + name: "nvidia-l4t-fish-speech" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-fish-speech" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-fish-speech +- !!merge <<: *fish-speech + name: "nvidia-l4t-fish-speech-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-fish-speech" + mirrors: + - localai/localai-backends:master-nvidia-l4t-fish-speech +- !!merge <<: *fish-speech + name: "cuda13-nvidia-l4t-arm64-fish-speech" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-fish-speech" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-fish-speech +- !!merge <<: *fish-speech + name: "cuda13-nvidia-l4t-arm64-fish-speech-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-fish-speech" + mirrors: + - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-fish-speech +- !!merge <<: *fish-speech + name: "metal-fish-speech" + uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-fish-speech" + mirrors: + - localai/localai-backends:latest-metal-darwin-arm64-fish-speech +- !!merge <<: *fish-speech + name: "metal-fish-speech-development" + uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-fish-speech" + mirrors: + - localai/localai-backends:master-metal-darwin-arm64-fish-speech ## faster-qwen3-tts - !!merge <<: *faster-qwen3-tts name: "faster-qwen3-tts-development" diff --git a/backend/python/fish-speech/Makefile b/backend/python/fish-speech/Makefile new file mode 100644 index 000000000..ace1ef3de --- /dev/null +++ b/backend/python/fish-speech/Makefile @@ -0,0 +1,23 @@ +.PHONY: fish-speech +fish-speech: + bash 
install.sh + +.PHONY: run +run: fish-speech + @echo "Running fish-speech..." + bash run.sh + @echo "fish-speech run." + +.PHONY: test +test: fish-speech + @echo "Testing fish-speech..." + bash test.sh + @echo "fish-speech tested." + +.PHONY: protogen-clean +protogen-clean: + $(RM) backend_pb2_grpc.py backend_pb2.py + +.PHONY: clean +clean: protogen-clean + rm -rf venv __pycache__ diff --git a/backend/python/fish-speech/backend.py b/backend/python/fish-speech/backend.py new file mode 100644 index 000000000..921b71efc --- /dev/null +++ b/backend/python/fish-speech/backend.py @@ -0,0 +1,457 @@ +#!/usr/bin/env python3 +""" +This is an extra gRPC server of LocalAI for fish-speech TTS +""" + +from concurrent import futures +import time +import argparse +import signal +import sys +import os +import traceback +import backend_pb2 +import backend_pb2_grpc +import torch +import soundfile as sf +import numpy as np + +import json + +import grpc + + +def is_float(s): + """Check if a string can be converted to float.""" + try: + float(s) + return True + except ValueError: + return False + + +def is_int(s): + """Check if a string can be converted to int.""" + try: + int(s) + return True + except ValueError: + return False + + +_ONE_DAY_IN_SECONDS = 60 * 60 * 24 + +# If MAX_WORKERS are specified in the environment use it, otherwise default to 1 +MAX_WORKERS = int(os.environ.get("PYTHON_GRPC_MAX_WORKERS", "1")) + + +# Implement the BackendServicer class with the service methods +class BackendServicer(backend_pb2_grpc.BackendServicer): + """ + BackendServicer is the class that implements the gRPC service + """ + + def Health(self, request, context): + return backend_pb2.Reply(message=bytes("OK", "utf-8")) + + def LoadModel(self, request, context): + try: + # Get device + if torch.cuda.is_available(): + print("CUDA is available", file=sys.stderr) + device = "cuda" + else: + print("CUDA is not available", file=sys.stderr) + device = "cpu" + mps_available = ( + hasattr(torch.backends, 
"mps") and torch.backends.mps.is_available() + ) + if mps_available: + device = "mps" + if not torch.cuda.is_available() and request.CUDA: + return backend_pb2.Result(success=False, message="CUDA is not available") + + # Validate mps availability if requested + if device == "mps" and not torch.backends.mps.is_available(): + print("Warning: MPS not available. Falling back to CPU.", file=sys.stderr) + device = "cpu" + + self.device = device + self._torch_device = torch.device(device) + + options = request.Options + + # empty dict + self.options = {} + + # The options are a list of strings in this form optname:optvalue + for opt in options: + if ":" not in opt: + continue + key, value = opt.split(":", 1) + if is_float(value): + value = float(value) + elif is_int(value): + value = int(value) + elif value.lower() in ["true", "false"]: + value = value.lower() == "true" + self.options[key] = value + + # Parse voices configuration from options + self.voices = {} + if "voices" in self.options: + try: + voices_data = self.options["voices"] + if isinstance(voices_data, str): + voices_list = json.loads(voices_data) + else: + voices_list = voices_data + + for voice_entry in voices_list: + if not isinstance(voice_entry, dict): + print( + f"[WARNING] Invalid voice entry (not a dict): {voice_entry}", + file=sys.stderr, + ) + continue + + name = voice_entry.get("name") + audio = voice_entry.get("audio") + ref_text = voice_entry.get("ref_text", "") + + if not name or not isinstance(name, str): + print( + f"[WARNING] Voice entry missing required 'name' field: {voice_entry}", + file=sys.stderr, + ) + continue + if not audio or not isinstance(audio, str): + print( + f"[WARNING] Voice entry missing required 'audio' field: {voice_entry}", + file=sys.stderr, + ) + continue + + self.voices[name] = {"audio": audio, "ref_text": ref_text} + print( + f"[INFO] Registered voice '{name}' with audio: {audio}", + file=sys.stderr, + ) + + print(f"[INFO] Loaded {len(self.voices)} voice(s)", 
file=sys.stderr) + except json.JSONDecodeError as e: + print(f"[ERROR] Failed to parse voices JSON: {e}", file=sys.stderr) + except Exception as e: + print( + f"[ERROR] Error processing voices configuration: {e}", + file=sys.stderr, + ) + print(traceback.format_exc(), file=sys.stderr) + + # Store AudioPath, ModelFile, and ModelPath from LoadModel request + self.audio_path = ( + request.AudioPath + if hasattr(request, "AudioPath") and request.AudioPath + else None + ) + self.model_file = ( + request.ModelFile + if hasattr(request, "ModelFile") and request.ModelFile + else None + ) + self.model_path = ( + request.ModelPath + if hasattr(request, "ModelPath") and request.ModelPath + else None + ) + + # Get model path from request + model_path = request.Model + if not model_path: + model_path = "fishaudio/s2-pro" + + # If model_path looks like a HuggingFace repo ID (e.g. "fishaudio/fish-speech-1.5"), + # download it locally first since fish-speech expects a local directory + if "/" in model_path and not os.path.exists(model_path): + from huggingface_hub import snapshot_download + + print( + f"Downloading model from HuggingFace: {model_path}", + file=sys.stderr, + ) + model_path = snapshot_download(repo_id=model_path) + print(f"Model downloaded to: {model_path}", file=sys.stderr) + + # Determine precision + if device in ("mps", "cpu"): + precision = torch.float32 + else: + precision = torch.bfloat16 + + # Whether to use torch.compile + compile_model = self.options.get("compile", False) + + print( + f"Using device: {device}, precision: {precision}, compile: {compile_model}", + file=sys.stderr, + ) + print(f"Loading model from: {model_path}", file=sys.stderr) + + # Import fish-speech modules + from fish_speech.inference_engine import TTSInferenceEngine + from fish_speech.models.dac.inference import load_model as load_decoder_model + from fish_speech.models.text2semantic.inference import ( + launch_thread_safe_queue, + ) + + # Determine decoder checkpoint path + # The codec 
model is typically at /codec.pth + decoder_checkpoint = self.options.get("decoder_checkpoint", None) + if not decoder_checkpoint: + # Try common locations + if os.path.isdir(model_path): + candidate = os.path.join(model_path, "codec.pth") + if os.path.exists(candidate): + decoder_checkpoint = candidate + + # Launch LLaMA queue (runs in daemon thread) + print("Launching LLaMA queue...", file=sys.stderr) + llama_queue = launch_thread_safe_queue( + checkpoint_path=model_path, + device=device, + precision=precision, + compile=compile_model, + ) + + # Load DAC decoder + decoder_config = self.options.get("decoder_config", "modded_dac_vq") + if not decoder_checkpoint: + return backend_pb2.Result( + success=False, + message="Decoder checkpoint (codec.pth) not found. " + "Ensure the model directory contains codec.pth or set " + "decoder_checkpoint option.", + ) + print( + f"Loading DAC decoder (config={decoder_config}, checkpoint={decoder_checkpoint})...", + file=sys.stderr, + ) + decoder_model = load_decoder_model( + config_name=decoder_config, + checkpoint_path=decoder_checkpoint, + device=device, + ) + + # Create TTS inference engine + self.engine = TTSInferenceEngine( + llama_queue=llama_queue, + decoder_model=decoder_model, + precision=precision, + compile=compile_model, + ) + + print(f"Model loaded successfully: {model_path}", file=sys.stderr) + + return backend_pb2.Result(message="Model loaded successfully", success=True) + + except Exception as e: + print(f"[ERROR] Loading model: {type(e).__name__}: {e}", file=sys.stderr) + print(traceback.format_exc(), file=sys.stderr) + return backend_pb2.Result( + success=False, message=f"Failed to load model: {e}" + ) + + def _get_ref_audio_path(self, voice_name=None): + """Get reference audio path from voices dict or stored AudioPath.""" + if voice_name and voice_name in self.voices: + audio_path = self.voices[voice_name]["audio"] + + if os.path.isabs(audio_path): + return audio_path + + # Try relative to ModelFile + if 
self.model_file: + model_file_base = os.path.dirname(self.model_file) + ref_path = os.path.join(model_file_base, audio_path) + if os.path.exists(ref_path): + return ref_path + + # Try relative to ModelPath + if self.model_path: + ref_path = os.path.join(self.model_path, audio_path) + if os.path.exists(ref_path): + return ref_path + + return audio_path + + # Fall back to legacy single-voice mode + if not self.audio_path: + return None + + if os.path.isabs(self.audio_path): + return self.audio_path + + if self.model_file: + model_file_base = os.path.dirname(self.model_file) + ref_path = os.path.join(model_file_base, self.audio_path) + if os.path.exists(ref_path): + return ref_path + + if self.model_path: + ref_path = os.path.join(self.model_path, self.audio_path) + if os.path.exists(ref_path): + return ref_path + + return self.audio_path + + def TTS(self, request, context): + try: + from fish_speech.utils.schema import ServeTTSRequest, ServeReferenceAudio + + if not request.dst: + return backend_pb2.Result( + success=False, message="dst (output path) is required" + ) + + text = request.text.strip() + if not text: + return backend_pb2.Result(success=False, message="Text is empty") + + # Get generation parameters from options + top_p = self.options.get("top_p", 0.8) + temperature = self.options.get("temperature", 0.8) + repetition_penalty = self.options.get("repetition_penalty", 1.1) + max_new_tokens = self.options.get("max_new_tokens", 1024) + chunk_length = self.options.get("chunk_length", 200) + + # Build references list for voice cloning + references = [] + voice_name = request.voice if request.voice else None + + if voice_name and voice_name in self.voices: + ref_audio_path = self._get_ref_audio_path(voice_name) + if ref_audio_path and os.path.exists(ref_audio_path): + with open(ref_audio_path, "rb") as f: + audio_bytes = f.read() + ref_text = self.voices[voice_name].get("ref_text", "") + references.append( + ServeReferenceAudio(audio=audio_bytes, text=ref_text) + 
) + print( + f"[INFO] Using voice '{voice_name}' with reference audio: {ref_audio_path}", + file=sys.stderr, + ) + elif self.audio_path: + ref_audio_path = self._get_ref_audio_path() + if ref_audio_path and os.path.exists(ref_audio_path): + with open(ref_audio_path, "rb") as f: + audio_bytes = f.read() + ref_text = self.options.get("ref_text", "") + references.append( + ServeReferenceAudio(audio=audio_bytes, text=ref_text) + ) + print( + f"[INFO] Using reference audio: {ref_audio_path}", + file=sys.stderr, + ) + + # Build ServeTTSRequest + tts_request = ServeTTSRequest( + text=text, + references=references, + top_p=top_p, + temperature=temperature, + repetition_penalty=repetition_penalty, + max_new_tokens=max_new_tokens, + chunk_length=chunk_length, + ) + + # Run inference + print(f"Generating speech for text: {text[:100]}...", file=sys.stderr) + start_time = time.time() + + sample_rate = None + audio_data = None + + for result in self.engine.inference(tts_request): + if result.code == "final": + sample_rate, audio_data = result.audio + elif result.code == "error": + error_msg = str(result.error) if result.error else "Unknown error" + print(f"[ERROR] TTS inference error: {error_msg}", file=sys.stderr) + return backend_pb2.Result( + success=False, message=f"TTS inference error: {error_msg}" + ) + + generation_duration = time.time() - start_time + + if audio_data is None or sample_rate is None: + return backend_pb2.Result( + success=False, message="No audio output generated" + ) + + # Ensure audio_data is a numpy array + if not isinstance(audio_data, np.ndarray): + audio_data = np.array(audio_data) + + audio_duration = len(audio_data) / sample_rate if sample_rate > 0 else 0 + print( + f"[INFO] TTS generation completed: {generation_duration:.2f}s, " + f"audio_duration={audio_duration:.2f}s, sample_rate={sample_rate}", + file=sys.stderr, + flush=True, + ) + + # Save output + sf.write(request.dst, audio_data, sample_rate) + print(f"Saved {audio_duration:.2f}s audio to 
{request.dst}", file=sys.stderr) + + except Exception as err: + print(f"Error in TTS: {err}", file=sys.stderr) + print(traceback.format_exc(), file=sys.stderr) + return backend_pb2.Result( + success=False, message=f"Unexpected {err=}, {type(err)=}" + ) + + return backend_pb2.Result(success=True) + + +def serve(address): + server = grpc.server( + futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), + options=[ + ("grpc.max_message_length", 50 * 1024 * 1024), # 50MB + ("grpc.max_send_message_length", 50 * 1024 * 1024), # 50MB + ("grpc.max_receive_message_length", 50 * 1024 * 1024), # 50MB + ], + ) + backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) + server.add_insecure_port(address) + server.start() + print("Server started. Listening on: " + address, file=sys.stderr) + + # Define the signal handler function + def signal_handler(sig, frame): + print("Received termination signal. Shutting down...") + server.stop(0) + sys.exit(0) + + # Set the signal handlers for SIGINT and SIGTERM + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + try: + while True: + time.sleep(_ONE_DAY_IN_SECONDS) + except KeyboardInterrupt: + server.stop(0) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run the gRPC server.") + parser.add_argument( + "--addr", default="localhost:50051", help="The address to bind the server to." 
#!/bin/bash
# Install script for the fish-speech backend: sets up the venv, clones the
# upstream fish-speech sources (the PyPI package lacks the inference modules),
# and installs them editable so the backend can import them.
set -e

EXTRA_PIP_INSTALL_FLAGS="--no-build-isolation"

backend_dir=$(dirname $0)
if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

# fish-speech uses pyrootutils which requires a .project-root marker
touch "${backend_dir}/.project-root"

installRequirements

# Clone fish-speech source (the pip package doesn't include inference modules)
FISH_SPEECH_DIR="${EDIR}/fish-speech-src"
FISH_SPEECH_REPO="https://github.com/fishaudio/fish-speech.git"
FISH_SPEECH_BRANCH="main"

if [ ! -d "${FISH_SPEECH_DIR}" ]; then
    echo "Cloning fish-speech source..."
    git clone --depth 1 --branch "${FISH_SPEECH_BRANCH}" "${FISH_SPEECH_REPO}" "${FISH_SPEECH_DIR}"
else
    echo "Updating fish-speech source..."
    cd "${FISH_SPEECH_DIR}" && git pull && cd -
fi

# Remove pyaudio from fish-speech deps -- it's only used by the upstream client
# tool (tools/api_client.py) for speaker playback, not by our gRPC backend
# server. It requires native portaudio libs which aren't available on all
# build environments.
sed -i.bak '/"pyaudio"/d' "${FISH_SPEECH_DIR}/pyproject.toml"

# Helper: install packages with pip or uv depending on the build configuration,
# so the pip/uv branching is written once instead of per install step.
pip_install() {
    if [ "x${USE_PIP}" == "xtrue" ]; then
        pip install "$@"
    else
        uv pip install "$@"
    fi
}

# Install fish-speech editable from source so its inference modules and
# transitive dependencies become importable by the backend.
# (Note: `pip install -e` DOES install the package itself, registering it
# in the venv as an editable install pointing at the clone.)
ensureVenv
pip_install ${EXTRA_PIP_INSTALL_FLAGS:-} -e "${FISH_SPEECH_DIR}"

# fish-speech transitive deps (wandb, tensorboard) may downgrade protobuf to 3.x
# but our generated backend_pb2.py requires protobuf 5+
pip_install "protobuf>=5.29.0"
+ +set -e + +CURDIR=$(dirname "$(realpath $0)") + +# Create lib directory +mkdir -p $CURDIR/lib + +echo "fish-speech packaging completed successfully" +ls -liah $CURDIR/lib/ diff --git a/backend/python/fish-speech/requirements-cpu.txt b/backend/python/fish-speech/requirements-cpu.txt new file mode 100644 index 000000000..5c213d676 --- /dev/null +++ b/backend/python/fish-speech/requirements-cpu.txt @@ -0,0 +1,3 @@ +--extra-index-url https://download.pytorch.org/whl/cpu +torch +torchaudio diff --git a/backend/python/fish-speech/requirements-cublas12.txt b/backend/python/fish-speech/requirements-cublas12.txt new file mode 100644 index 000000000..5d66535c7 --- /dev/null +++ b/backend/python/fish-speech/requirements-cublas12.txt @@ -0,0 +1,3 @@ +--extra-index-url https://download.pytorch.org/whl/cu121 +torch +torchaudio diff --git a/backend/python/fish-speech/requirements-cublas13.txt b/backend/python/fish-speech/requirements-cublas13.txt new file mode 100644 index 000000000..c367ab45c --- /dev/null +++ b/backend/python/fish-speech/requirements-cublas13.txt @@ -0,0 +1,3 @@ +--extra-index-url https://download.pytorch.org/whl/cu130 +torch +torchaudio diff --git a/backend/python/fish-speech/requirements-hipblas.txt b/backend/python/fish-speech/requirements-hipblas.txt new file mode 100644 index 000000000..81a30d412 --- /dev/null +++ b/backend/python/fish-speech/requirements-hipblas.txt @@ -0,0 +1,3 @@ +--extra-index-url https://download.pytorch.org/whl/rocm6.3 +torch==2.7.1+rocm6.3 +torchaudio==2.7.1+rocm6.3 diff --git a/backend/python/fish-speech/requirements-intel.txt b/backend/python/fish-speech/requirements-intel.txt new file mode 100644 index 000000000..15509ba77 --- /dev/null +++ b/backend/python/fish-speech/requirements-intel.txt @@ -0,0 +1,3 @@ +--extra-index-url https://download.pytorch.org/whl/xpu +torch +torchaudio diff --git a/backend/python/fish-speech/requirements-l4t12.txt b/backend/python/fish-speech/requirements-l4t12.txt new file mode 100644 index 
000000000..36fb96068 --- /dev/null +++ b/backend/python/fish-speech/requirements-l4t12.txt @@ -0,0 +1,3 @@ +--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu129/ +torch +torchaudio diff --git a/backend/python/fish-speech/requirements-l4t13.txt b/backend/python/fish-speech/requirements-l4t13.txt new file mode 100644 index 000000000..c367ab45c --- /dev/null +++ b/backend/python/fish-speech/requirements-l4t13.txt @@ -0,0 +1,3 @@ +--extra-index-url https://download.pytorch.org/whl/cu130 +torch +torchaudio diff --git a/backend/python/fish-speech/requirements-mps.txt b/backend/python/fish-speech/requirements-mps.txt new file mode 100644 index 000000000..ff5c00f19 --- /dev/null +++ b/backend/python/fish-speech/requirements-mps.txt @@ -0,0 +1,2 @@ +torch +torchaudio diff --git a/backend/python/fish-speech/requirements.txt b/backend/python/fish-speech/requirements.txt new file mode 100644 index 000000000..1be3c8250 --- /dev/null +++ b/backend/python/fish-speech/requirements.txt @@ -0,0 +1,9 @@ +grpcio==1.71.0 +protobuf +certifi +packaging==24.1 +soundfile +setuptools +six +scipy +numpy diff --git a/backend/python/fish-speech/run.sh b/backend/python/fish-speech/run.sh new file mode 100644 index 000000000..eae121f37 --- /dev/null +++ b/backend/python/fish-speech/run.sh @@ -0,0 +1,9 @@ +#!/bin/bash +backend_dir=$(dirname $0) +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +startBackend $@ diff --git a/backend/python/fish-speech/test.py b/backend/python/fish-speech/test.py new file mode 100644 index 000000000..0831c6a1b --- /dev/null +++ b/backend/python/fish-speech/test.py @@ -0,0 +1,175 @@ +""" +A test script to test the gRPC service +""" +import signal +import threading +import unittest +import subprocess +import time +import os +import sys +import tempfile +import backend_pb2 +import backend_pb2_grpc + +import grpc + +BACKEND_LOG = "/tmp/fish-speech-backend.log" + + +def 
_dump_backend_log(): + """Print backend log — call before exiting so CI always shows it.""" + if os.path.exists(BACKEND_LOG): + with open(BACKEND_LOG, "r") as f: + contents = f.read() + if contents: + print("=== Backend Log ===", file=sys.stderr, flush=True) + print(contents, file=sys.stderr, flush=True) + + +def _sigterm_handler(signum, frame): + """Handle SIGTERM so the backend log is printed before exit.""" + print(f"\nReceived signal {signum}, dumping backend log before exit...", + file=sys.stderr, flush=True) + _dump_backend_log() + sys.exit(143) + + +signal.signal(signal.SIGTERM, _sigterm_handler) + + +def _tail_log(path, stop_event, interval=10): + """Background thread that periodically prints new lines from the backend log.""" + pos = 0 + while not stop_event.is_set(): + stop_event.wait(interval) + try: + with open(path, "r") as f: + f.seek(pos) + new = f.read() + if new: + print(f"[backend log] {new}", file=sys.stderr, end="", flush=True) + pos = f.tell() + except FileNotFoundError: + pass + + +class TestBackendServicer(unittest.TestCase): + """ + TestBackendServicer is the class that tests the gRPC service + """ + def setUp(self): + """ + This method sets up the gRPC service by starting the server + """ + print("Starting backend server...", file=sys.stderr, flush=True) + self.backend_log = open(BACKEND_LOG, "w") + self.service = subprocess.Popen( + ["python3", "backend.py", "--addr", "localhost:50051"], + stdout=self.backend_log, + stderr=self.backend_log, + ) + + # Start tailing backend log so CI sees progress in real time + self._log_stop = threading.Event() + self._log_thread = threading.Thread( + target=_tail_log, args=(BACKEND_LOG, self._log_stop), daemon=True + ) + self._log_thread.start() + + # Poll for readiness instead of a fixed sleep + print("Waiting for backend to be ready...", file=sys.stderr, flush=True) + max_wait = 60 + start = time.time() + ready = False + while time.time() - start < max_wait: + try: + with 
    def tearDown(self) -> None:
        """
        This method tears down the gRPC service by terminating the server.

        Ordering matters: stop the log tailer first so it does not race the
        final log dump, then terminate the backend (escalating to SIGKILL if
        it does not exit within 5s), and finally print the full backend log
        so CI output always contains it.
        """
        # Signal the background tail thread to exit and wait briefly for it.
        self._log_stop.set()
        self._log_thread.join(timeout=2)
        # Ask the backend process to terminate gracefully.
        self.service.terminate()
        try:
            self.service.wait(timeout=5)
        except subprocess.TimeoutExpired:
            # Graceful shutdown hung; force-kill and reap the process.
            self.service.kill()
            self.service.wait()
        # Close our handle to the log file before reading it back.
        self.backend_log.close()
        _dump_backend_log()
#!/bin/bash
set -e

# Resolve the shared backend helper library relative to this script and
# source it, preferring a bundled copy over the sibling common directory.
backend_dir=$(dirname $0)
if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

runUnittests
+ tags: + - text-to-speech + - TTS + - voice-cloning + license: apache-2.0 + icon: https://huggingface.co/fishaudio/s2-pro/resolve/main/overview.png + name: "fish-speech-s2-pro" + url: "github:mudler/LocalAI/gallery/virtual.yaml@master" + overrides: + backend: fish-speech + known_usecases: + - tts + parameters: + model: fishaudio/s2-pro - &qwen-asr urls: - https://huggingface.co/Qwen/Qwen3-ASR-1.7B