From 7dc691c171b23632938cb68ab031af6716014978 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 12 Mar 2026 07:48:23 +0100 Subject: [PATCH] feat: add fish-speech backend (#8962) * feat: add fish-speech backend Signed-off-by: Ettore Di Giacinto * drop portaudio Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Ettore Di Giacinto --- .github/workflows/backend.yml | 94 ++++ .github/workflows/test-extra.yml | 22 + Makefile | 8 +- backend/Dockerfile.python | 5 + backend/index.yaml | 118 +++++ backend/python/fish-speech/Makefile | 23 + backend/python/fish-speech/backend.py | 457 ++++++++++++++++++ backend/python/fish-speech/install.sh | 51 ++ backend/python/fish-speech/package.sh | 15 + .../python/fish-speech/requirements-cpu.txt | 3 + .../fish-speech/requirements-cublas12.txt | 3 + .../fish-speech/requirements-cublas13.txt | 3 + .../fish-speech/requirements-hipblas.txt | 3 + .../python/fish-speech/requirements-intel.txt | 3 + .../python/fish-speech/requirements-l4t12.txt | 3 + .../python/fish-speech/requirements-l4t13.txt | 3 + .../python/fish-speech/requirements-mps.txt | 2 + backend/python/fish-speech/requirements.txt | 9 + backend/python/fish-speech/run.sh | 9 + backend/python/fish-speech/test.py | 175 +++++++ backend/python/fish-speech/test.sh | 11 + gallery/index.yaml | 19 + 22 files changed, 1037 insertions(+), 2 deletions(-) create mode 100644 backend/python/fish-speech/Makefile create mode 100644 backend/python/fish-speech/backend.py create mode 100644 backend/python/fish-speech/install.sh create mode 100755 backend/python/fish-speech/package.sh create mode 100644 backend/python/fish-speech/requirements-cpu.txt create mode 100644 backend/python/fish-speech/requirements-cublas12.txt create mode 100644 backend/python/fish-speech/requirements-cublas13.txt create mode 100644 backend/python/fish-speech/requirements-hipblas.txt create mode 100644 backend/python/fish-speech/requirements-intel.txt create mode 100644 
backend/python/fish-speech/requirements-l4t12.txt create mode 100644 backend/python/fish-speech/requirements-l4t13.txt create mode 100644 backend/python/fish-speech/requirements-mps.txt create mode 100644 backend/python/fish-speech/requirements.txt create mode 100644 backend/python/fish-speech/run.sh create mode 100644 backend/python/fish-speech/test.py create mode 100644 backend/python/fish-speech/test.sh diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index ee40ab3a1..030b02ef8 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -223,6 +223,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "8" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-12-fish-speech' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "fish-speech" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "12" cuda-minor-version: "8" @@ -614,6 +627,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-13-fish-speech' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "fish-speech" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "13" cuda-minor-version: "0" @@ -757,6 +783,19 @@ jobs: backend: "qwen-tts" dockerfile: "./backend/Dockerfile.python" context: "./" + - build-type: 'l4t' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/arm64' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-cuda-13-arm64-fish-speech' + runs-on: 'ubuntu-24.04-arm' + base-image: 
"ubuntu:24.04" + skip-drivers: 'false' + ubuntu-version: '2404' + backend: "fish-speech" + dockerfile: "./backend/Dockerfile.python" + context: "./" - build-type: 'l4t' cuda-major-version: "13" cuda-minor-version: "0" @@ -1201,6 +1240,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'hipblas' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-rocm-hipblas-fish-speech' + runs-on: 'arc-runner-set' + base-image: "rocm/dev-ubuntu-24.04:6.4.4" + skip-drivers: 'false' + backend: "fish-speech" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'hipblas' cuda-major-version: "" cuda-minor-version: "" @@ -1397,6 +1449,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2204' + - build-type: 'l4t' + cuda-major-version: "12" + cuda-minor-version: "0" + platforms: 'linux/arm64' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-fish-speech' + runs-on: 'ubuntu-24.04-arm' + base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0" + skip-drivers: 'true' + backend: "fish-speech" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2204' - build-type: 'l4t' cuda-major-version: "12" cuda-minor-version: "0" @@ -1567,6 +1632,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'intel' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-intel-fish-speech' + runs-on: 'arc-runner-set' + base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04" + skip-drivers: 'false' + backend: "fish-speech" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'intel' cuda-major-version: "" cuda-minor-version: "" @@ -2019,6 +2097,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' 
+ - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64,linux/arm64' + tag-latest: 'auto' + tag-suffix: '-cpu-fish-speech' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "fish-speech" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: '' cuda-major-version: "" cuda-minor-version: "" @@ -2108,6 +2199,9 @@ jobs: - backend: "qwen-tts" tag-suffix: "-metal-darwin-arm64-qwen-tts" build-type: "mps" + - backend: "fish-speech" + tag-suffix: "-metal-darwin-arm64-fish-speech" + build-type: "mps" - backend: "voxcpm" tag-suffix: "-metal-darwin-arm64-voxcpm" build-type: "mps" diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml index fee41fe7f..a254cafa5 100644 --- a/.github/workflows/test-extra.yml +++ b/.github/workflows/test-extra.yml @@ -304,6 +304,28 @@ jobs: run: | make --jobs=5 --output-sync=target -C backend/python/qwen-tts make --jobs=5 --output-sync=target -C backend/python/qwen-tts test + # TODO: s2-pro model is too large to load on CPU-only CI runners — re-enable + # when we have GPU runners or a smaller test model. 
+ # tests-fish-speech: + # runs-on: ubuntu-latest + # timeout-minutes: 45 + # steps: + # - name: Clone + # uses: actions/checkout@v6 + # with: + # submodules: true + # - name: Dependencies + # run: | + # sudo apt-get update + # sudo apt-get install -y build-essential ffmpeg portaudio19-dev + # sudo apt-get install -y ca-certificates cmake curl patch python3-pip + # # Install UV + # curl -LsSf https://astral.sh/uv/install.sh | sh + # pip install --user --no-cache-dir grpcio-tools==1.64.1 + # - name: Test fish-speech + # run: | + # make --jobs=5 --output-sync=target -C backend/python/fish-speech + # make --jobs=5 --output-sync=target -C backend/python/fish-speech test tests-qwen-asr: runs-on: ubuntu-latest steps: diff --git a/Makefile b/Makefile index 15af1d39a..fe1f94fdd 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Disable parallel execution for backend builds -.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/voxtral +.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend 
build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/fish-speech backends/voxtral GOCMD=go GOTEST=$(GOCMD) test @@ -331,6 +331,7 @@ prepare-test-extra: protogen-python $(MAKE) -C backend/python/moonshine $(MAKE) -C backend/python/pocket-tts $(MAKE) -C backend/python/qwen-tts + $(MAKE) -C backend/python/fish-speech $(MAKE) -C backend/python/faster-qwen3-tts $(MAKE) -C backend/python/qwen-asr $(MAKE) -C backend/python/nemo @@ -349,6 +350,7 @@ test-extra: prepare-test-extra $(MAKE) -C backend/python/moonshine test $(MAKE) -C backend/python/pocket-tts test $(MAKE) -C backend/python/qwen-tts test + $(MAKE) -C backend/python/fish-speech test $(MAKE) -C backend/python/faster-qwen3-tts test $(MAKE) -C backend/python/qwen-asr test $(MAKE) -C backend/python/nemo test @@ -493,6 +495,7 @@ BACKEND_VIBEVOICE = vibevoice|python|.|--progress=plain|true BACKEND_MOONSHINE = moonshine|python|.|false|true BACKEND_POCKET_TTS = pocket-tts|python|.|false|true BACKEND_QWEN_TTS = qwen-tts|python|.|false|true +BACKEND_FISH_SPEECH = fish-speech|python|.|false|true BACKEND_FASTER_QWEN3_TTS = faster-qwen3-tts|python|.|false|true BACKEND_QWEN_ASR = qwen-asr|python|.|false|true BACKEND_NEMO = nemo|python|.|false|true @@ -547,6 +550,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_VIBEVOICE))) $(eval $(call generate-docker-build-target,$(BACKEND_MOONSHINE))) $(eval $(call generate-docker-build-target,$(BACKEND_POCKET_TTS))) $(eval $(call generate-docker-build-target,$(BACKEND_QWEN_TTS))) +$(eval $(call generate-docker-build-target,$(BACKEND_FISH_SPEECH))) $(eval $(call generate-docker-build-target,$(BACKEND_FASTER_QWEN3_TTS))) $(eval $(call 
generate-docker-build-target,$(BACKEND_QWEN_ASR))) $(eval $(call generate-docker-build-target,$(BACKEND_NEMO))) @@ -559,7 +563,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_MLX_DISTRIBUTED))) docker-save-%: backend-images docker save local-ai-backend:$* -o backend-images/$*.tar -docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-voxtral docker-build-mlx-distributed +docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-voxtral docker-build-mlx-distributed ######################################################## ### Mock Backend for E2E Tests diff --git a/backend/Dockerfile.python b/backend/Dockerfile.python index 3067f670f..5d2e6171e 100644 --- a/backend/Dockerfile.python +++ b/backend/Dockerfile.python @@ -202,6 +202,11 @@ RUN mkdir -p /${BACKEND}/lib && \ TARGET_LIB_DIR="/${BACKEND}/lib" BUILD_TYPE="${BUILD_TYPE}" CUDA_MAJOR_VERSION="${CUDA_MAJOR_VERSION}" \ bash /package-gpu-libs.sh "/${BACKEND}/lib" +# Run backend-specific packaging if a package.sh exists +RUN if [ -f "/${BACKEND}/package.sh" ]; then \ + cd /${BACKEND} && bash 
package.sh; \ + fi + FROM scratch ARG BACKEND=rerankers COPY --from=builder /${BACKEND}/ / \ No newline at end of file diff --git a/backend/index.yaml b/backend/index.yaml index 392afa735..2271ad1b1 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -553,6 +553,30 @@ nvidia-l4t-cuda-12: "nvidia-l4t-qwen-tts" nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-qwen-tts" icon: https://cdn-avatars.huggingface.co/v1/production/uploads/620760a26e3b7210c2ff1943/-s1gyJfvbE1RgO5iBeNOi.png +- &fish-speech + urls: + - https://github.com/fishaudio/fish-speech + description: | + Fish Speech is a high-quality text-to-speech model supporting voice cloning via reference audio. + tags: + - text-to-speech + - TTS + - voice-cloning + license: apache-2.0 + name: "fish-speech" + alias: "fish-speech" + capabilities: + nvidia: "cuda12-fish-speech" + intel: "intel-fish-speech" + amd: "rocm-fish-speech" + nvidia-l4t: "nvidia-l4t-fish-speech" + metal: "metal-fish-speech" + default: "cpu-fish-speech" + nvidia-cuda-13: "cuda13-fish-speech" + nvidia-cuda-12: "cuda12-fish-speech" + nvidia-l4t-cuda-12: "nvidia-l4t-fish-speech" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-fish-speech" + icon: https://avatars.githubusercontent.com/u/148526220?s=200&v=4 - &faster-qwen3-tts urls: - https://github.com/andimarafioti/faster-qwen3-tts @@ -2382,6 +2406,100 @@ uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-qwen-tts" mirrors: - localai/localai-backends:master-metal-darwin-arm64-qwen-tts +## fish-speech +- !!merge <<: *fish-speech + name: "fish-speech-development" + capabilities: + nvidia: "cuda12-fish-speech-development" + intel: "intel-fish-speech-development" + amd: "rocm-fish-speech-development" + nvidia-l4t: "nvidia-l4t-fish-speech-development" + metal: "metal-fish-speech-development" + default: "cpu-fish-speech-development" + nvidia-cuda-13: "cuda13-fish-speech-development" + nvidia-cuda-12: "cuda12-fish-speech-development" + nvidia-l4t-cuda-12: 
"nvidia-l4t-fish-speech-development" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-fish-speech-development" +- !!merge <<: *fish-speech + name: "cpu-fish-speech" + uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-fish-speech" + mirrors: + - localai/localai-backends:latest-cpu-fish-speech +- !!merge <<: *fish-speech + name: "cpu-fish-speech-development" + uri: "quay.io/go-skynet/local-ai-backends:master-cpu-fish-speech" + mirrors: + - localai/localai-backends:master-cpu-fish-speech +- !!merge <<: *fish-speech + name: "cuda12-fish-speech" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-fish-speech" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-12-fish-speech +- !!merge <<: *fish-speech + name: "cuda12-fish-speech-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-fish-speech" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-12-fish-speech +- !!merge <<: *fish-speech + name: "cuda13-fish-speech" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-fish-speech" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-13-fish-speech +- !!merge <<: *fish-speech + name: "cuda13-fish-speech-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-fish-speech" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-13-fish-speech +- !!merge <<: *fish-speech + name: "intel-fish-speech" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-fish-speech" + mirrors: + - localai/localai-backends:latest-gpu-intel-fish-speech +- !!merge <<: *fish-speech + name: "intel-fish-speech-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-fish-speech" + mirrors: + - localai/localai-backends:master-gpu-intel-fish-speech +- !!merge <<: *fish-speech + name: "rocm-fish-speech" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-fish-speech" + mirrors: + - 
localai/localai-backends:latest-gpu-rocm-hipblas-fish-speech +- !!merge <<: *fish-speech + name: "rocm-fish-speech-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-fish-speech" + mirrors: + - localai/localai-backends:master-gpu-rocm-hipblas-fish-speech +- !!merge <<: *fish-speech + name: "nvidia-l4t-fish-speech" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-fish-speech" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-fish-speech +- !!merge <<: *fish-speech + name: "nvidia-l4t-fish-speech-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-fish-speech" + mirrors: + - localai/localai-backends:master-nvidia-l4t-fish-speech +- !!merge <<: *fish-speech + name: "cuda13-nvidia-l4t-arm64-fish-speech" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-fish-speech" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-fish-speech +- !!merge <<: *fish-speech + name: "cuda13-nvidia-l4t-arm64-fish-speech-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-fish-speech" + mirrors: + - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-fish-speech +- !!merge <<: *fish-speech + name: "metal-fish-speech" + uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-fish-speech" + mirrors: + - localai/localai-backends:latest-metal-darwin-arm64-fish-speech +- !!merge <<: *fish-speech + name: "metal-fish-speech-development" + uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-fish-speech" + mirrors: + - localai/localai-backends:master-metal-darwin-arm64-fish-speech ## faster-qwen3-tts - !!merge <<: *faster-qwen3-tts name: "faster-qwen3-tts-development" diff --git a/backend/python/fish-speech/Makefile b/backend/python/fish-speech/Makefile new file mode 100644 index 000000000..ace1ef3de --- /dev/null +++ b/backend/python/fish-speech/Makefile @@ -0,0 +1,23 @@ +.PHONY: fish-speech +fish-speech: + bash 
install.sh + +.PHONY: run +run: fish-speech + @echo "Running fish-speech..." + bash run.sh + @echo "fish-speech run." + +.PHONY: test +test: fish-speech + @echo "Testing fish-speech..." + bash test.sh + @echo "fish-speech tested." + +.PHONY: protogen-clean +protogen-clean: + $(RM) backend_pb2_grpc.py backend_pb2.py + +.PHONY: clean +clean: protogen-clean + rm -rf venv __pycache__ diff --git a/backend/python/fish-speech/backend.py b/backend/python/fish-speech/backend.py new file mode 100644 index 000000000..921b71efc --- /dev/null +++ b/backend/python/fish-speech/backend.py @@ -0,0 +1,457 @@ +#!/usr/bin/env python3 +""" +This is an extra gRPC server of LocalAI for fish-speech TTS +""" + +from concurrent import futures +import time +import argparse +import signal +import sys +import os +import traceback +import backend_pb2 +import backend_pb2_grpc +import torch +import soundfile as sf +import numpy as np + +import json + +import grpc + + +def is_float(s): + """Check if a string can be converted to float.""" + try: + float(s) + return True + except ValueError: + return False + + +def is_int(s): + """Check if a string can be converted to int.""" + try: + int(s) + return True + except ValueError: + return False + + +_ONE_DAY_IN_SECONDS = 60 * 60 * 24 + +# If MAX_WORKERS are specified in the environment use it, otherwise default to 1 +MAX_WORKERS = int(os.environ.get("PYTHON_GRPC_MAX_WORKERS", "1")) + + +# Implement the BackendServicer class with the service methods +class BackendServicer(backend_pb2_grpc.BackendServicer): + """ + BackendServicer is the class that implements the gRPC service + """ + + def Health(self, request, context): + return backend_pb2.Reply(message=bytes("OK", "utf-8")) + + def LoadModel(self, request, context): + try: + # Get device + if torch.cuda.is_available(): + print("CUDA is available", file=sys.stderr) + device = "cuda" + else: + print("CUDA is not available", file=sys.stderr) + device = "cpu" + mps_available = ( + hasattr(torch.backends, 
"mps") and torch.backends.mps.is_available() + ) + if mps_available: + device = "mps" + if not torch.cuda.is_available() and request.CUDA: + return backend_pb2.Result(success=False, message="CUDA is not available") + + # Validate mps availability if requested + if device == "mps" and not torch.backends.mps.is_available(): + print("Warning: MPS not available. Falling back to CPU.", file=sys.stderr) + device = "cpu" + + self.device = device + self._torch_device = torch.device(device) + + options = request.Options + + # empty dict + self.options = {} + + # The options are a list of strings in this form optname:optvalue + for opt in options: + if ":" not in opt: + continue + key, value = opt.split(":", 1) + if is_float(value): + value = float(value) + elif is_int(value): + value = int(value) + elif value.lower() in ["true", "false"]: + value = value.lower() == "true" + self.options[key] = value + + # Parse voices configuration from options + self.voices = {} + if "voices" in self.options: + try: + voices_data = self.options["voices"] + if isinstance(voices_data, str): + voices_list = json.loads(voices_data) + else: + voices_list = voices_data + + for voice_entry in voices_list: + if not isinstance(voice_entry, dict): + print( + f"[WARNING] Invalid voice entry (not a dict): {voice_entry}", + file=sys.stderr, + ) + continue + + name = voice_entry.get("name") + audio = voice_entry.get("audio") + ref_text = voice_entry.get("ref_text", "") + + if not name or not isinstance(name, str): + print( + f"[WARNING] Voice entry missing required 'name' field: {voice_entry}", + file=sys.stderr, + ) + continue + if not audio or not isinstance(audio, str): + print( + f"[WARNING] Voice entry missing required 'audio' field: {voice_entry}", + file=sys.stderr, + ) + continue + + self.voices[name] = {"audio": audio, "ref_text": ref_text} + print( + f"[INFO] Registered voice '{name}' with audio: {audio}", + file=sys.stderr, + ) + + print(f"[INFO] Loaded {len(self.voices)} voice(s)", 
file=sys.stderr) + except json.JSONDecodeError as e: + print(f"[ERROR] Failed to parse voices JSON: {e}", file=sys.stderr) + except Exception as e: + print( + f"[ERROR] Error processing voices configuration: {e}", + file=sys.stderr, + ) + print(traceback.format_exc(), file=sys.stderr) + + # Store AudioPath, ModelFile, and ModelPath from LoadModel request + self.audio_path = ( + request.AudioPath + if hasattr(request, "AudioPath") and request.AudioPath + else None + ) + self.model_file = ( + request.ModelFile + if hasattr(request, "ModelFile") and request.ModelFile + else None + ) + self.model_path = ( + request.ModelPath + if hasattr(request, "ModelPath") and request.ModelPath + else None + ) + + # Get model path from request + model_path = request.Model + if not model_path: + model_path = "fishaudio/s2-pro" + + # If model_path looks like a HuggingFace repo ID (e.g. "fishaudio/fish-speech-1.5"), + # download it locally first since fish-speech expects a local directory + if "/" in model_path and not os.path.exists(model_path): + from huggingface_hub import snapshot_download + + print( + f"Downloading model from HuggingFace: {model_path}", + file=sys.stderr, + ) + model_path = snapshot_download(repo_id=model_path) + print(f"Model downloaded to: {model_path}", file=sys.stderr) + + # Determine precision + if device in ("mps", "cpu"): + precision = torch.float32 + else: + precision = torch.bfloat16 + + # Whether to use torch.compile + compile_model = self.options.get("compile", False) + + print( + f"Using device: {device}, precision: {precision}, compile: {compile_model}", + file=sys.stderr, + ) + print(f"Loading model from: {model_path}", file=sys.stderr) + + # Import fish-speech modules + from fish_speech.inference_engine import TTSInferenceEngine + from fish_speech.models.dac.inference import load_model as load_decoder_model + from fish_speech.models.text2semantic.inference import ( + launch_thread_safe_queue, + ) + + # Determine decoder checkpoint path + # The codec 
model is typically at /codec.pth + decoder_checkpoint = self.options.get("decoder_checkpoint", None) + if not decoder_checkpoint: + # Try common locations + if os.path.isdir(model_path): + candidate = os.path.join(model_path, "codec.pth") + if os.path.exists(candidate): + decoder_checkpoint = candidate + + # Launch LLaMA queue (runs in daemon thread) + print("Launching LLaMA queue...", file=sys.stderr) + llama_queue = launch_thread_safe_queue( + checkpoint_path=model_path, + device=device, + precision=precision, + compile=compile_model, + ) + + # Load DAC decoder + decoder_config = self.options.get("decoder_config", "modded_dac_vq") + if not decoder_checkpoint: + return backend_pb2.Result( + success=False, + message="Decoder checkpoint (codec.pth) not found. " + "Ensure the model directory contains codec.pth or set " + "decoder_checkpoint option.", + ) + print( + f"Loading DAC decoder (config={decoder_config}, checkpoint={decoder_checkpoint})...", + file=sys.stderr, + ) + decoder_model = load_decoder_model( + config_name=decoder_config, + checkpoint_path=decoder_checkpoint, + device=device, + ) + + # Create TTS inference engine + self.engine = TTSInferenceEngine( + llama_queue=llama_queue, + decoder_model=decoder_model, + precision=precision, + compile=compile_model, + ) + + print(f"Model loaded successfully: {model_path}", file=sys.stderr) + + return backend_pb2.Result(message="Model loaded successfully", success=True) + + except Exception as e: + print(f"[ERROR] Loading model: {type(e).__name__}: {e}", file=sys.stderr) + print(traceback.format_exc(), file=sys.stderr) + return backend_pb2.Result( + success=False, message=f"Failed to load model: {e}" + ) + + def _get_ref_audio_path(self, voice_name=None): + """Get reference audio path from voices dict or stored AudioPath.""" + if voice_name and voice_name in self.voices: + audio_path = self.voices[voice_name]["audio"] + + if os.path.isabs(audio_path): + return audio_path + + # Try relative to ModelFile + if 
self.model_file: + model_file_base = os.path.dirname(self.model_file) + ref_path = os.path.join(model_file_base, audio_path) + if os.path.exists(ref_path): + return ref_path + + # Try relative to ModelPath + if self.model_path: + ref_path = os.path.join(self.model_path, audio_path) + if os.path.exists(ref_path): + return ref_path + + return audio_path + + # Fall back to legacy single-voice mode + if not self.audio_path: + return None + + if os.path.isabs(self.audio_path): + return self.audio_path + + if self.model_file: + model_file_base = os.path.dirname(self.model_file) + ref_path = os.path.join(model_file_base, self.audio_path) + if os.path.exists(ref_path): + return ref_path + + if self.model_path: + ref_path = os.path.join(self.model_path, self.audio_path) + if os.path.exists(ref_path): + return ref_path + + return self.audio_path + + def TTS(self, request, context): + try: + from fish_speech.utils.schema import ServeTTSRequest, ServeReferenceAudio + + if not request.dst: + return backend_pb2.Result( + success=False, message="dst (output path) is required" + ) + + text = request.text.strip() + if not text: + return backend_pb2.Result(success=False, message="Text is empty") + + # Get generation parameters from options + top_p = self.options.get("top_p", 0.8) + temperature = self.options.get("temperature", 0.8) + repetition_penalty = self.options.get("repetition_penalty", 1.1) + max_new_tokens = self.options.get("max_new_tokens", 1024) + chunk_length = self.options.get("chunk_length", 200) + + # Build references list for voice cloning + references = [] + voice_name = request.voice if request.voice else None + + if voice_name and voice_name in self.voices: + ref_audio_path = self._get_ref_audio_path(voice_name) + if ref_audio_path and os.path.exists(ref_audio_path): + with open(ref_audio_path, "rb") as f: + audio_bytes = f.read() + ref_text = self.voices[voice_name].get("ref_text", "") + references.append( + ServeReferenceAudio(audio=audio_bytes, text=ref_text) + 
) + print( + f"[INFO] Using voice '{voice_name}' with reference audio: {ref_audio_path}", + file=sys.stderr, + ) + elif self.audio_path: + ref_audio_path = self._get_ref_audio_path() + if ref_audio_path and os.path.exists(ref_audio_path): + with open(ref_audio_path, "rb") as f: + audio_bytes = f.read() + ref_text = self.options.get("ref_text", "") + references.append( + ServeReferenceAudio(audio=audio_bytes, text=ref_text) + ) + print( + f"[INFO] Using reference audio: {ref_audio_path}", + file=sys.stderr, + ) + + # Build ServeTTSRequest + tts_request = ServeTTSRequest( + text=text, + references=references, + top_p=top_p, + temperature=temperature, + repetition_penalty=repetition_penalty, + max_new_tokens=max_new_tokens, + chunk_length=chunk_length, + ) + + # Run inference + print(f"Generating speech for text: {text[:100]}...", file=sys.stderr) + start_time = time.time() + + sample_rate = None + audio_data = None + + for result in self.engine.inference(tts_request): + if result.code == "final": + sample_rate, audio_data = result.audio + elif result.code == "error": + error_msg = str(result.error) if result.error else "Unknown error" + print(f"[ERROR] TTS inference error: {error_msg}", file=sys.stderr) + return backend_pb2.Result( + success=False, message=f"TTS inference error: {error_msg}" + ) + + generation_duration = time.time() - start_time + + if audio_data is None or sample_rate is None: + return backend_pb2.Result( + success=False, message="No audio output generated" + ) + + # Ensure audio_data is a numpy array + if not isinstance(audio_data, np.ndarray): + audio_data = np.array(audio_data) + + audio_duration = len(audio_data) / sample_rate if sample_rate > 0 else 0 + print( + f"[INFO] TTS generation completed: {generation_duration:.2f}s, " + f"audio_duration={audio_duration:.2f}s, sample_rate={sample_rate}", + file=sys.stderr, + flush=True, + ) + + # Save output + sf.write(request.dst, audio_data, sample_rate) + print(f"Saved {audio_duration:.2f}s audio to 
{request.dst}", file=sys.stderr) + + except Exception as err: + print(f"Error in TTS: {err}", file=sys.stderr) + print(traceback.format_exc(), file=sys.stderr) + return backend_pb2.Result( + success=False, message=f"Unexpected {err=}, {type(err)=}" + ) + + return backend_pb2.Result(success=True) + + +def serve(address): + server = grpc.server( + futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), + options=[ + ("grpc.max_message_length", 50 * 1024 * 1024), # 50MB + ("grpc.max_send_message_length", 50 * 1024 * 1024), # 50MB + ("grpc.max_receive_message_length", 50 * 1024 * 1024), # 50MB + ], + ) + backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) + server.add_insecure_port(address) + server.start() + print("Server started. Listening on: " + address, file=sys.stderr) + + # Define the signal handler function + def signal_handler(sig, frame): + print("Received termination signal. Shutting down...") + server.stop(0) + sys.exit(0) + + # Set the signal handlers for SIGINT and SIGTERM + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + try: + while True: + time.sleep(_ONE_DAY_IN_SECONDS) + except KeyboardInterrupt: + server.stop(0) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run the gRPC server.") + parser.add_argument( + "--addr", default="localhost:50051", help="The address to bind the server to." 
#!/bin/bash
# Install script for the fish-speech backend: sets up the venv, clones the
# upstream fish-speech sources (the PyPI package lacks the inference modules),
# and installs them editable so the backend can import them.
set -e

EXTRA_PIP_INSTALL_FLAGS="--no-build-isolation"

backend_dir=$(dirname $0)
if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

# fish-speech uses pyrootutils which requires a .project-root marker
touch "${backend_dir}/.project-root"

installRequirements

# Clone fish-speech source (the pip package doesn't include inference modules)
FISH_SPEECH_DIR="${EDIR}/fish-speech-src"
FISH_SPEECH_REPO="https://github.com/fishaudio/fish-speech.git"
FISH_SPEECH_BRANCH="main"

if [ ! -d "${FISH_SPEECH_DIR}" ]; then
    echo "Cloning fish-speech source..."
    git clone --depth 1 --branch "${FISH_SPEECH_BRANCH}" "${FISH_SPEECH_REPO}" "${FISH_SPEECH_DIR}"
else
    echo "Updating fish-speech source..."
    cd "${FISH_SPEECH_DIR}" && git pull && cd -
fi

# Remove pyaudio from fish-speech deps -- it's only used by the upstream client
# tool (tools/api_client.py) for speaker playback, not by our gRPC backend
# server. It requires native portaudio libs which aren't available on all
# build environments.
sed -i.bak '/"pyaudio"/d' "${FISH_SPEECH_DIR}/pyproject.toml"

# Helper: install packages with pip or uv depending on the build configuration,
# so the pip/uv branching is written once instead of per install step.
pip_install() {
    if [ "x${USE_PIP}" == "xtrue" ]; then
        pip install "$@"
    else
        uv pip install "$@"
    fi
}

# Install fish-speech editable from source so its inference modules and
# transitive dependencies become importable by the backend.
# (Note: `pip install -e` DOES install the package itself, registering it
# in the venv as an editable install pointing at the clone.)
ensureVenv
pip_install ${EXTRA_PIP_INSTALL_FLAGS:-} -e "${FISH_SPEECH_DIR}"

# fish-speech transitive deps (wandb, tensorboard) may downgrade protobuf to 3.x
# but our generated backend_pb2.py requires protobuf 5+
pip_install "protobuf>=5.29.0"
+ +set -e + +CURDIR=$(dirname "$(realpath $0)") + +# Create lib directory +mkdir -p $CURDIR/lib + +echo "fish-speech packaging completed successfully" +ls -liah $CURDIR/lib/ diff --git a/backend/python/fish-speech/requirements-cpu.txt b/backend/python/fish-speech/requirements-cpu.txt new file mode 100644 index 000000000..5c213d676 --- /dev/null +++ b/backend/python/fish-speech/requirements-cpu.txt @@ -0,0 +1,3 @@ +--extra-index-url https://download.pytorch.org/whl/cpu +torch +torchaudio diff --git a/backend/python/fish-speech/requirements-cublas12.txt b/backend/python/fish-speech/requirements-cublas12.txt new file mode 100644 index 000000000..5d66535c7 --- /dev/null +++ b/backend/python/fish-speech/requirements-cublas12.txt @@ -0,0 +1,3 @@ +--extra-index-url https://download.pytorch.org/whl/cu121 +torch +torchaudio diff --git a/backend/python/fish-speech/requirements-cublas13.txt b/backend/python/fish-speech/requirements-cublas13.txt new file mode 100644 index 000000000..c367ab45c --- /dev/null +++ b/backend/python/fish-speech/requirements-cublas13.txt @@ -0,0 +1,3 @@ +--extra-index-url https://download.pytorch.org/whl/cu130 +torch +torchaudio diff --git a/backend/python/fish-speech/requirements-hipblas.txt b/backend/python/fish-speech/requirements-hipblas.txt new file mode 100644 index 000000000..81a30d412 --- /dev/null +++ b/backend/python/fish-speech/requirements-hipblas.txt @@ -0,0 +1,3 @@ +--extra-index-url https://download.pytorch.org/whl/rocm6.3 +torch==2.7.1+rocm6.3 +torchaudio==2.7.1+rocm6.3 diff --git a/backend/python/fish-speech/requirements-intel.txt b/backend/python/fish-speech/requirements-intel.txt new file mode 100644 index 000000000..15509ba77 --- /dev/null +++ b/backend/python/fish-speech/requirements-intel.txt @@ -0,0 +1,3 @@ +--extra-index-url https://download.pytorch.org/whl/xpu +torch +torchaudio diff --git a/backend/python/fish-speech/requirements-l4t12.txt b/backend/python/fish-speech/requirements-l4t12.txt new file mode 100644 index 
000000000..36fb96068 --- /dev/null +++ b/backend/python/fish-speech/requirements-l4t12.txt @@ -0,0 +1,3 @@ +--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu129/ +torch +torchaudio diff --git a/backend/python/fish-speech/requirements-l4t13.txt b/backend/python/fish-speech/requirements-l4t13.txt new file mode 100644 index 000000000..c367ab45c --- /dev/null +++ b/backend/python/fish-speech/requirements-l4t13.txt @@ -0,0 +1,3 @@ +--extra-index-url https://download.pytorch.org/whl/cu130 +torch +torchaudio diff --git a/backend/python/fish-speech/requirements-mps.txt b/backend/python/fish-speech/requirements-mps.txt new file mode 100644 index 000000000..ff5c00f19 --- /dev/null +++ b/backend/python/fish-speech/requirements-mps.txt @@ -0,0 +1,2 @@ +torch +torchaudio diff --git a/backend/python/fish-speech/requirements.txt b/backend/python/fish-speech/requirements.txt new file mode 100644 index 000000000..1be3c8250 --- /dev/null +++ b/backend/python/fish-speech/requirements.txt @@ -0,0 +1,9 @@ +grpcio==1.71.0 +protobuf +certifi +packaging==24.1 +soundfile +setuptools +six +scipy +numpy diff --git a/backend/python/fish-speech/run.sh b/backend/python/fish-speech/run.sh new file mode 100644 index 000000000..eae121f37 --- /dev/null +++ b/backend/python/fish-speech/run.sh @@ -0,0 +1,9 @@ +#!/bin/bash +backend_dir=$(dirname $0) +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +startBackend $@ diff --git a/backend/python/fish-speech/test.py b/backend/python/fish-speech/test.py new file mode 100644 index 000000000..0831c6a1b --- /dev/null +++ b/backend/python/fish-speech/test.py @@ -0,0 +1,175 @@ +""" +A test script to test the gRPC service +""" +import signal +import threading +import unittest +import subprocess +import time +import os +import sys +import tempfile +import backend_pb2 +import backend_pb2_grpc + +import grpc + +BACKEND_LOG = "/tmp/fish-speech-backend.log" + + +def 
_dump_backend_log(): + """Print backend log — call before exiting so CI always shows it.""" + if os.path.exists(BACKEND_LOG): + with open(BACKEND_LOG, "r") as f: + contents = f.read() + if contents: + print("=== Backend Log ===", file=sys.stderr, flush=True) + print(contents, file=sys.stderr, flush=True) + + +def _sigterm_handler(signum, frame): + """Handle SIGTERM so the backend log is printed before exit.""" + print(f"\nReceived signal {signum}, dumping backend log before exit...", + file=sys.stderr, flush=True) + _dump_backend_log() + sys.exit(143) + + +signal.signal(signal.SIGTERM, _sigterm_handler) + + +def _tail_log(path, stop_event, interval=10): + """Background thread that periodically prints new lines from the backend log.""" + pos = 0 + while not stop_event.is_set(): + stop_event.wait(interval) + try: + with open(path, "r") as f: + f.seek(pos) + new = f.read() + if new: + print(f"[backend log] {new}", file=sys.stderr, end="", flush=True) + pos = f.tell() + except FileNotFoundError: + pass + + +class TestBackendServicer(unittest.TestCase): + """ + TestBackendServicer is the class that tests the gRPC service + """ + def setUp(self): + """ + This method sets up the gRPC service by starting the server + """ + print("Starting backend server...", file=sys.stderr, flush=True) + self.backend_log = open(BACKEND_LOG, "w") + self.service = subprocess.Popen( + ["python3", "backend.py", "--addr", "localhost:50051"], + stdout=self.backend_log, + stderr=self.backend_log, + ) + + # Start tailing backend log so CI sees progress in real time + self._log_stop = threading.Event() + self._log_thread = threading.Thread( + target=_tail_log, args=(BACKEND_LOG, self._log_stop), daemon=True + ) + self._log_thread.start() + + # Poll for readiness instead of a fixed sleep + print("Waiting for backend to be ready...", file=sys.stderr, flush=True) + max_wait = 60 + start = time.time() + ready = False + while time.time() - start < max_wait: + try: + with 
    def tearDown(self) -> None:
        """
        This method tears down the gRPC service by terminating the server.

        Ordering matters: stop the log tailer first so it does not race the
        final log dump, then terminate the backend (escalating to SIGKILL if
        it does not exit within 5s), and finally print the full backend log
        so CI output always contains it.
        """
        # Signal the background tail thread to exit and wait briefly for it.
        self._log_stop.set()
        self._log_thread.join(timeout=2)
        # Ask the backend process to terminate gracefully.
        self.service.terminate()
        try:
            self.service.wait(timeout=5)
        except subprocess.TimeoutExpired:
            # Graceful shutdown hung; force-kill and reap the process.
            self.service.kill()
            self.service.wait()
        # Close our handle to the log file before reading it back.
        self.backend_log.close()
        _dump_backend_log()
#!/bin/bash
set -e

# Resolve the shared backend helper library relative to this script and
# source it, preferring a bundled copy over the sibling common directory.
backend_dir=$(dirname $0)
if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

runUnittests
+ tags: + - text-to-speech + - TTS + - voice-cloning + license: apache-2.0 + icon: https://huggingface.co/fishaudio/s2-pro/resolve/main/overview.png + name: "fish-speech-s2-pro" + url: "github:mudler/LocalAI/gallery/virtual.yaml@master" + overrides: + backend: fish-speech + known_usecases: + - tts + parameters: + model: fishaudio/s2-pro - &qwen-asr urls: - https://huggingface.co/Qwen/Qwen3-ASR-1.7B