feat: add VoxCPM tts backend (#8109)
* feat: add VoxCPM tts backend
  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* Disable voxcpm on arm64 cpu
  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---------
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Committed by GitHub
parent cba8ef4e38
commit 9b973b79f6
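Quick usage sketch (not part of the diff below): once a LocalAI build ships with this backend installed, speech can be requested through LocalAI's /tts endpoint. The host, port, and model name here are illustrative assumptions, not values taken from this commit.

import requests

# Hypothetical smoke test against a running LocalAI instance; POST /tts
# returns the generated audio bytes directly in the response body.
resp = requests.post(
    "http://localhost:8080/tts",
    json={
        "model": "voxcpm",
        "input": "VoxCPM is an innovative end-to-end TTS model from ModelBest.",
    },
    timeout=600,
)
resp.raise_for_status()
with open("out.wav", "wb") as f:
    f.write(resp.content)  # WAV produced by the voxcpm backend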
.github/workflows/backend.yml (vendored): 65 changes
@@ -118,6 +118,19 @@ jobs:
           dockerfile: "./backend/Dockerfile.python"
           context: "./"
           ubuntu-version: '2404'
+        - build-type: 'cublas'
+          cuda-major-version: "12"
+          cuda-minor-version: "9"
+          platforms: 'linux/amd64'
+          tag-latest: 'auto'
+          tag-suffix: '-gpu-nvidia-cuda-12-voxcpm'
+          runs-on: 'ubuntu-latest'
+          base-image: "ubuntu:24.04"
+          skip-drivers: 'false'
+          backend: "voxcpm"
+          dockerfile: "./backend/Dockerfile.python"
+          context: "./"
+          ubuntu-version: '2404'
         - build-type: 'cublas'
           cuda-major-version: "12"
           cuda-minor-version: "9"
@@ -366,6 +379,19 @@ jobs:
           dockerfile: "./backend/Dockerfile.python"
           context: "./"
           ubuntu-version: '2404'
+        - build-type: 'cublas'
+          cuda-major-version: "13"
+          cuda-minor-version: "0"
+          platforms: 'linux/amd64'
+          tag-latest: 'auto'
+          tag-suffix: '-gpu-nvidia-cuda-13-voxcpm'
+          runs-on: 'ubuntu-latest'
+          base-image: "ubuntu:24.04"
+          skip-drivers: 'false'
+          backend: "voxcpm"
+          dockerfile: "./backend/Dockerfile.python"
+          context: "./"
+          ubuntu-version: '2404'
         - build-type: 'cublas'
           cuda-major-version: "13"
           cuda-minor-version: "0"
@@ -719,6 +745,19 @@ jobs:
           dockerfile: "./backend/Dockerfile.python"
           context: "./"
           ubuntu-version: '2404'
+        - build-type: 'hipblas'
+          cuda-major-version: ""
+          cuda-minor-version: ""
+          platforms: 'linux/amd64'
+          tag-latest: 'auto'
+          tag-suffix: '-gpu-rocm-hipblas-voxcpm'
+          runs-on: 'arc-runner-set'
+          base-image: "rocm/dev-ubuntu-24.04:6.4.4"
+          skip-drivers: 'false'
+          backend: "voxcpm"
+          dockerfile: "./backend/Dockerfile.python"
+          context: "./"
+          ubuntu-version: '2404'
         - build-type: 'hipblas'
           cuda-major-version: ""
           cuda-minor-version: ""
@@ -942,6 +981,19 @@ jobs:
           dockerfile: "./backend/Dockerfile.python"
           context: "./"
           ubuntu-version: '2404'
+        - build-type: 'intel'
+          cuda-major-version: ""
+          cuda-minor-version: ""
+          platforms: 'linux/amd64'
+          tag-latest: 'auto'
+          tag-suffix: '-gpu-intel-voxcpm'
+          runs-on: 'arc-runner-set'
+          base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
+          skip-drivers: 'false'
+          backend: "voxcpm"
+          dockerfile: "./backend/Dockerfile.python"
+          context: "./"
+          ubuntu-version: '2404'
         - build-type: 'intel'
           cuda-major-version: ""
           cuda-minor-version: ""
@@ -1341,6 +1393,19 @@ jobs:
           dockerfile: "./backend/Dockerfile.python"
           context: "./"
           ubuntu-version: '2404'
+        - build-type: ''
+          cuda-major-version: ""
+          cuda-minor-version: ""
+          platforms: 'linux/amd64'
+          tag-latest: 'auto'
+          tag-suffix: '-cpu-voxcpm'
+          runs-on: 'ubuntu-latest'
+          base-image: "ubuntu:24.04"
+          skip-drivers: 'false'
+          backend: "voxcpm"
+          dockerfile: "./backend/Dockerfile.python"
+          context: "./"
+          ubuntu-version: '2404'
         - build-type: ''
           cuda-major-version: ""
           cuda-minor-version: ""
.github/workflows/test-extra.yml (vendored): 38 changes
@@ -304,22 +304,22 @@ jobs:
       run: |
         make --jobs=5 --output-sync=target -C backend/python/qwen-tts
         make --jobs=5 --output-sync=target -C backend/python/qwen-tts test
-  # tests-vibevoice:
-  #   runs-on: bigger-runner
-  #   steps:
-  #     - name: Clone
-  #       uses: actions/checkout@v6
-  #       with:
-  #         submodules: true
-  #     - name: Dependencies
-  #       run: |
-  #         sudo apt-get update
-  #         sudo apt-get install -y build-essential ffmpeg
-  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip wget
-  #         # Install UV
-  #         curl -LsSf https://astral.sh/uv/install.sh | sh
-  #         pip install --user --no-cache-dir --break-system-packages grpcio-tools==1.64.1
-  #     - name: Test vibevoice
-  #       run: |
-  #         make --jobs=5 --output-sync=target -C backend/python/vibevoice
-  #         make --jobs=5 --output-sync=target -C backend/python/vibevoice test
+  tests-voxcpm:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v6
+        with:
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          pip install --user --no-cache-dir grpcio-tools==1.64.1
+      - name: Test voxcpm
+        run: |
+          make --jobs=5 --output-sync=target -C backend/python/voxcpm
+          make --jobs=5 --output-sync=target -C backend/python/voxcpm test
Makefile: 8 changes
@@ -1,5 +1,5 @@
 # Disable parallel execution for backend builds
-.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts
+.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/voxcpm
 
 GOCMD=go
 GOTEST=$(GOCMD) test
@@ -319,6 +319,7 @@ prepare-test-extra: protogen-python
 	$(MAKE) -C backend/python/moonshine
 	$(MAKE) -C backend/python/pocket-tts
 	$(MAKE) -C backend/python/qwen-tts
+	$(MAKE) -C backend/python/voxcpm
 
 test-extra: prepare-test-extra
 	$(MAKE) -C backend/python/transformers test
@@ -330,6 +331,7 @@ test-extra: prepare-test-extra
 	$(MAKE) -C backend/python/moonshine test
 	$(MAKE) -C backend/python/pocket-tts test
 	$(MAKE) -C backend/python/qwen-tts test
+	$(MAKE) -C backend/python/voxcpm test
 
 DOCKER_IMAGE?=local-ai
 DOCKER_AIO_IMAGE?=local-ai-aio
@@ -462,6 +464,7 @@ BACKEND_VIBEVOICE = vibevoice|python|.|--progress=plain|true
 BACKEND_MOONSHINE = moonshine|python|.|false|true
 BACKEND_POCKET_TTS = pocket-tts|python|.|false|true
 BACKEND_QWEN_TTS = qwen-tts|python|.|false|true
+BACKEND_VOXCPM = voxcpm|python|.|false|true
 
 # Helper function to build docker image for a backend
 # Usage: $(call docker-build-backend,BACKEND_NAME,DOCKERFILE_TYPE,BUILD_CONTEXT,PROGRESS_FLAG,NEEDS_BACKEND_ARG)
@@ -507,12 +510,13 @@ $(eval $(call generate-docker-build-target,$(BACKEND_VIBEVOICE)))
 $(eval $(call generate-docker-build-target,$(BACKEND_MOONSHINE)))
 $(eval $(call generate-docker-build-target,$(BACKEND_POCKET_TTS)))
 $(eval $(call generate-docker-build-target,$(BACKEND_QWEN_TTS)))
+$(eval $(call generate-docker-build-target,$(BACKEND_VOXCPM)))
 
 # Pattern rule for docker-save targets
 docker-save-%: backend-images
 	docker save local-ai-backend:$* -o backend-images/$*.tar
 
-docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts
+docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-voxcpm
 
 ########################################################
 ### END Backends
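The BACKEND_VOXCPM line above packs the arguments of the docker-build-backend helper into one pipe-separated descriptor. The real expansion happens in Make via $(call docker-build-backend,...); this Python split is only an illustration of how the fields map onto the names in the usage comment:

# Illustrative decoding of the Makefile's backend descriptor (hypothetical
# helper; the actual logic is the Make macro, not Python).
descriptor = "voxcpm|python|.|false|true"
name, dockerfile_type, build_context, progress_flag, needs_backend_arg = descriptor.split("|")
print(name, dockerfile_type, build_context, progress_flag, needs_backend_arg)
# voxcpm python . false true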
@@ -414,6 +414,25 @@
     nvidia-l4t-cuda-12: "nvidia-l4t-qwen-tts"
     nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-qwen-tts"
   icon: https://cdn-avatars.huggingface.co/v1/production/uploads/620760a26e3b7210c2ff1943/-s1gyJfvbE1RgO5iBeNOi.png
+- &voxcpm
+  urls:
+    - https://github.com/ModelBest/VoxCPM
+  description: |
+    VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech.
+  tags:
+    - text-to-speech
+    - TTS
+  license: mit
+  name: "voxcpm"
+  alias: "voxcpm"
+  capabilities:
+    nvidia: "cuda12-voxcpm"
+    intel: "intel-voxcpm"
+    amd: "rocm-voxcpm"
+    default: "cpu-voxcpm"
+    nvidia-cuda-13: "cuda13-voxcpm"
+    nvidia-cuda-12: "cuda12-voxcpm"
+  icon: https://avatars.githubusercontent.com/u/6154722?s=200&v=4
 - &pocket-tts
   urls:
     - https://github.com/kyutai-labs/pocket-tts
@@ -1652,6 +1671,66 @@
     uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-qwen-tts"
     mirrors:
       - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-qwen-tts
+## voxcpm
+- !!merge <<: *voxcpm
+  name: "voxcpm-development"
+  capabilities:
+    nvidia: "cuda12-voxcpm-development"
+    intel: "intel-voxcpm-development"
+    amd: "rocm-voxcpm-development"
+    default: "cpu-voxcpm-development"
+    nvidia-cuda-13: "cuda13-voxcpm-development"
+    nvidia-cuda-12: "cuda12-voxcpm-development"
+- !!merge <<: *voxcpm
+  name: "cpu-voxcpm"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-voxcpm"
+  mirrors:
+    - localai/localai-backends:latest-cpu-voxcpm
+- !!merge <<: *voxcpm
+  name: "cpu-voxcpm-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-voxcpm"
+  mirrors:
+    - localai/localai-backends:master-cpu-voxcpm
+- !!merge <<: *voxcpm
+  name: "cuda12-voxcpm"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-voxcpm"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-12-voxcpm
+- !!merge <<: *voxcpm
+  name: "cuda12-voxcpm-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-voxcpm"
+  mirrors:
+    - localai/localai-backends:master-gpu-nvidia-cuda-12-voxcpm
+- !!merge <<: *voxcpm
+  name: "cuda13-voxcpm"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-voxcpm"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-13-voxcpm
+- !!merge <<: *voxcpm
+  name: "cuda13-voxcpm-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-voxcpm"
+  mirrors:
+    - localai/localai-backends:master-gpu-nvidia-cuda-13-voxcpm
+- !!merge <<: *voxcpm
+  name: "intel-voxcpm"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-voxcpm"
+  mirrors:
+    - localai/localai-backends:latest-gpu-intel-voxcpm
+- !!merge <<: *voxcpm
+  name: "intel-voxcpm-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-voxcpm"
+  mirrors:
+    - localai/localai-backends:master-gpu-intel-voxcpm
+- !!merge <<: *voxcpm
+  name: "rocm-voxcpm"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-voxcpm"
+  mirrors:
+    - localai/localai-backends:latest-gpu-rocm-hipblas-voxcpm
+- !!merge <<: *voxcpm
+  name: "rocm-voxcpm-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-voxcpm"
+  mirrors:
+    - localai/localai-backends:master-gpu-rocm-hipblas-voxcpm
 ## pocket-tts
 - !!merge <<: *pocket-tts
   name: "pocket-tts-development"
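The capabilities map in each gallery entry is what lets LocalAI pick a concrete backend image for the detected hardware, falling back to the CPU build when no accelerator matches. A toy Python lookup illustrates the semantics; this resolver is hypothetical (the real selection logic lives in LocalAI's Go code):

# Toy capability-based image selection (hypothetical helper).
entry = {
    "name": "voxcpm",
    "capabilities": {
        "nvidia": "cuda12-voxcpm",
        "intel": "intel-voxcpm",
        "amd": "rocm-voxcpm",
        "default": "cpu-voxcpm",
        "nvidia-cuda-13": "cuda13-voxcpm",
        "nvidia-cuda-12": "cuda12-voxcpm",
    },
}

def resolve_backend(entry, detected_capability):
    caps = entry["capabilities"]
    # Fall back to the CPU image when nothing matches the detected hardware
    return caps.get(detected_capability, caps["default"])

print(resolve_backend(entry, "nvidia-cuda-13"))  # cuda13-voxcpm
print(resolve_backend(entry, "none"))            # cpu-voxcpm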
backend/python/voxcpm/Makefile (new file): 23 lines
@@ -0,0 +1,23 @@
.PHONY: voxcpm
voxcpm:
	bash install.sh

.PHONY: run
run: voxcpm
	@echo "Running voxcpm..."
	bash run.sh
	@echo "voxcpm run."

.PHONY: test
test: voxcpm
	@echo "Testing voxcpm..."
	bash test.sh
	@echo "voxcpm tested."

.PHONY: protogen-clean
protogen-clean:
	$(RM) backend_pb2_grpc.py backend_pb2.py

.PHONY: clean
clean: protogen-clean
	rm -rf venv __pycache__
backend/python/voxcpm/backend.py (new file): 245 lines
@@ -0,0 +1,245 @@
#!/usr/bin/env python3
"""
This is an extra gRPC server of LocalAI for VoxCPM
"""
from concurrent import futures
import time
import argparse
import signal
import sys
import os
import traceback
import numpy as np
import soundfile as sf
from voxcpm import VoxCPM

import backend_pb2
import backend_pb2_grpc
import torch

import grpc


def is_float(s):
    """Check if a string can be converted to float."""
    try:
        float(s)
        return True
    except ValueError:
        return False

def is_int(s):
    """Check if a string can be converted to int."""
    try:
        int(s)
        return True
    except ValueError:
        return False

_ONE_DAY_IN_SECONDS = 60 * 60 * 24

# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))

# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
    """
    BackendServicer is the class that implements the gRPC service
    """
    def Health(self, request, context):
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))

    def LoadModel(self, request, context):
        # Get device
        if torch.cuda.is_available():
            print("CUDA is available", file=sys.stderr)
            device = "cuda"
        else:
            print("CUDA is not available", file=sys.stderr)
            device = "cpu"
        mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
        if mps_available:
            device = "mps"
        if not torch.cuda.is_available() and request.CUDA:
            return backend_pb2.Result(success=False, message="CUDA is not available")

        # Normalize potential 'mpx' typo to 'mps'
        if device == "mpx":
            print("Note: device 'mpx' detected, treating it as 'mps'.", file=sys.stderr)
            device = "mps"

        # Validate mps availability if requested
        if device == "mps" and not torch.backends.mps.is_available():
            print("Warning: MPS not available. Falling back to CPU.", file=sys.stderr)
            device = "cpu"

        self.device = device

        options = request.Options

        # empty dict
        self.options = {}

        # The options are a list of strings in this form optname:optvalue
        # We are storing all the options in a dict so we can use it later when
        # generating the audio
        for opt in options:
            if ":" not in opt:
                continue
            key, value = opt.split(":", 1)  # Split only on first colon
            # if value is a number, convert it to the appropriate type
            if is_float(value):
                value = float(value)
            elif is_int(value):
                value = int(value)
            elif value.lower() in ["true", "false"]:
                value = value.lower() == "true"
            self.options[key] = value

        # Get model path from request
        model_path = request.Model
        if not model_path:
            model_path = "openbmb/VoxCPM1.5"

        try:
            print(f"Loading model from {model_path}", file=sys.stderr)
            self.model = VoxCPM.from_pretrained(model_path)
            print(f"Model loaded successfully on device: {self.device}", file=sys.stderr)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")

        return backend_pb2.Result(message="Model loaded successfully", success=True)

    def TTS(self, request, context):
        try:
            # Get generation parameters from options with defaults
            cfg_value = self.options.get("cfg_value", 2.0)
            inference_timesteps = self.options.get("inference_timesteps", 10)
            normalize = self.options.get("normalize", False)
            denoise = self.options.get("denoise", False)
            retry_badcase = self.options.get("retry_badcase", True)
            retry_badcase_max_times = self.options.get("retry_badcase_max_times", 3)
            retry_badcase_ratio_threshold = self.options.get("retry_badcase_ratio_threshold", 6.0)
            use_streaming = self.options.get("streaming", False)

            # Handle voice cloning via prompt_wav_path and prompt_text
            prompt_wav_path = None
            prompt_text = None

            # Priority: request.voice > AudioPath > options
            if hasattr(request, 'voice') and request.voice:
                # If voice is provided, try to use it as a path
                if os.path.exists(request.voice):
                    prompt_wav_path = request.voice
                elif hasattr(request, 'ModelFile') and request.ModelFile:
                    model_file_base = os.path.dirname(request.ModelFile)
                    potential_path = os.path.join(model_file_base, request.voice)
                    if os.path.exists(potential_path):
                        prompt_wav_path = potential_path
                elif hasattr(request, 'ModelPath') and request.ModelPath:
                    potential_path = os.path.join(request.ModelPath, request.voice)
                    if os.path.exists(potential_path):
                        prompt_wav_path = potential_path

            if hasattr(request, 'AudioPath') and request.AudioPath:
                if os.path.isabs(request.AudioPath):
                    prompt_wav_path = request.AudioPath
                elif hasattr(request, 'ModelFile') and request.ModelFile:
                    model_file_base = os.path.dirname(request.ModelFile)
                    prompt_wav_path = os.path.join(model_file_base, request.AudioPath)
                elif hasattr(request, 'ModelPath') and request.ModelPath:
                    prompt_wav_path = os.path.join(request.ModelPath, request.AudioPath)
                else:
                    prompt_wav_path = request.AudioPath

            # Get prompt_text from options if available
            if "prompt_text" in self.options:
                prompt_text = self.options["prompt_text"]

            # Prepare text
            text = request.text.strip()

            print(f"Generating audio with cfg_value: {cfg_value}, inference_timesteps: {inference_timesteps}, streaming: {use_streaming}", file=sys.stderr)

            # Generate audio
            if use_streaming:
                # Streaming generation
                chunks = []
                for chunk in self.model.generate_streaming(
                    text=text,
                    prompt_wav_path=prompt_wav_path,
                    prompt_text=prompt_text,
                    cfg_value=cfg_value,
                    inference_timesteps=inference_timesteps,
                    normalize=normalize,
                    denoise=denoise,
                    retry_badcase=retry_badcase,
                    retry_badcase_max_times=retry_badcase_max_times,
                    retry_badcase_ratio_threshold=retry_badcase_ratio_threshold,
                ):
                    chunks.append(chunk)
                wav = np.concatenate(chunks)
            else:
                # Non-streaming generation
                wav = self.model.generate(
                    text=text,
                    prompt_wav_path=prompt_wav_path,
                    prompt_text=prompt_text,
                    cfg_value=cfg_value,
                    inference_timesteps=inference_timesteps,
                    normalize=normalize,
                    denoise=denoise,
                    retry_badcase=retry_badcase,
                    retry_badcase_max_times=retry_badcase_max_times,
                    retry_badcase_ratio_threshold=retry_badcase_ratio_threshold,
                )

            # Get sample rate from model
            sample_rate = self.model.tts_model.sample_rate

            # Save output
            sf.write(request.dst, wav, sample_rate)
            print(f"Saved output to {request.dst}", file=sys.stderr)

        except Exception as err:
            print(f"Error in TTS: {err}", file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")

        return backend_pb2.Result(success=True)

def serve(address):
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
                         options=[
                             ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
                             ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
                             ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
                         ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)

    # Define the signal handler function
    def signal_handler(sig, frame):
        print("Received termination signal. Shutting down...")
        server.stop(0)
        sys.exit(0)

    # Set the signal handlers for SIGINT and SIGTERM
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the gRPC server.")
    parser.add_argument(
        "--addr", default="localhost:50051", help="The address to bind the server to."
    )
    args = parser.parse_args()

    serve(args.addr)
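The unit test later in this commit exercises the same calls, but for reference, here is a minimal sketch of driving this servicer directly over gRPC. It assumes backend.py is already running on localhost:50051; the reference clip path and the option values are illustrative, not taken from the commit.

import grpc
import backend_pb2
import backend_pb2_grpc

# Minimal client sketch for the servicer above.
with grpc.insecure_channel("localhost:50051") as channel:
    stub = backend_pb2_grpc.BackendStub(channel)
    stub.LoadModel(backend_pb2.ModelOptions(
        Model="openbmb/VoxCPM1.5",
        Options=["cfg_value:2.0", "denoise:true", "prompt_text:a reference transcript"],
    ))
    # voice is resolved through the priority chain in TTS(): an existing
    # filesystem path wins, so ./speaker.wav enables voice cloning here.
    stub.TTS(backend_pb2.TTSRequest(
        text="Hello from VoxCPM.",
        voice="./speaker.wav",
        dst="cloned.wav",
    ))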
backend/python/voxcpm/install.sh (new executable file): 30 lines
@@ -0,0 +1,30 @@
#!/bin/bash
set -e

backend_dir=$(dirname $0)
if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

installRequirements

# Apply patch to fix PyTorch compatibility issue in voxcpm
# This fixes the "Dimension out of range" error in scaled_dot_product_attention
# by changing .contiguous() to .unsqueeze(0) in the attention module
# The patch is needed because voxcpm's initialization test generation fails with
# certain PyTorch versions due to a bug in scaled_dot_product_attention
# https://github.com/OpenBMB/VoxCPM/issues/71#issuecomment-3441789452
VOXCPM_PATH=$(python -c "import voxcpm; import os; print(os.path.dirname(voxcpm.__file__))" 2>/dev/null || echo "")
if [ -n "$VOXCPM_PATH" ] && [ -f "$VOXCPM_PATH/modules/minicpm4/model.py" ]; then
    echo "Applying patch to voxcpm at $VOXCPM_PATH/modules/minicpm4/model.py"
    # Replace .contiguous() with .unsqueeze(0) for the three lines in the attention forward_step method
    # This fixes the dimension error in scaled_dot_product_attention
    sed -i 's/query_states = query_states\.contiguous()/query_states = query_states.unsqueeze(0)/g' "$VOXCPM_PATH/modules/minicpm4/model.py"
    sed -i 's/key_cache = key_cache\.contiguous()/key_cache = key_cache.unsqueeze(0)/g' "$VOXCPM_PATH/modules/minicpm4/model.py"
    sed -i 's/value_cache = value_cache\.contiguous()/value_cache = value_cache.unsqueeze(0)/g' "$VOXCPM_PATH/modules/minicpm4/model.py"
    echo "Patch applied successfully"
else
    echo "Warning: Could not find voxcpm installation to apply patch (path: ${VOXCPM_PATH:-not found})"
fi
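To see why the sed substitution works: some fused scaled_dot_product_attention kernels require batched 4-D inputs and raise dimension errors on 3-D tensors, and unsqueeze(0) supplies the missing batch dimension. A standalone sketch; the tensor shapes are assumptions inferred from the comments above, not taken from VoxCPM's source:

import torch
import torch.nn.functional as F

# Assumed unbatched shapes: (num_heads, seq_len, head_dim)
q = torch.randn(8, 16, 64)
k = torch.randn(8, 16, 64)
v = torch.randn(8, 16, 64)

# .unsqueeze(0) adds the batch dimension: (1, num_heads, seq_len, head_dim),
# which is what the patched code feeds to SDPA instead of .contiguous() tensors
out = F.scaled_dot_product_attention(q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0))
print(out.shape)  # torch.Size([1, 8, 16, 64])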
backend/python/voxcpm/protogen.sh (new executable file): 11 lines
@@ -0,0 +1,11 @@
#!/bin/bash
set -e

backend_dir=$(dirname $0)
if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

runProtogen
backend/python/voxcpm/requirements-cpu.txt (new file): 6 lines
@@ -0,0 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cpu
torch
soundfile
numpy
voxcpm
torchcodec
backend/python/voxcpm/requirements-cublas12.txt (new file): 5 lines
@@ -0,0 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/cu121
torch
soundfile
numpy
voxcpm
backend/python/voxcpm/requirements-cublas13.txt (new file): 5 lines
@@ -0,0 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/cu130
torch
soundfile
numpy
voxcpm
backend/python/voxcpm/requirements-hipblas.txt (new file): 5 lines
@@ -0,0 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.3
torch==2.7.1+rocm6.3
soundfile
numpy
voxcpm
backend/python/voxcpm/requirements-intel.txt (new file): 6 lines
@@ -0,0 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/xpu
torch
setuptools
soundfile
numpy
voxcpm
backend/python/voxcpm/requirements-l4t12.txt (new file): 5 lines
@@ -0,0 +1,5 @@
--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu129/
torch
soundfile
numpy
voxcpm
backend/python/voxcpm/requirements-l4t13.txt (new file): 5 lines
@@ -0,0 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/cu130
torch
soundfile
numpy
voxcpm
backend/python/voxcpm/requirements-mps.txt (new file): 4 lines
@@ -0,0 +1,4 @@
torch
soundfile
numpy
voxcpm
backend/python/voxcpm/requirements.txt (new file): 7 lines
@@ -0,0 +1,7 @@
grpcio==1.76.0
protobuf
certifi
packaging==24.1
soundfile
numpy
voxcpm
backend/python/voxcpm/run.sh (new executable file): 9 lines
@@ -0,0 +1,9 @@
#!/bin/bash
backend_dir=$(dirname $0)
if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

startBackend $@
backend/python/voxcpm/test.py (new file): 51 lines
@@ -0,0 +1,51 @@
"""
A test script to test the gRPC service
"""
import unittest
import subprocess
import time
import backend_pb2
import backend_pb2_grpc

import grpc


class TestBackendServicer(unittest.TestCase):
    """
    TestBackendServicer is the class that tests the gRPC service
    """
    def setUp(self):
        """
        This method sets up the gRPC service by starting the server
        """
        self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
        time.sleep(30)

    def tearDown(self) -> None:
        """
        This method tears down the gRPC service by terminating the server
        """
        self.service.terminate()
        self.service.wait()

    def test_load_model(self):
        """
        This method tests if the model is loaded successfully
        """
        try:
            self.setUp()
            print("Starting test_load_model")
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.LoadModel(backend_pb2.ModelOptions(Model="openbmb/VoxCPM1.5"))
                print(response)
                self.assertTrue(response.success)
                self.assertEqual(response.message, "Model loaded successfully")
                tts_request = backend_pb2.TTSRequest(text="VoxCPM is an innovative end-to-end TTS model from ModelBest.", dst="test.wav")
                tts_response = stub.TTS(tts_request)
                self.assertIsNotNone(tts_response)
        except Exception as err:
            print(err)
            self.fail("LoadModel service failed")
        finally:
            self.tearDown()
backend/python/voxcpm/test.sh (new executable file): 11 lines
@@ -0,0 +1,11 @@
#!/bin/bash
set -e

backend_dir=$(dirname $0)
if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

runUnittests
@@ -491,6 +491,8 @@ func (re *RequestExtractor) SetOpenResponsesRequest(c echo.Context) error {
 		return echo.ErrBadRequest
 	}
 
+	// Convert input items to Messages (this will be done in the endpoint handler)
+	// We store the input in the request for the endpoint to process
 	cfg, ok := c.Get(CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
 	if !ok || cfg == nil {
 		return echo.ErrBadRequest