From 9b973b79f6fcbc7423c3f970aab0c395f7d23c1e Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 28 Jan 2026 14:44:04 +0100 Subject: [PATCH] feat: add VoxCPM tts backend (#8109) * feat: add VoxCPM tts backend Signed-off-by: Ettore Di Giacinto * Disable voxcpm on arm64 cpu Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Ettore Di Giacinto --- .github/workflows/backend.yml | 65 +++++ .github/workflows/test-extra.yml | 38 +-- Makefile | 8 +- backend/index.yaml | 79 ++++++ backend/python/voxcpm/Makefile | 23 ++ backend/python/voxcpm/backend.py | 245 ++++++++++++++++++ backend/python/voxcpm/install.sh | 30 +++ backend/python/voxcpm/protogen.sh | 11 + backend/python/voxcpm/requirements-cpu.txt | 6 + .../python/voxcpm/requirements-cublas12.txt | 5 + .../python/voxcpm/requirements-cublas13.txt | 5 + .../python/voxcpm/requirements-hipblas.txt | 5 + backend/python/voxcpm/requirements-intel.txt | 6 + backend/python/voxcpm/requirements-l4t12.txt | 5 + backend/python/voxcpm/requirements-l4t13.txt | 5 + backend/python/voxcpm/requirements-mps.txt | 4 + backend/python/voxcpm/requirements.txt | 7 + backend/python/voxcpm/run.sh | 9 + backend/python/voxcpm/test.py | 51 ++++ backend/python/voxcpm/test.sh | 11 + core/http/middleware/request.go | 2 + 21 files changed, 599 insertions(+), 21 deletions(-) create mode 100644 backend/python/voxcpm/Makefile create mode 100644 backend/python/voxcpm/backend.py create mode 100755 backend/python/voxcpm/install.sh create mode 100755 backend/python/voxcpm/protogen.sh create mode 100644 backend/python/voxcpm/requirements-cpu.txt create mode 100644 backend/python/voxcpm/requirements-cublas12.txt create mode 100644 backend/python/voxcpm/requirements-cublas13.txt create mode 100644 backend/python/voxcpm/requirements-hipblas.txt create mode 100644 backend/python/voxcpm/requirements-intel.txt create mode 100644 backend/python/voxcpm/requirements-l4t12.txt create mode 100644 backend/python/voxcpm/requirements-l4t13.txt create mode 100644 backend/python/voxcpm/requirements-mps.txt create mode 100644 backend/python/voxcpm/requirements.txt create mode 100755 backend/python/voxcpm/run.sh create mode 100644 backend/python/voxcpm/test.py create mode 100755 backend/python/voxcpm/test.sh diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index 6389e8988..a7813393e 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -118,6 +118,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "9" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-12-voxcpm' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "voxcpm" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "12" cuda-minor-version: "9" @@ -366,6 +379,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-13-voxcpm' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "voxcpm" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "13" cuda-minor-version: "0" @@ -719,6 +745,19 @@ jobs: dockerfile: 
"./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'hipblas' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-rocm-hipblas-voxcpm' + runs-on: 'arc-runner-set' + base-image: "rocm/dev-ubuntu-24.04:6.4.4" + skip-drivers: 'false' + backend: "voxcpm" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'hipblas' cuda-major-version: "" cuda-minor-version: "" @@ -942,6 +981,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'intel' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-intel-voxcpm' + runs-on: 'arc-runner-set' + base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04" + skip-drivers: 'false' + backend: "voxcpm" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'intel' cuda-major-version: "" cuda-minor-version: "" @@ -1341,6 +1393,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-cpu-voxcpm' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "voxcpm" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: '' cuda-major-version: "" cuda-minor-version: "" diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml index abb8fa376..945beafb5 100644 --- a/.github/workflows/test-extra.yml +++ b/.github/workflows/test-extra.yml @@ -304,22 +304,22 @@ jobs: run: | make --jobs=5 --output-sync=target -C backend/python/qwen-tts make --jobs=5 --output-sync=target -C backend/python/qwen-tts test - # tests-vibevoice: - # runs-on: bigger-runner - # steps: - # - name: Clone - # uses: actions/checkout@v6 - # with: - # submodules: true - # - name: Dependencies - # run: | - # sudo apt-get update - # sudo apt-get install -y build-essential ffmpeg - # sudo apt-get install -y ca-certificates cmake curl patch python3-pip wget - # # Install UV - # curl -LsSf https://astral.sh/uv/install.sh | sh - # pip install --user --no-cache-dir --break-system-packages grpcio-tools==1.64.1 - # - name: Test vibevoice - # run: | - # make --jobs=5 --output-sync=target -C backend/python/vibevoice - # make --jobs=5 --output-sync=target -C backend/python/vibevoice test \ No newline at end of file + tests-voxcpm: + runs-on: ubuntu-latest + steps: + - name: Clone + uses: actions/checkout@v6 + with: + submodules: true + - name: Dependencies + run: | + sudo apt-get update + sudo apt-get install build-essential ffmpeg + sudo apt-get install -y ca-certificates cmake curl patch python3-pip + # Install UV + curl -LsSf https://astral.sh/uv/install.sh | sh + pip install --user --no-cache-dir grpcio-tools==1.64.1 + - name: Test voxcpm + run: | + make --jobs=5 --output-sync=target -C backend/python/voxcpm + make --jobs=5 --output-sync=target -C backend/python/voxcpm test diff --git a/Makefile b/Makefile index 4a8af4f4e..6580d7486 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Disable parallel execution for backend builds -.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr 
backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts +.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/voxcpm GOCMD=go GOTEST=$(GOCMD) test @@ -319,6 +319,7 @@ prepare-test-extra: protogen-python $(MAKE) -C backend/python/moonshine $(MAKE) -C backend/python/pocket-tts $(MAKE) -C backend/python/qwen-tts + $(MAKE) -C backend/python/voxcpm test-extra: prepare-test-extra $(MAKE) -C backend/python/transformers test @@ -330,6 +331,7 @@ test-extra: prepare-test-extra $(MAKE) -C backend/python/moonshine test $(MAKE) -C backend/python/pocket-tts test $(MAKE) -C backend/python/qwen-tts test + $(MAKE) -C backend/python/voxcpm test DOCKER_IMAGE?=local-ai DOCKER_AIO_IMAGE?=local-ai-aio @@ -462,6 +464,7 @@ BACKEND_VIBEVOICE = vibevoice|python|.|--progress=plain|true BACKEND_MOONSHINE = moonshine|python|.|false|true BACKEND_POCKET_TTS = pocket-tts|python|.|false|true BACKEND_QWEN_TTS = qwen-tts|python|.|false|true +BACKEND_VOXCPM = voxcpm|python|.|false|true # Helper function to build docker image for a backend # Usage: $(call docker-build-backend,BACKEND_NAME,DOCKERFILE_TYPE,BUILD_CONTEXT,PROGRESS_FLAG,NEEDS_BACKEND_ARG) @@ -507,12 +510,13 @@ $(eval $(call generate-docker-build-target,$(BACKEND_VIBEVOICE))) $(eval $(call generate-docker-build-target,$(BACKEND_MOONSHINE))) $(eval $(call generate-docker-build-target,$(BACKEND_POCKET_TTS))) $(eval $(call generate-docker-build-target,$(BACKEND_QWEN_TTS))) +$(eval $(call generate-docker-build-target,$(BACKEND_VOXCPM))) # Pattern rule for docker-save targets docker-save-%: backend-images docker save local-ai-backend:$* -o backend-images/$*.tar -docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts +docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-voxcpm ######################################################## ### END Backends diff --git a/backend/index.yaml b/backend/index.yaml index d3dcb870d..50c1b0288 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -414,6 +414,25 @@ nvidia-l4t-cuda-12: "nvidia-l4t-qwen-tts" nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-qwen-tts" icon: 
https://cdn-avatars.huggingface.co/v1/production/uploads/620760a26e3b7210c2ff1943/-s1gyJfvbE1RgO5iBeNOi.png +- &voxcpm + urls: + - https://github.com/OpenBMB/VoxCPM + description: | + VoxCPM is a tokenizer-free, end-to-end TTS model from ModelBest (OpenBMB), designed to generate highly expressive speech and support zero-shot voice cloning. + tags: + - text-to-speech + - TTS + license: mit + name: "voxcpm" + alias: "voxcpm" + capabilities: + nvidia: "cuda12-voxcpm" + intel: "intel-voxcpm" + amd: "rocm-voxcpm" + default: "cpu-voxcpm" + nvidia-cuda-13: "cuda13-voxcpm" + nvidia-cuda-12: "cuda12-voxcpm" + icon: https://avatars.githubusercontent.com/u/6154722?s=200&v=4 - &pocket-tts urls: - https://github.com/kyutai-labs/pocket-tts @@ -1652,6 +1671,66 @@ uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-qwen-tts" mirrors: - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-qwen-tts +## voxcpm +- !!merge <<: *voxcpm + name: "voxcpm-development" + capabilities: + nvidia: "cuda12-voxcpm-development" + intel: "intel-voxcpm-development" + amd: "rocm-voxcpm-development" + default: "cpu-voxcpm-development" + nvidia-cuda-13: "cuda13-voxcpm-development" + nvidia-cuda-12: "cuda12-voxcpm-development" +- !!merge <<: *voxcpm + name: "cpu-voxcpm" + uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-voxcpm" + mirrors: + - localai/localai-backends:latest-cpu-voxcpm +- !!merge <<: *voxcpm + name: "cpu-voxcpm-development" + uri: "quay.io/go-skynet/local-ai-backends:master-cpu-voxcpm" + mirrors: + - localai/localai-backends:master-cpu-voxcpm +- !!merge <<: *voxcpm + name: "cuda12-voxcpm" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-voxcpm" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-12-voxcpm +- !!merge <<: *voxcpm + name: "cuda12-voxcpm-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-voxcpm" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-12-voxcpm +- !!merge <<: *voxcpm + name: "cuda13-voxcpm" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-voxcpm" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-13-voxcpm +- !!merge <<: *voxcpm + name: "cuda13-voxcpm-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-voxcpm" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-13-voxcpm +- !!merge <<: *voxcpm + name: "intel-voxcpm" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-voxcpm" + mirrors: + - localai/localai-backends:latest-gpu-intel-voxcpm +- !!merge <<: *voxcpm + name: "intel-voxcpm-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-voxcpm" + mirrors: + - localai/localai-backends:master-gpu-intel-voxcpm +- !!merge <<: *voxcpm + name: "rocm-voxcpm" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-voxcpm" + mirrors: + - localai/localai-backends:latest-gpu-rocm-hipblas-voxcpm +- !!merge <<: *voxcpm + name: "rocm-voxcpm-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-voxcpm" + mirrors: + - localai/localai-backends:master-gpu-rocm-hipblas-voxcpm ## pocket-tts - !!merge <<: *pocket-tts name: "pocket-tts-development" diff --git a/backend/python/voxcpm/Makefile b/backend/python/voxcpm/Makefile new file mode 100644 index 000000000..bfcf684ae --- /dev/null +++ b/backend/python/voxcpm/Makefile @@ -0,0 +1,23 @@ +.PHONY: voxcpm +voxcpm: + bash install.sh + +.PHONY: run +run: voxcpm + @echo "Running voxcpm..." + bash run.sh + @echo "voxcpm run."
+ +.PHONY: test +test: voxcpm + @echo "Testing voxcpm..." + bash test.sh + @echo "voxcpm tested." + +.PHONY: protogen-clean +protogen-clean: + $(RM) backend_pb2_grpc.py backend_pb2.py + +.PHONY: clean +clean: protogen-clean + rm -rf venv __pycache__ diff --git a/backend/python/voxcpm/backend.py b/backend/python/voxcpm/backend.py new file mode 100644 index 000000000..84bb99e96 --- /dev/null +++ b/backend/python/voxcpm/backend.py @@ -0,0 +1,245 @@ +#!/usr/bin/env python3 +""" +This is an extra gRPC server of LocalAI for VoxCPM +""" +from concurrent import futures +import time +import argparse +import signal +import sys +import os +import traceback +import numpy as np +import soundfile as sf +from voxcpm import VoxCPM + +import backend_pb2 +import backend_pb2_grpc +import torch + +import grpc + +def is_float(s): + """Check if a string can be converted to float.""" + try: + float(s) + return True + except ValueError: + return False + +def is_int(s): + """Check if a string can be converted to int.""" + try: + int(s) + return True + except ValueError: + return False + +_ONE_DAY_IN_SECONDS = 60 * 60 * 24 + +# If MAX_WORKERS are specified in the environment use it, otherwise default to 1 +MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) + +# Implement the BackendServicer class with the service methods +class BackendServicer(backend_pb2_grpc.BackendServicer): + """ + BackendServicer is the class that implements the gRPC service + """ + def Health(self, request, context): + return backend_pb2.Reply(message=bytes("OK", 'utf-8')) + + def LoadModel(self, request, context): + # Get device + if torch.cuda.is_available(): + print("CUDA is available", file=sys.stderr) + device = "cuda" + else: + print("CUDA is not available", file=sys.stderr) + device = "cpu" + mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + if mps_available: + device = "mps" + if not torch.cuda.is_available() and request.CUDA: + return backend_pb2.Result(success=False, message="CUDA is not available") + + # Normalize potential 'mpx' typo to 'mps' + if device == "mpx": + print("Note: device 'mpx' detected, treating it as 'mps'.", file=sys.stderr) + device = "mps" + + # Validate mps availability if requested + if device == "mps" and not torch.backends.mps.is_available(): + print("Warning: MPS not available. 
Falling back to CPU.", file=sys.stderr) + device = "cpu" + + self.device = device + + options = request.Options + + # empty dict + self.options = {} + + # The options are a list of strings in this form optname:optvalue + # We are storing all the options in a dict so we can use it later when + # generating the audio + for opt in options: + if ":" not in opt: + continue + key, value = opt.split(":", 1) # Split only on first colon + # if value is a number, convert it to the appropriate type + if is_float(value): + value = float(value) + elif is_int(value): + value = int(value) + elif value.lower() in ["true", "false"]: + value = value.lower() == "true" + self.options[key] = value + + # Get model path from request + model_path = request.Model + if not model_path: + model_path = "openbmb/VoxCPM1.5" + + try: + print(f"Loading model from {model_path}", file=sys.stderr) + self.model = VoxCPM.from_pretrained(model_path) + print(f"Model loaded successfully on device: {self.device}", file=sys.stderr) + except Exception as err: + return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") + + return backend_pb2.Result(message="Model loaded successfully", success=True) + + def TTS(self, request, context): + try: + # Get generation parameters from options with defaults + cfg_value = self.options.get("cfg_value", 2.0) + inference_timesteps = self.options.get("inference_timesteps", 10) + normalize = self.options.get("normalize", False) + denoise = self.options.get("denoise", False) + retry_badcase = self.options.get("retry_badcase", True) + retry_badcase_max_times = self.options.get("retry_badcase_max_times", 3) + retry_badcase_ratio_threshold = self.options.get("retry_badcase_ratio_threshold", 6.0) + use_streaming = self.options.get("streaming", False) + + # Handle voice cloning via prompt_wav_path and prompt_text + prompt_wav_path = None + prompt_text = None + + # Resolve the reference audio: request.AudioPath (when set) takes precedence over request.voice; prompt_text comes from options + if hasattr(request, 'voice') and request.voice: + # If voice is provided, try to use it as a path + if os.path.exists(request.voice): + prompt_wav_path = request.voice + elif hasattr(request, 'ModelFile') and request.ModelFile: + model_file_base = os.path.dirname(request.ModelFile) + potential_path = os.path.join(model_file_base, request.voice) + if os.path.exists(potential_path): + prompt_wav_path = potential_path + elif hasattr(request, 'ModelPath') and request.ModelPath: + potential_path = os.path.join(request.ModelPath, request.voice) + if os.path.exists(potential_path): + prompt_wav_path = potential_path + + if hasattr(request, 'AudioPath') and request.AudioPath: + if os.path.isabs(request.AudioPath): + prompt_wav_path = request.AudioPath + elif hasattr(request, 'ModelFile') and request.ModelFile: + model_file_base = os.path.dirname(request.ModelFile) + prompt_wav_path = os.path.join(model_file_base, request.AudioPath) + elif hasattr(request, 'ModelPath') and request.ModelPath: + prompt_wav_path = os.path.join(request.ModelPath, request.AudioPath) + else: + prompt_wav_path = request.AudioPath + + # Get prompt_text from options if available + if "prompt_text" in self.options: + prompt_text = self.options["prompt_text"] + + # Prepare text + text = request.text.strip() + + print(f"Generating audio with cfg_value: {cfg_value}, inference_timesteps: {inference_timesteps}, streaming: {use_streaming}", file=sys.stderr) + + # Generate audio + if use_streaming: + # Streaming generation + chunks = [] + for chunk in self.model.generate_streaming( + text=text, +
prompt_wav_path=prompt_wav_path, + prompt_text=prompt_text, + cfg_value=cfg_value, + inference_timesteps=inference_timesteps, + normalize=normalize, + denoise=denoise, + retry_badcase=retry_badcase, + retry_badcase_max_times=retry_badcase_max_times, + retry_badcase_ratio_threshold=retry_badcase_ratio_threshold, + ): + chunks.append(chunk) + wav = np.concatenate(chunks) + else: + # Non-streaming generation + wav = self.model.generate( + text=text, + prompt_wav_path=prompt_wav_path, + prompt_text=prompt_text, + cfg_value=cfg_value, + inference_timesteps=inference_timesteps, + normalize=normalize, + denoise=denoise, + retry_badcase=retry_badcase, + retry_badcase_max_times=retry_badcase_max_times, + retry_badcase_ratio_threshold=retry_badcase_ratio_threshold, + ) + + # Get sample rate from model + sample_rate = self.model.tts_model.sample_rate + + # Save output + sf.write(request.dst, wav, sample_rate) + print(f"Saved output to {request.dst}", file=sys.stderr) + + except Exception as err: + print(f"Error in TTS: {err}", file=sys.stderr) + print(traceback.format_exc(), file=sys.stderr) + return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") + + return backend_pb2.Result(success=True) + +def serve(address): + server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), + options=[ + ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB + ]) + backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) + server.add_insecure_port(address) + server.start() + print("Server started. Listening on: " + address, file=sys.stderr) + + # Define the signal handler function + def signal_handler(sig, frame): + print("Received termination signal. Shutting down...") + server.stop(0) + sys.exit(0) + + # Set the signal handlers for SIGINT and SIGTERM + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + try: + while True: + time.sleep(_ONE_DAY_IN_SECONDS) + except KeyboardInterrupt: + server.stop(0) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run the gRPC server.") + parser.add_argument( + "--addr", default="localhost:50051", help="The address to bind the server to." 
+ ) + args = parser.parse_args() + + serve(args.addr) diff --git a/backend/python/voxcpm/install.sh b/backend/python/voxcpm/install.sh new file mode 100755 index 000000000..9d167d829 --- /dev/null +++ b/backend/python/voxcpm/install.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -e + +backend_dir=$(dirname $0) +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +installRequirements + +# Apply patch to fix PyTorch compatibility issue in voxcpm +# This fixes the "Dimension out of range" error in scaled_dot_product_attention +# by changing .contiguous() to .unsqueeze(0) in the attention module +# The patch is needed because voxcpm's initialization test generation fails with +# certain PyTorch versions due to a bug in scaled_dot_product_attention +# https://github.com/OpenBMB/VoxCPM/issues/71#issuecomment-3441789452 +VOXCPM_PATH=$(python -c "import voxcpm; import os; print(os.path.dirname(voxcpm.__file__))" 2>/dev/null || echo "") +if [ -n "$VOXCPM_PATH" ] && [ -f "$VOXCPM_PATH/modules/minicpm4/model.py" ]; then + echo "Applying patch to voxcpm at $VOXCPM_PATH/modules/minicpm4/model.py" + # Replace .contiguous() with .unsqueeze(0) for the three lines in the attention forward_step method + # This fixes the dimension error in scaled_dot_product_attention + sed -i 's/query_states = query_states\.contiguous()/query_states = query_states.unsqueeze(0)/g' "$VOXCPM_PATH/modules/minicpm4/model.py" + sed -i 's/key_cache = key_cache\.contiguous()/key_cache = key_cache.unsqueeze(0)/g' "$VOXCPM_PATH/modules/minicpm4/model.py" + sed -i 's/value_cache = value_cache\.contiguous()/value_cache = value_cache.unsqueeze(0)/g' "$VOXCPM_PATH/modules/minicpm4/model.py" + echo "Patch applied successfully" +else + echo "Warning: Could not find voxcpm installation to apply patch (path: ${VOXCPM_PATH:-not found})" +fi diff --git a/backend/python/voxcpm/protogen.sh b/backend/python/voxcpm/protogen.sh new file mode 100755 index 000000000..df3325c6f --- /dev/null +++ b/backend/python/voxcpm/protogen.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -e + +backend_dir=$(dirname $0) +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +runProtogen diff --git a/backend/python/voxcpm/requirements-cpu.txt b/backend/python/voxcpm/requirements-cpu.txt new file mode 100644 index 000000000..a6369ef01 --- /dev/null +++ b/backend/python/voxcpm/requirements-cpu.txt @@ -0,0 +1,6 @@ +--extra-index-url https://download.pytorch.org/whl/cpu +torch +soundfile +numpy +voxcpm +torchcodec \ No newline at end of file diff --git a/backend/python/voxcpm/requirements-cublas12.txt b/backend/python/voxcpm/requirements-cublas12.txt new file mode 100644 index 000000000..0482e1408 --- /dev/null +++ b/backend/python/voxcpm/requirements-cublas12.txt @@ -0,0 +1,5 @@ +--extra-index-url https://download.pytorch.org/whl/cu121 +torch +soundfile +numpy +voxcpm diff --git a/backend/python/voxcpm/requirements-cublas13.txt b/backend/python/voxcpm/requirements-cublas13.txt new file mode 100644 index 000000000..a17b28fa7 --- /dev/null +++ b/backend/python/voxcpm/requirements-cublas13.txt @@ -0,0 +1,5 @@ +--extra-index-url https://download.pytorch.org/whl/cu130 +torch +soundfile +numpy +voxcpm diff --git a/backend/python/voxcpm/requirements-hipblas.txt b/backend/python/voxcpm/requirements-hipblas.txt new file mode 100644 index 000000000..7541c8149 --- /dev/null +++ b/backend/python/voxcpm/requirements-hipblas.txt @@ -0,0 
+1,5 @@ +--extra-index-url https://download.pytorch.org/whl/rocm6.3 +torch==2.7.1+rocm6.3 +soundfile +numpy +voxcpm diff --git a/backend/python/voxcpm/requirements-intel.txt b/backend/python/voxcpm/requirements-intel.txt new file mode 100644 index 000000000..10269abf6 --- /dev/null +++ b/backend/python/voxcpm/requirements-intel.txt @@ -0,0 +1,6 @@ +--extra-index-url https://download.pytorch.org/whl/xpu +torch +setuptools +soundfile +numpy +voxcpm diff --git a/backend/python/voxcpm/requirements-l4t12.txt b/backend/python/voxcpm/requirements-l4t12.txt new file mode 100644 index 000000000..5967d6fd9 --- /dev/null +++ b/backend/python/voxcpm/requirements-l4t12.txt @@ -0,0 +1,5 @@ +--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu129/ +torch +soundfile +numpy +voxcpm diff --git a/backend/python/voxcpm/requirements-l4t13.txt b/backend/python/voxcpm/requirements-l4t13.txt new file mode 100644 index 000000000..a17b28fa7 --- /dev/null +++ b/backend/python/voxcpm/requirements-l4t13.txt @@ -0,0 +1,5 @@ +--extra-index-url https://download.pytorch.org/whl/cu130 +torch +soundfile +numpy +voxcpm diff --git a/backend/python/voxcpm/requirements-mps.txt b/backend/python/voxcpm/requirements-mps.txt new file mode 100644 index 000000000..bebe7af62 --- /dev/null +++ b/backend/python/voxcpm/requirements-mps.txt @@ -0,0 +1,4 @@ +torch +soundfile +numpy +voxcpm diff --git a/backend/python/voxcpm/requirements.txt b/backend/python/voxcpm/requirements.txt new file mode 100644 index 000000000..cc1cd74bd --- /dev/null +++ b/backend/python/voxcpm/requirements.txt @@ -0,0 +1,7 @@ +grpcio==1.76.0 +protobuf +certifi +packaging==24.1 +soundfile +numpy +voxcpm diff --git a/backend/python/voxcpm/run.sh b/backend/python/voxcpm/run.sh new file mode 100755 index 000000000..eae121f37 --- /dev/null +++ b/backend/python/voxcpm/run.sh @@ -0,0 +1,9 @@ +#!/bin/bash +backend_dir=$(dirname $0) +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +startBackend $@ diff --git a/backend/python/voxcpm/test.py b/backend/python/voxcpm/test.py new file mode 100644 index 000000000..0a94012aa --- /dev/null +++ b/backend/python/voxcpm/test.py @@ -0,0 +1,51 @@ +""" +A test script to test the gRPC service +""" +import unittest +import subprocess +import time +import backend_pb2 +import backend_pb2_grpc + +import grpc + + +class TestBackendServicer(unittest.TestCase): + """ + TestBackendServicer is the class that tests the gRPC service + """ + def setUp(self): + """ + This method sets up the gRPC service by starting the server + """ + self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"]) + time.sleep(30) + + def tearDown(self) -> None: + """ + This method tears down the gRPC service by terminating the server + """ + self.service.terminate() + self.service.wait() + + def test_load_model(self): + """ + This method tests if the model is loaded successfully + """ + try: + self.setUp() + print("Starting test_load_model") + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="openbmb/VoxCPM1.5")) + print(response) + self.assertTrue(response.success) + self.assertEqual(response.message, "Model loaded successfully") + tts_request = backend_pb2.TTSRequest(text="VoxCPM is an innovative end-to-end TTS model from ModelBest.", dst="test.wav") + tts_response = stub.TTS(tts_request) + self.assertIsNotNone(tts_response) + except 
Exception as err: + print(err) + self.fail("LoadModel service failed") + finally: + self.tearDown() diff --git a/backend/python/voxcpm/test.sh b/backend/python/voxcpm/test.sh new file mode 100755 index 000000000..eb59f2aaf --- /dev/null +++ b/backend/python/voxcpm/test.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -e + +backend_dir=$(dirname $0) +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +runUnittests diff --git a/core/http/middleware/request.go b/core/http/middleware/request.go index 115a00149..2af25b279 100644 --- a/core/http/middleware/request.go +++ b/core/http/middleware/request.go @@ -491,6 +491,8 @@ func (re *RequestExtractor) SetOpenResponsesRequest(c echo.Context) error { return echo.ErrBadRequest } + // Convert input items to Messages (this will be done in the endpoint handler) + // We store the input in the request for the endpoint to process cfg, ok := c.Get(CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig) if !ok || cfg == nil { return echo.ErrBadRequest