feat(tts): add pocket-tts backend (#8018)

* feat(pocket-tts): add new backend

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Add to the gallery

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fixups

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Update docs

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Author: Ettore Di Giacinto
Date: 2026-01-13 23:35:19 +01:00 (committed by GitHub)
Commit: a6ff354c86 (parent: 3a2be4df48)
25 changed files with 847 additions and 17 deletions
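Once the backend is installed, it is meant to be driven through LocalAI's TTS API. The snippet below is a minimal usage sketch, assuming LocalAI's usual /tts endpoint on the default port and a model configured against the pocket-tts backend; the model name, voice value, and exact request fields are illustrative here and may differ from the updated docs.

# Hypothetical usage sketch: call the new pocket-tts backend through LocalAI's
# HTTP TTS endpoint. Endpoint shape, model name and voice are assumptions for
# illustration only.
import requests

resp = requests.post(
    "http://localhost:8080/tts",
    json={
        "model": "pocket-tts",                    # assumed model name
        "input": "Hello world, this is a test.",  # text to synthesize
        "voice": "azelma",                        # voice name, as used in the tests below
    },
    timeout=120,
)
resp.raise_for_status()
with open("out.wav", "wb") as f:
    f.write(resp.content)  # the endpoint is expected to return the generated WAV audio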


@@ -0,0 +1,23 @@
.PHONY: pocket-tts
pocket-tts:
	bash install.sh

.PHONY: run
run: pocket-tts
	@echo "Running pocket-tts..."
	bash run.sh
	@echo "pocket-tts run."

.PHONY: test
test: pocket-tts
	@echo "Testing pocket-tts..."
	bash test.sh
	@echo "pocket-tts tested."

.PHONY: protogen-clean
protogen-clean:
	$(RM) backend_pb2_grpc.py backend_pb2.py

.PHONY: clean
clean: protogen-clean
	rm -rf venv __pycache__


@@ -0,0 +1,255 @@
#!/usr/bin/env python3
"""
This is an extra gRPC server of LocalAI for Pocket TTS
"""
from concurrent import futures

import time
import argparse
import signal
import sys
import os
import traceback

import scipy.io.wavfile
import backend_pb2
import backend_pb2_grpc
import torch
from pocket_tts import TTSModel

import grpc


def is_float(s):
    """Check if a string can be converted to float."""
    try:
        float(s)
        return True
    except ValueError:
        return False


def is_int(s):
    """Check if a string can be converted to int."""
    try:
        int(s)
        return True
    except ValueError:
        return False


_ONE_DAY_IN_SECONDS = 60 * 60 * 24

# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))


# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
    """
    BackendServicer is the class that implements the gRPC service
    """
    def Health(self, request, context):
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))

    def LoadModel(self, request, context):
        # Get device
        if torch.cuda.is_available():
            print("CUDA is available", file=sys.stderr)
            device = "cuda"
        else:
            print("CUDA is not available", file=sys.stderr)
            device = "cpu"

        mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
        if mps_available:
            device = "mps"

        if not torch.cuda.is_available() and request.CUDA:
            return backend_pb2.Result(success=False, message="CUDA is not available")

        # Normalize potential 'mpx' typo to 'mps'
        if device == "mpx":
            print("Note: device 'mpx' detected, treating it as 'mps'.", file=sys.stderr)
            device = "mps"

        # Validate mps availability if requested
        if device == "mps" and not torch.backends.mps.is_available():
            print("Warning: MPS not available. Falling back to CPU.", file=sys.stderr)
            device = "cpu"

        self.device = device
        options = request.Options

        # empty dict
        self.options = {}

        # The options are a list of strings in this form optname:optvalue
        # We are storing all the options in a dict so we can use it later when
        # generating the audio
        for opt in options:
            if ":" not in opt:
                continue
            key, value = opt.split(":", 1)  # Split only on first colon
            # if value is a number, convert it to the appropriate type
            if is_float(value):
                value = float(value)
            elif is_int(value):
                value = int(value)
            elif value.lower() in ["true", "false"]:
                value = value.lower() == "true"
            self.options[key] = value

        # Default voice for caching
        self.default_voice_url = self.options.get("default_voice", None)
        self._voice_cache = {}

        try:
            print("Loading Pocket TTS model", file=sys.stderr)
            self.tts_model = TTSModel.load_model()
            print(f"Model loaded successfully. Sample rate: {self.tts_model.sample_rate}", file=sys.stderr)

            # Pre-load default voice if specified
            if self.default_voice_url:
                try:
                    print(f"Pre-loading default voice: {self.default_voice_url}", file=sys.stderr)
                    voice_state = self.tts_model.get_state_for_audio_prompt(self.default_voice_url)
                    self._voice_cache[self.default_voice_url] = voice_state
                    print("Default voice loaded successfully", file=sys.stderr)
                except Exception as e:
                    print(f"Warning: Failed to pre-load default voice: {e}", file=sys.stderr)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")

        return backend_pb2.Result(message="Model loaded successfully", success=True)

    def _get_voice_state(self, voice_input):
        """
        Get voice state from cache or load it.

        voice_input can be:
        - HuggingFace URL (e.g., hf://kyutai/tts-voices/alba-mackenna/casual.wav)
        - Local file path
        - None (use default)
        """
        # Use default if no voice specified
        if not voice_input:
            voice_input = self.default_voice_url

        if not voice_input:
            return None

        # Check cache first
        if voice_input in self._voice_cache:
            return self._voice_cache[voice_input]

        # Load voice state
        try:
            print(f"Loading voice from: {voice_input}", file=sys.stderr)
            voice_state = self.tts_model.get_state_for_audio_prompt(voice_input)
            self._voice_cache[voice_input] = voice_state
            return voice_state
        except Exception as e:
            print(f"Error loading voice from {voice_input}: {e}", file=sys.stderr)
            return None

    def TTS(self, request, context):
        try:
            # Determine voice input
            # Priority: request.voice > AudioPath (from ModelOptions) > default
            voice_input = None
            if request.voice:
                voice_input = request.voice
            elif hasattr(request, 'AudioPath') and request.AudioPath:
                # Use AudioPath as voice file
                if os.path.isabs(request.AudioPath):
                    voice_input = request.AudioPath
                elif hasattr(request, 'ModelFile') and request.ModelFile:
                    model_file_base = os.path.dirname(request.ModelFile)
                    voice_input = os.path.join(model_file_base, request.AudioPath)
                elif hasattr(request, 'ModelPath') and request.ModelPath:
                    voice_input = os.path.join(request.ModelPath, request.AudioPath)
                else:
                    voice_input = request.AudioPath

            # Get voice state
            voice_state = self._get_voice_state(voice_input)
            if voice_state is None:
                return backend_pb2.Result(
                    success=False,
                    message=f"Voice not found or failed to load: {voice_input}. Please provide a valid voice URL or file path."
                )

            # Prepare text
            text = request.text.strip()
            if not text:
                return backend_pb2.Result(
                    success=False,
                    message="Text is empty"
                )

            print(f"Generating audio for text: {text[:50]}...", file=sys.stderr)

            # Generate audio
            audio = self.tts_model.generate_audio(voice_state, text)

            # Audio is a 1D torch tensor containing PCM data
            if audio is None or audio.numel() == 0:
                return backend_pb2.Result(
                    success=False,
                    message="No audio generated"
                )

            # Save audio to file
            output_path = request.dst
            if not output_path:
                output_path = "/tmp/pocket-tts-output.wav"

            # Ensure output directory exists
            output_dir = os.path.dirname(output_path)
            if output_dir and not os.path.exists(output_dir):
                os.makedirs(output_dir, exist_ok=True)

            # Convert torch tensor to numpy and save
            audio_numpy = audio.numpy()
            scipy.io.wavfile.write(output_path, self.tts_model.sample_rate, audio_numpy)
            print(f"Saved audio to {output_path}", file=sys.stderr)

        except Exception as err:
            print(f"Error in TTS: {err}", file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")

        return backend_pb2.Result(success=True)


def serve(address):
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
                         options=[
                             ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
                             ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
                             ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
                         ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)

    # Define the signal handler function
    def signal_handler(sig, frame):
        print("Received termination signal. Shutting down...")
        server.stop(0)
        sys.exit(0)

    # Set the signal handlers for SIGINT and SIGTERM
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the gRPC server.")
    parser.add_argument(
        "--addr", default="localhost:50051", help="The address to bind the server to."
    )
    args = parser.parse_args()
    serve(args.addr)
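Side note on the Options handling above: LoadModel receives a list of optname:optvalue strings and coerces each value to float, int, or bool before storing it; default_voice is the only option this backend actually reads (it seeds the voice cache). Below is a small self-contained illustration of that coercion; the option names other than default_voice are made up for the example.

# Standalone illustration of the option coercion performed in LoadModel above.
# "default_voice" is real; "speed" and "verbose" are made-up names for the example.
def is_float(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

def is_int(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

def parse_options(options):
    parsed = {}
    for opt in options:
        if ":" not in opt:
            continue
        key, value = opt.split(":", 1)  # split only on the first colon, so URLs keep their colons
        if is_float(value):
            value = float(value)
        elif is_int(value):
            value = int(value)
        elif value.lower() in ["true", "false"]:
            value = value.lower() == "true"
        parsed[key] = value
    return parsed

print(parse_options([
    "default_voice:hf://kyutai/tts-voices/alba-mackenna/casual.wav",
    "speed:1.5",
    "verbose:true",
]))
# -> {'default_voice': 'hf://kyutai/tts-voices/alba-mackenna/casual.wav',
#     'speed': 1.5, 'verbose': True}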


@@ -0,0 +1,30 @@
#!/bin/bash
set -e

backend_dir=$(dirname $0)

if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
fi

# Use python 3.12 for l4t
if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
    PYTHON_VERSION="3.12"
    PYTHON_PATCH="12"
    PY_STANDALONE_TAG="20251120"
fi

if [ "x${BUILD_PROFILE}" == "xl4t12" ]; then
    USE_PIP=true
fi

installRequirements


@@ -0,0 +1,11 @@
#!/bin/bash
set -e

backend_dir=$(dirname $0)

if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto


@@ -0,0 +1,4 @@
--extra-index-url https://download.pytorch.org/whl/cpu
pocket-tts
scipy
torch


@@ -0,0 +1,4 @@
--extra-index-url https://download.pytorch.org/whl/cu121
pocket-tts
scipy
torch


@@ -0,0 +1,4 @@
--extra-index-url https://download.pytorch.org/whl/cu130
pocket-tts
scipy
torch


@@ -0,0 +1,4 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.3
pocket-tts
scipy
torch==2.7.1+rocm6.3


@@ -0,0 +1,4 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
pocket-tts
scipy
torch==2.5.1+cxx11.abi


@@ -0,0 +1,4 @@
--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu129/
pocket-tts
scipy
torch


@@ -0,0 +1,4 @@
--extra-index-url https://download.pytorch.org/whl/cu130
pocket-tts
scipy
torch


@@ -0,0 +1,4 @@
pocket-tts
scipy
torch==2.7.1
torchvision==0.22.1


@@ -0,0 +1,4 @@
grpcio==1.71.0
protobuf
certifi
packaging==24.1


@@ -0,0 +1,9 @@
#!/bin/bash
backend_dir=$(dirname $0)

if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

startBackend $@


@@ -0,0 +1,141 @@
"""
A test script to test the gRPC service
"""
import unittest
import subprocess
import time
import os
import tempfile
import backend_pb2
import backend_pb2_grpc
import grpc
class TestBackendServicer(unittest.TestCase):
"""
TestBackendServicer is the class that tests the gRPC service
"""
def setUp(self):
"""
This method sets up the gRPC service by starting the server
"""
self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
time.sleep(30)
def tearDown(self) -> None:
"""
This method tears down the gRPC service by terminating the server
"""
self.service.terminate()
self.service.wait()
def test_server_startup(self):
"""
This method tests if the server starts up successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.Health(backend_pb2.HealthMessage())
self.assertEqual(response.message, b'OK')
except Exception as err:
print(err)
self.fail("Server failed to start")
finally:
self.tearDown()
def test_load_model(self):
"""
This method tests if the model is loaded successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions())
print(response)
self.assertTrue(response.success)
self.assertEqual(response.message, "Model loaded successfully")
except Exception as err:
print(err)
self.fail("LoadModel service failed")
finally:
self.tearDown()
def test_tts_with_hf_voice(self):
"""
This method tests TTS generation with HuggingFace voice URL
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
# Load model
response = stub.LoadModel(backend_pb2.ModelOptions())
self.assertTrue(response.success)
# Create temporary output file
with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
output_path = tmp_file.name
# Test TTS with HuggingFace voice URL
tts_request = backend_pb2.TTSRequest(
text="Hello world, this is a test.",
dst=output_path,
voice="azelma"
)
tts_response = stub.TTS(tts_request)
self.assertTrue(tts_response.success)
# Verify output file exists and is not empty
self.assertTrue(os.path.exists(output_path))
self.assertGreater(os.path.getsize(output_path), 0)
# Cleanup
os.unlink(output_path)
except Exception as err:
print(err)
self.fail("TTS service failed")
finally:
self.tearDown()
def test_tts_with_default_voice(self):
"""
This method tests TTS generation with default voice (via AudioPath in LoadModel)
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
# Load model with default voice
load_request = backend_pb2.ModelOptions(
Options=["default_voice:azelma"]
)
response = stub.LoadModel(load_request)
self.assertTrue(response.success)
# Create temporary output file
with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
output_path = tmp_file.name
# Test TTS without specifying voice (should use default)
tts_request = backend_pb2.TTSRequest(
text="Hello world, this is a test.",
dst=output_path
)
tts_response = stub.TTS(tts_request)
self.assertTrue(tts_response.success)
# Verify output file exists and is not empty
self.assertTrue(os.path.exists(output_path))
self.assertGreater(os.path.getsize(output_path), 0)
# Cleanup
os.unlink(output_path)
except Exception as err:
print(err)
self.fail("TTS service with default voice failed")
finally:
self.tearDown()
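The tests above use the short voice name "azelma"; the backend also accepts HuggingFace voice URLs and local file paths, as described in the _get_voice_state docstring. A quick manual check of that path, sketched as a direct gRPC client against a backend started by hand (the hf:// URL below is the example from that docstring):

# Manual smoke test outside unittest: start the server separately with
#   python3 backend.py --addr localhost:50051
# then run this client. The hf:// voice URL is illustrative.
import backend_pb2
import backend_pb2_grpc
import grpc

with grpc.insecure_channel("localhost:50051") as channel:
    stub = backend_pb2_grpc.BackendStub(channel)
    assert stub.LoadModel(backend_pb2.ModelOptions()).success

    result = stub.TTS(backend_pb2.TTSRequest(
        text="Hello from pocket-tts.",
        voice="hf://kyutai/tts-voices/alba-mackenna/casual.wav",
        dst="/tmp/pocket-tts-sample.wav",
    ))
    print(result.success, result.message)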


@@ -0,0 +1,11 @@
#!/bin/bash
set -e

backend_dir=$(dirname $0)

if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

runUnittests