feat(tts): add pocket-tts backend (#8018)

* feat(pocket-tts): add new backend

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Add to the gallery

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fixups

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Update docs

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Author: Ettore Di Giacinto
Date: 2026-01-13 23:35:19 +01:00 (committed by GitHub)
Commit: a6ff354c86 (parent: 3a2be4df48)
25 changed files with 847 additions and 17 deletions
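Once the backend is installed, it is meant to be driven through LocalAI's TTS API. The snippet below is a minimal usage sketch, assuming LocalAI's usual /tts endpoint on the default port and a model configured against the pocket-tts backend; the model name, voice value, and exact request fields are illustrative here and may differ from the updated docs.

# Hypothetical usage sketch: call the new pocket-tts backend through LocalAI's
# HTTP TTS endpoint. Endpoint shape, model name and voice are assumptions for
# illustration only.
import requests

resp = requests.post(
    "http://localhost:8080/tts",
    json={
        "model": "pocket-tts",                    # assumed model name
        "input": "Hello world, this is a test.",  # text to synthesize
        "voice": "azelma",                        # voice name, as used in the tests below
    },
    timeout=120,
)
resp.raise_for_status()
with open("out.wav", "wb") as f:
    f.write(resp.content)  # the endpoint is expected to return the generated WAV audio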


@@ -0,0 +1,23 @@
.PHONY: pocket-tts
pocket-tts:
	bash install.sh

.PHONY: run
run: pocket-tts
	@echo "Running pocket-tts..."
	bash run.sh
	@echo "pocket-tts run."

.PHONY: test
test: pocket-tts
	@echo "Testing pocket-tts..."
	bash test.sh
	@echo "pocket-tts tested."

.PHONY: protogen-clean
protogen-clean:
	$(RM) backend_pb2_grpc.py backend_pb2.py

.PHONY: clean
clean: protogen-clean
	rm -rf venv __pycache__


@@ -0,0 +1,255 @@
#!/usr/bin/env python3
"""
This is an extra gRPC server of LocalAI for Pocket TTS
"""
from concurrent import futures

import time
import argparse
import signal
import sys
import os
import traceback

import scipy.io.wavfile
import backend_pb2
import backend_pb2_grpc
import torch
from pocket_tts import TTSModel

import grpc


def is_float(s):
    """Check if a string can be converted to float."""
    try:
        float(s)
        return True
    except ValueError:
        return False


def is_int(s):
    """Check if a string can be converted to int."""
    try:
        int(s)
        return True
    except ValueError:
        return False


_ONE_DAY_IN_SECONDS = 60 * 60 * 24

# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))


# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
    """
    BackendServicer is the class that implements the gRPC service
    """
    def Health(self, request, context):
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))

    def LoadModel(self, request, context):
        # Get device
        if torch.cuda.is_available():
            print("CUDA is available", file=sys.stderr)
            device = "cuda"
        else:
            print("CUDA is not available", file=sys.stderr)
            device = "cpu"

        mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
        if mps_available:
            device = "mps"

        if not torch.cuda.is_available() and request.CUDA:
            return backend_pb2.Result(success=False, message="CUDA is not available")

        # Normalize potential 'mpx' typo to 'mps'
        if device == "mpx":
            print("Note: device 'mpx' detected, treating it as 'mps'.", file=sys.stderr)
            device = "mps"

        # Validate mps availability if requested
        if device == "mps" and not torch.backends.mps.is_available():
            print("Warning: MPS not available. Falling back to CPU.", file=sys.stderr)
            device = "cpu"

        self.device = device
        options = request.Options

        # empty dict
        self.options = {}

        # The options are a list of strings in this form optname:optvalue
        # We are storing all the options in a dict so we can use it later when
        # generating the audio
        for opt in options:
            if ":" not in opt:
                continue
            key, value = opt.split(":", 1)  # Split only on first colon
            # if value is a number, convert it to the appropriate type
            if is_float(value):
                value = float(value)
            elif is_int(value):
                value = int(value)
            elif value.lower() in ["true", "false"]:
                value = value.lower() == "true"
            self.options[key] = value

        # Default voice for caching
        self.default_voice_url = self.options.get("default_voice", None)
        self._voice_cache = {}

        try:
            print("Loading Pocket TTS model", file=sys.stderr)
            self.tts_model = TTSModel.load_model()
            print(f"Model loaded successfully. Sample rate: {self.tts_model.sample_rate}", file=sys.stderr)

            # Pre-load default voice if specified
            if self.default_voice_url:
                try:
                    print(f"Pre-loading default voice: {self.default_voice_url}", file=sys.stderr)
                    voice_state = self.tts_model.get_state_for_audio_prompt(self.default_voice_url)
                    self._voice_cache[self.default_voice_url] = voice_state
                    print("Default voice loaded successfully", file=sys.stderr)
                except Exception as e:
                    print(f"Warning: Failed to pre-load default voice: {e}", file=sys.stderr)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")

        return backend_pb2.Result(message="Model loaded successfully", success=True)

    def _get_voice_state(self, voice_input):
        """
        Get voice state from cache or load it.

        voice_input can be:
        - HuggingFace URL (e.g., hf://kyutai/tts-voices/alba-mackenna/casual.wav)
        - Local file path
        - None (use default)
        """
        # Use default if no voice specified
        if not voice_input:
            voice_input = self.default_voice_url

        if not voice_input:
            return None

        # Check cache first
        if voice_input in self._voice_cache:
            return self._voice_cache[voice_input]

        # Load voice state
        try:
            print(f"Loading voice from: {voice_input}", file=sys.stderr)
            voice_state = self.tts_model.get_state_for_audio_prompt(voice_input)
            self._voice_cache[voice_input] = voice_state
            return voice_state
        except Exception as e:
            print(f"Error loading voice from {voice_input}: {e}", file=sys.stderr)
            return None

    def TTS(self, request, context):
        try:
            # Determine voice input
            # Priority: request.voice > AudioPath (from ModelOptions) > default
            voice_input = None
            if request.voice:
                voice_input = request.voice
            elif hasattr(request, 'AudioPath') and request.AudioPath:
                # Use AudioPath as voice file
                if os.path.isabs(request.AudioPath):
                    voice_input = request.AudioPath
                elif hasattr(request, 'ModelFile') and request.ModelFile:
                    model_file_base = os.path.dirname(request.ModelFile)
                    voice_input = os.path.join(model_file_base, request.AudioPath)
                elif hasattr(request, 'ModelPath') and request.ModelPath:
                    voice_input = os.path.join(request.ModelPath, request.AudioPath)
                else:
                    voice_input = request.AudioPath

            # Get voice state
            voice_state = self._get_voice_state(voice_input)
            if voice_state is None:
                return backend_pb2.Result(
                    success=False,
                    message=f"Voice not found or failed to load: {voice_input}. Please provide a valid voice URL or file path."
                )

            # Prepare text
            text = request.text.strip()
            if not text:
                return backend_pb2.Result(
                    success=False,
                    message="Text is empty"
                )

            print(f"Generating audio for text: {text[:50]}...", file=sys.stderr)

            # Generate audio
            audio = self.tts_model.generate_audio(voice_state, text)

            # Audio is a 1D torch tensor containing PCM data
            if audio is None or audio.numel() == 0:
                return backend_pb2.Result(
                    success=False,
                    message="No audio generated"
                )

            # Save audio to file
            output_path = request.dst
            if not output_path:
                output_path = "/tmp/pocket-tts-output.wav"

            # Ensure output directory exists
            output_dir = os.path.dirname(output_path)
            if output_dir and not os.path.exists(output_dir):
                os.makedirs(output_dir, exist_ok=True)

            # Convert torch tensor to numpy and save
            audio_numpy = audio.numpy()
            scipy.io.wavfile.write(output_path, self.tts_model.sample_rate, audio_numpy)
            print(f"Saved audio to {output_path}", file=sys.stderr)

        except Exception as err:
            print(f"Error in TTS: {err}", file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")

        return backend_pb2.Result(success=True)


def serve(address):
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
                         options=[
                             ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
                             ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
                             ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
                         ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)

    # Define the signal handler function
    def signal_handler(sig, frame):
        print("Received termination signal. Shutting down...")
        server.stop(0)
        sys.exit(0)

    # Set the signal handlers for SIGINT and SIGTERM
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the gRPC server.")
    parser.add_argument(
        "--addr", default="localhost:50051", help="The address to bind the server to."
    )
    args = parser.parse_args()
    serve(args.addr)
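Side note on the Options handling above: LoadModel receives a list of optname:optvalue strings and coerces each value to float, int, or bool before storing it; default_voice is the only option this backend actually reads (it seeds the voice cache). Below is a small self-contained illustration of that coercion; the option names other than default_voice are made up for the example.

# Standalone illustration of the option coercion performed in LoadModel above.
# "default_voice" is real; "speed" and "verbose" are made-up names for the example.
def is_float(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

def is_int(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

def parse_options(options):
    parsed = {}
    for opt in options:
        if ":" not in opt:
            continue
        key, value = opt.split(":", 1)  # split only on the first colon, so URLs keep their colons
        if is_float(value):
            value = float(value)
        elif is_int(value):
            value = int(value)
        elif value.lower() in ["true", "false"]:
            value = value.lower() == "true"
        parsed[key] = value
    return parsed

print(parse_options([
    "default_voice:hf://kyutai/tts-voices/alba-mackenna/casual.wav",
    "speed:1.5",
    "verbose:true",
]))
# -> {'default_voice': 'hf://kyutai/tts-voices/alba-mackenna/casual.wav',
#     'speed': 1.5, 'verbose': True}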


@@ -0,0 +1,30 @@
#!/bin/bash
set -e

backend_dir=$(dirname $0)

if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
fi

# Use python 3.12 for l4t
if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
    PYTHON_VERSION="3.12"
    PYTHON_PATCH="12"
    PY_STANDALONE_TAG="20251120"
fi

if [ "x${BUILD_PROFILE}" == "xl4t12" ]; then
    USE_PIP=true
fi

installRequirements


@@ -0,0 +1,11 @@
#!/bin/bash
set -e

backend_dir=$(dirname $0)

if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto


@@ -0,0 +1,4 @@
--extra-index-url https://download.pytorch.org/whl/cpu
pocket-tts
scipy
torch


@@ -0,0 +1,4 @@
--extra-index-url https://download.pytorch.org/whl/cu121
pocket-tts
scipy
torch


@@ -0,0 +1,4 @@
--extra-index-url https://download.pytorch.org/whl/cu130
pocket-tts
scipy
torch


@@ -0,0 +1,4 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.3
pocket-tts
scipy
torch==2.7.1+rocm6.3


@@ -0,0 +1,4 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
pocket-tts
scipy
torch==2.5.1+cxx11.abi


@@ -0,0 +1,4 @@
--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu129/
pocket-tts
scipy
torch


@@ -0,0 +1,4 @@
--extra-index-url https://download.pytorch.org/whl/cu130
pocket-tts
scipy
torch


@@ -0,0 +1,4 @@
pocket-tts
scipy
torch==2.7.1
torchvision==0.22.1


@@ -0,0 +1,4 @@
grpcio==1.71.0
protobuf
certifi
packaging==24.1


@@ -0,0 +1,9 @@
#!/bin/bash
backend_dir=$(dirname $0)

if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

startBackend $@


@@ -0,0 +1,141 @@
"""
A test script to test the gRPC service
"""
import unittest
import subprocess
import time
import os
import tempfile
import backend_pb2
import backend_pb2_grpc
import grpc
class TestBackendServicer(unittest.TestCase):
"""
TestBackendServicer is the class that tests the gRPC service
"""
def setUp(self):
"""
This method sets up the gRPC service by starting the server
"""
self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
time.sleep(30)
def tearDown(self) -> None:
"""
This method tears down the gRPC service by terminating the server
"""
self.service.terminate()
self.service.wait()
def test_server_startup(self):
"""
This method tests if the server starts up successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.Health(backend_pb2.HealthMessage())
self.assertEqual(response.message, b'OK')
except Exception as err:
print(err)
self.fail("Server failed to start")
finally:
self.tearDown()
def test_load_model(self):
"""
This method tests if the model is loaded successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions())
print(response)
self.assertTrue(response.success)
self.assertEqual(response.message, "Model loaded successfully")
except Exception as err:
print(err)
self.fail("LoadModel service failed")
finally:
self.tearDown()
def test_tts_with_hf_voice(self):
"""
This method tests TTS generation with HuggingFace voice URL
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
# Load model
response = stub.LoadModel(backend_pb2.ModelOptions())
self.assertTrue(response.success)
# Create temporary output file
with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
output_path = tmp_file.name
# Test TTS with HuggingFace voice URL
tts_request = backend_pb2.TTSRequest(
text="Hello world, this is a test.",
dst=output_path,
voice="azelma"
)
tts_response = stub.TTS(tts_request)
self.assertTrue(tts_response.success)
# Verify output file exists and is not empty
self.assertTrue(os.path.exists(output_path))
self.assertGreater(os.path.getsize(output_path), 0)
# Cleanup
os.unlink(output_path)
except Exception as err:
print(err)
self.fail("TTS service failed")
finally:
self.tearDown()
def test_tts_with_default_voice(self):
"""
This method tests TTS generation with default voice (via AudioPath in LoadModel)
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
# Load model with default voice
load_request = backend_pb2.ModelOptions(
Options=["default_voice:azelma"]
)
response = stub.LoadModel(load_request)
self.assertTrue(response.success)
# Create temporary output file
with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
output_path = tmp_file.name
# Test TTS without specifying voice (should use default)
tts_request = backend_pb2.TTSRequest(
text="Hello world, this is a test.",
dst=output_path
)
tts_response = stub.TTS(tts_request)
self.assertTrue(tts_response.success)
# Verify output file exists and is not empty
self.assertTrue(os.path.exists(output_path))
self.assertGreater(os.path.getsize(output_path), 0)
# Cleanup
os.unlink(output_path)
except Exception as err:
print(err)
self.fail("TTS service with default voice failed")
finally:
self.tearDown()
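The tests above use the short voice name "azelma"; the backend also accepts HuggingFace voice URLs and local file paths, as described in the _get_voice_state docstring. A quick manual check of that path, sketched as a direct gRPC client against a backend started by hand (the hf:// URL below is the example from that docstring):

# Manual smoke test outside unittest: start the server separately with
#   python3 backend.py --addr localhost:50051
# then run this client. The hf:// voice URL is illustrative.
import backend_pb2
import backend_pb2_grpc
import grpc

with grpc.insecure_channel("localhost:50051") as channel:
    stub = backend_pb2_grpc.BackendStub(channel)
    assert stub.LoadModel(backend_pb2.ModelOptions()).success

    result = stub.TTS(backend_pb2.TTSRequest(
        text="Hello from pocket-tts.",
        voice="hf://kyutai/tts-voices/alba-mackenna/casual.wav",
        dst="/tmp/pocket-tts-sample.wav",
    ))
    print(result.success, result.message)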


@@ -0,0 +1,11 @@
#!/bin/bash
set -e

backend_dir=$(dirname $0)

if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

runUnittests