feat(chatterbox): support multilingual (#6240)

* feat(chatterbox): support multilingual

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Add l4t support

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Fixups

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: switch to fork

Until https://github.com/resemble-ai/chatterbox/pull/295 is merged

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2025-09-24 18:37:37 +02:00
committed by GitHub
parent b5efc4f89e
commit c85d559919
11 changed files with 107 additions and 21 deletions

View File

@@ -955,6 +955,18 @@ jobs:
backend: "exllama2"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'true'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-arm64-chatterbox'
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
runs-on: 'ubuntu-24.04-arm'
backend: "chatterbox"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
# runs out of space on the runner
# - build-type: 'hipblas'
# cuda-major-version: ""

View File

@@ -429,6 +429,9 @@ docker-build-kitten-tts:
docker-save-kitten-tts: backend-images
docker save local-ai-backend:kitten-tts -o backend-images/kitten-tts.tar
docker-save-chatterbox: backend-images
docker save local-ai-backend:chatterbox -o backend-images/chatterbox.tar
docker-build-kokoro:
docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:kokoro -f backend/Dockerfile.python --build-arg BACKEND=kokoro ./backend

View File

@@ -353,6 +353,7 @@
nvidia: "cuda12-chatterbox"
metal: "metal-chatterbox"
default: "cpu-chatterbox"
nvidia-l4t: "nvidia-l4t-arm64-chatterbox"
- &piper
name: "piper"
uri: "quay.io/go-skynet/local-ai-backends:latest-piper"
@@ -1239,6 +1240,7 @@
nvidia: "cuda12-chatterbox-development"
metal: "metal-chatterbox-development"
default: "cpu-chatterbox-development"
nvidia-l4t: "nvidia-l4t-arm64-chatterbox-development"
- !!merge <<: *chatterbox
name: "cpu-chatterbox"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-chatterbox"
@@ -1249,6 +1251,16 @@
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-chatterbox"
mirrors:
- localai/localai-backends:master-cpu-chatterbox
- !!merge <<: *chatterbox
name: "nvidia-l4t-arm64-chatterbox"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-l4t-arm64-chatterbox"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-l4t-arm64-chatterbox
- !!merge <<: *chatterbox
name: "nvidia-l4t-arm64-chatterbox-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-l4t-arm64-chatterbox"
mirrors:
- localai/localai-backends:master-gpu-nvidia-l4t-arm64-chatterbox
- !!merge <<: *chatterbox
name: "metal-chatterbox"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-chatterbox"

View File

@@ -14,9 +14,23 @@ import backend_pb2_grpc
import torch
import torchaudio as ta
from chatterbox.tts import ChatterboxTTS
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
import grpc
def is_float(s):
    """Return True if *s* can be converted to a float, False otherwise.

    Args:
        s: Value to probe, normally the string value of a backend option.

    Returns:
        bool: True when ``float(s)`` succeeds, False when it is rejected.

    Note:
        ``float()`` also accepts literals such as ``"nan"`` and ``"inf"``,
        so those strings are reported as convertible.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        # ValueError: a string that is not a float literal.
        # TypeError: not a string or number at all (e.g. None); report it
        # as "not convertible" instead of letting the probe itself crash.
        return False
    return True
def is_int(s):
    """Return True if *s* can be converted to an int, False otherwise.

    Args:
        s: Value to probe, normally the string value of a backend option.

    Returns:
        bool: True when ``int(s)`` succeeds, False when it is rejected.
        Strings with a fractional part (e.g. ``"1.5"``) are rejected.
    """
    try:
        int(s)
    except (TypeError, ValueError):
        # ValueError: a string that is not an integer literal.
        # TypeError: not a string or number at all (e.g. None); report it
        # as "not convertible" instead of letting the probe itself crash.
        return False
    return True
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -47,6 +61,28 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if not torch.cuda.is_available() and request.CUDA:
return backend_pb2.Result(success=False, message="CUDA is not available")
options = request.Options
# empty dict
self.options = {}
# The options are a list of strings in the form optname:optvalue.
# We store all the options in a dict so we can use them later when
# generating the audio.
for opt in options:
if ":" not in opt:
continue
key, value = opt.split(":")
# if value is a number, convert it to the appropriate type
if is_float(value):
value = float(value)
elif is_int(value):
value = int(value)
elif value.lower() in ["true", "false"]:
value = value.lower() == "true"
self.options[key] = value
self.AudioPath = None
if os.path.isabs(request.AudioPath):
@@ -56,10 +92,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
modelFileBase = os.path.dirname(request.ModelFile)
# make AudioPath relative to modelFileBase
self.AudioPath = os.path.join(modelFileBase, request.AudioPath)
try:
print("Preparing models, please wait", file=sys.stderr)
self.model = ChatterboxTTS.from_pretrained(device=device)
if "multilingual" in self.options:
# remove key from options
del self.options["multilingual"]
self.model = ChatterboxMultilingualTTS.from_pretrained(device=device)
else:
self.model = ChatterboxTTS.from_pretrained(device=device)
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
# Implement your logic here for the LoadModel service
@@ -68,12 +108,18 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
def TTS(self, request, context):
try:
# Generate audio using ChatterboxTTS
kwargs = {}
if "language" in self.options:
kwargs["language_id"] = self.options["language"]
if self.AudioPath is not None:
wav = self.model.generate(request.text, audio_prompt_path=self.AudioPath)
else:
wav = self.model.generate(request.text)
kwargs["audio_prompt_path"] = self.AudioPath
# add options to kwargs
kwargs.update(self.options)
# Generate audio using ChatterboxTTS
wav = self.model.generate(request.text, **kwargs)
# Save the generated audio
ta.save(request.dst, wav, self.model.sr)

View File

@@ -15,5 +15,6 @@ fi
if [ "x${BUILD_PROFILE}" == "xintel" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
fi
EXTRA_PIP_INSTALL_FLAGS+=" --no-build-isolation"
installRequirements

View File

@@ -1,6 +1,8 @@
--extra-index-url https://download.pytorch.org/whl/cpu
accelerate
torch==2.6.0
torchaudio==2.6.0
transformers==4.46.3
chatterbox-tts==0.1.2
torch
torchaudio
transformers
# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
#chatterbox-tts==0.1.4

View File

@@ -2,5 +2,6 @@
torch==2.6.0+cu118
torchaudio==2.6.0+cu118
transformers==4.46.3
chatterbox-tts==0.1.2
# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
accelerate

View File

@@ -1,5 +1,6 @@
torch==2.6.0
torchaudio==2.6.0
transformers==4.46.3
chatterbox-tts==0.1.2
torch
torchaudio
transformers
# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
accelerate

View File

@@ -1,6 +1,7 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch==2.6.0+rocm6.1
torchaudio==2.6.0+rocm6.1
transformers==4.46.3
chatterbox-tts==0.1.2
transformers
# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
accelerate

View File

@@ -2,8 +2,9 @@
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
torchaudio==2.3.1+cxx11.abi
transformers==4.46.3
chatterbox-tts==0.1.2
transformers
# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
accelerate
oneccl_bind_pt==2.3.100+xpu
optimum[openvino]

View File

@@ -0,0 +1,6 @@
--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu126/
torch
torchaudio
transformers
chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
accelerate