feat(tts): support per-request instructions and params (#10172)

The OpenAI-compatible TTS endpoint accepts an `instructions` field, but it
was silently dropped at the HTTP->gRPC boundary: neither schema.TTSRequest
nor the gRPC TTSRequest proto carried it, so backends could only read such a
value from static YAML options (identical for every request). This blocked
per-line emotion/style and, for Qwen3-TTS VoiceDesign, limited a model config
to a single designed voice.

Plumb a generic per-request instruction string end to end, plus an optional
backend-specific params map:

- proto: add `optional string instructions` and `map<string,string> params`
  to TTSRequest.
- schema: add Instructions (maps OpenAI `instructions`) and Params (LocalAI
  extension) to schema.TTSRequest.
- core: thread both through ModelTTS/ModelTTSStream via a newTTSRequest helper
  that attaches instructions only when non-empty (so backends can fall back to
  YAML when unset); forward them from the /v1/audio/speech handler.
- qwen-tts: prefer the per-request instruction over the YAML `instruct` option
  (used by both mode detection and generation) and merge per-request params.
- chatterbox: merge per-request params (coerced to float/int/bool) over YAML
  options into generate() kwargs.

Fully backward compatible: empty instructions fall back to the YAML option and
backends that don't support style/voice instructions ignore the field.

Closes #10164


Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
LocalAI [bot]
2026-06-04 11:45:02 +02:00
committed by GitHub
parent 55c0911c23
commit 27e63b9a78
15 changed files with 233 additions and 25 deletions

View File

@@ -37,6 +37,20 @@ def is_int(s):
except ValueError:
return False
def coerce_param_value(value):
    """Coerce a TTSRequest.params value into a typed generate() kwarg.

    Params are string-typed on the wire, so each value is converted to the
    most specific type it parses as: int, then float, then bool, otherwise
    the original string is returned unchanged.

    Args:
        value: The raw param value. Non-string values pass through untouched.

    Returns:
        An int, float, bool, or the original value.
    """
    if not isinstance(value, str):
        return value
    # int must be tried before float: every string that parses as an int also
    # parses as a float, so testing float first would turn "5" into 5.0 and
    # leave the int branch unreachable.
    try:
        return int(value)
    except ValueError:
        pass
    try:
        return float(value)
    except ValueError:
        pass
    lowered = value.lower()
    if lowered in ("true", "false"):
        return lowered == "true"
    return value
def split_text_at_word_boundary(text, max_length=250):
"""
Split text at word boundaries without truncating words.
@@ -191,6 +205,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
# add options to kwargs
kwargs.update(self.options)
# Merge per-request params (TTSRequest.params), overriding the static
# YAML options. This exposes Chatterbox generation knobs (e.g.
# exaggeration, cfg_weight, temperature) per request. Values arrive as
# strings on the wire and are coerced to float/int/bool.
if hasattr(request, "params") and request.params:
for key, value in request.params.items():
kwargs[key] = coerce_param_value(value)
# Check if text exceeds 250 characters
# (chatterbox does not support long text)
# https://github.com/resemble-ai/chatterbox/issues/60

View File

@@ -47,6 +47,26 @@ def is_int(s):
return False
def coerce_param_value(value):
    """Turn a string-typed TTSRequest.params value into a typed kwarg.

    The params map is string-typed on the wire. A string is converted to a
    bool when it reads "true"/"false" (case-insensitive, surrounding
    whitespace ignored), otherwise to an int, otherwise to a float. Anything
    that matches none of these, and any non-string input, is returned as is.
    """
    if isinstance(value, str):
        token = value.strip().lower()
        if token == "true":
            return True
        if token == "false":
            return False
        # Try the narrowest numeric type first so "5" stays an int.
        for convert in (int, float):
            try:
                return convert(value)
            except ValueError:
                continue
    return value
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
@@ -322,6 +342,19 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
return backend_pb2.Result(message="Model loaded successfully", success=True)
def _effective_instruct(self, request):
    """Return the instruction/style string that applies to this request.

    A non-empty ``request.instructions`` wins. Otherwise the ``instruct``
    entry of ``self.options`` is used. The result is always a string; an
    empty string means no instruction is set.
    """
    # A request object without an `instructions` attribute is treated the
    # same as one whose value is empty.
    per_request = getattr(request, "instructions", "")
    if per_request:
        return per_request
    configured = self.options.get("instruct", "")
    return configured if configured else ""
def _detect_mode(self, request):
"""Detect which mode to use based on request parameters."""
# Priority: VoiceClone > VoiceDesign > CustomVoice
@@ -338,8 +371,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if self.audio_path or self.voices:
return "VoiceClone"
# VoiceDesign: instruct option is provided
if "instruct" in self.options and self.options["instruct"]:
# VoiceDesign: instruct provided per-request or via YAML option
if self._effective_instruct(request):
return "VoiceDesign"
# Default to CustomVoice
@@ -690,10 +723,20 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if do_sample is not None:
generation_kwargs["do_sample"] = do_sample
instruct = self.options.get("instruct", "")
# Prefer the per-request instruction (TTSRequest.instructions) over the
# static YAML `instruct` option. This lets clients set a different style
# (CustomVoice emotion) or designed voice (VoiceDesign) per request.
instruct = self._effective_instruct(request)
if instruct is not None and instruct != "":
generation_kwargs["instruct"] = instruct
# Merge any per-request backend-specific params (TTSRequest.params).
# Values arrive as strings on the wire; coerce to int/float/bool so the
# model receives the types it expects. These override YAML-derived kwargs.
if hasattr(request, "params") and request.params:
for key, value in request.params.items():
generation_kwargs[key] = coerce_param_value(value)
# Generate audio based on mode
if mode == "VoiceClone":
# VoiceClone mode