feat(tts): support per-request instructions and params (#10172)

The OpenAI-compatible TTS endpoint accepts an `instructions` field, but it was silently dropped at the HTTP->gRPC boundary: neither schema.TTSRequest nor the gRPC TTSRequest proto carried it, so backends could only read such a value from static YAML options (identical for every request). This blocked per-line emotion/style and, for Qwen3-TTS VoiceDesign, limited a model config to a single designed voice.

Plumb a generic per-request instruction string end to end, plus an optional backend-specific params map:

- proto: add `optional string instructions` and `map<string,string> params` to TTSRequest.
- schema: add Instructions (maps OpenAI `instructions`) and Params (LocalAI extension) to schema.TTSRequest.
- core: thread both through ModelTTS/ModelTTSStream via a newTTSRequest helper that attaches instructions only when non-empty (so backends can fall back to YAML when unset); forward them from the /v1/audio/speech handler.
- qwen-tts: prefer the per-request instruction over the YAML `instruct` option (used by both mode detection and generation) and merge per-request params.
- chatterbox: merge per-request params (coerced to float/int/bool) over YAML options into generate() kwargs.

Fully backward compatible: empty instructions fall back to the YAML option and backends that don't support style/voice instructions ignore the field.

Closes #10164

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2026-08-02 11:30:44 -04:00 · 2026-06-04 11:45:02 +02:00
parent 55c0911c23
commit 27e63b9a78
15 changed files with 233 additions and 25 deletions
--- a/backend/python/chatterbox/backend.py
+++ b/backend/python/chatterbox/backend.py
@@ -37,6 +37,20 @@ def is_int(s):
    except ValueError:
        return False

+def coerce_param_value(value):
+    """Coerce a TTSRequest.params value (a string on the wire) to int, float or
+    bool for the Chatterbox generate() kwargs; other strings and non-string
+    values pass through. int is tried before float so "3" stays an int."""
+    if not isinstance(value, str):
+        return value
+    if is_int(value):
+        return int(value)
+    if is_float(value):
+        return float(value)
+    if value.lower() in ["true", "false"]:
+        return value.lower() == "true"
+    return value
+
 def split_text_at_word_boundary(text, max_length=250):
    """
    Split text at word boundaries without truncating words.
@@ -191,6 +205,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            # add options to kwargs
            kwargs.update(self.options)

+            # Merge per-request params (TTSRequest.params), overriding the static
+            # YAML options. This exposes Chatterbox generation knobs (e.g.
+            # exaggeration, cfg_weight, temperature) per request. Values arrive as
+            # strings on the wire and are coerced to float/int/bool.
+            if hasattr(request, "params") and request.params:
+                for key, value in request.params.items():
+                    kwargs[key] = coerce_param_value(value)
+
            # Check if text exceeds 250 characters
            # (chatterbox does not support long text)
            # https://github.com/resemble-ai/chatterbox/issues/60
--- a/backend/python/qwen-tts/backend.py
+++ b/backend/python/qwen-tts/backend.py
@@ -47,6 +47,26 @@ def is_int(s):
        return False


+def coerce_param_value(value):
+    """Turn a TTSRequest.params value (always a string on the wire) into the
+    narrowest matching Python type for the generation kwargs: bool, then int,
+    then float; anything else, including non-strings, is returned unchanged."""
+    if not isinstance(value, str):
+        return value
+    normalized = value.strip().lower()
+    if normalized == "true":
+        return True
+    if normalized == "false":
+        return False
+    # int goes first: every integer literal would also parse as a float.
+    for convert in (int, float):
+        try:
+            return convert(value)
+        except ValueError:
+            continue
+    return value
+
+
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24

 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
@@ -322,6 +342,19 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

        return backend_pb2.Result(message="Model loaded successfully", success=True)

+    def _effective_instruct(self, request):
+        """Return the instruction/style text that applies to this request.
+
+        A non-empty TTSRequest.instructions wins; otherwise the static YAML
+        `instruct` option is used. An empty string means "no instruction"."""
+        per_request = getattr(request, "instructions", "")
+        if per_request:
+            return per_request
+        # Nothing usable on the request (field absent or empty): fall back to
+        # the model-level option, normalising a missing/falsy value to "".
+        configured = self.options.get("instruct", "")
+        return configured if configured else ""
+
    def _detect_mode(self, request):
        """Detect which mode to use based on request parameters."""
        # Priority: VoiceClone > VoiceDesign > CustomVoice
@@ -338,8 +371,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        if self.audio_path or self.voices:
            return "VoiceClone"

-        # VoiceDesign: instruct option is provided
-        if "instruct" in self.options and self.options["instruct"]:
+        # VoiceDesign: instruct provided per-request or via YAML option
+        if self._effective_instruct(request):
            return "VoiceDesign"

        # Default to CustomVoice
@@ -690,10 +723,20 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            if do_sample is not None:
                generation_kwargs["do_sample"] = do_sample

-            instruct = self.options.get("instruct", "")
+            # Prefer the per-request instruction (TTSRequest.instructions) over the
+            # static YAML `instruct` option. This lets clients set a different style
+            # (CustomVoice emotion) or designed voice (VoiceDesign) per request.
+            instruct = self._effective_instruct(request)
            if instruct is not None and instruct != "":
                generation_kwargs["instruct"] = instruct

+            # Merge any per-request backend-specific params (TTSRequest.params).
+            # Values arrive as strings on the wire; coerce to int/float/bool so the
+            # model receives the types it expects. These override YAML-derived kwargs.
+            if hasattr(request, "params") and request.params:
+                for key, value in request.params.items():
+                    generation_kwargs[key] = coerce_param_value(value)
+
            # Generate audio based on mode
            if mode == "VoiceClone":
                # VoiceClone mode