feat(tts): support per-request instructions and params (#10172)

The OpenAI-compatible TTS endpoint accepts an `instructions` field, but it was silently dropped at the HTTP->gRPC boundary: neither schema.TTSRequest nor the gRPC TTSRequest proto carried it, so backends could only read such a value from static YAML options (identical for every request). This blocked per-line emotion/style and, for Qwen3-TTS VoiceDesign, limited a model config to a single designed voice.

Plumb a generic per-request instruction string end to end, plus an optional backend-specific params map:

- proto: add `optional string instructions` and `map<string,string> params` to TTSRequest.
- schema: add Instructions (maps OpenAI `instructions`) and Params (LocalAI extension) to schema.TTSRequest.
- core: thread both through ModelTTS/ModelTTSStream via a newTTSRequest helper that attaches instructions only when non-empty (so backends can fall back to YAML when unset); forward them from the /v1/audio/speech handler.
- qwen-tts: prefer the per-request instruction over the YAML `instruct` option (used by both mode detection and generation) and merge per-request params.
- chatterbox: merge per-request params (coerced to float/int/bool) over YAML options into generate() kwargs.

Fully backward compatible: empty instructions fall back to the YAML option and backends that don't support style/voice instructions ignore the field.

Closes #10164

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2026-07-30 09:57:57 -04:00 · 2026-06-04 11:45:02 +02:00
parent 55c0911c23
commit 27e63b9a78
15 changed files with 233 additions and 25 deletions
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -537,6 +537,15 @@ message TTSRequest {
  string dst = 3;
  string voice = 4;
  optional string language = 5;
+  // instructions is a free-form, per-request style/voice description (maps to
+  // the OpenAI `instructions` field). Backends that support expressive synthesis
+  // (e.g. Qwen3-TTS CustomVoice/VoiceDesign) prefer this over the static YAML
+  // option when set; backends that don't simply ignore it.
+  optional string instructions = 6;
+  // params carries optional, backend-specific per-request generation parameters
+  // (e.g. Chatterbox exaggeration/cfg_weight/temperature). Values are strings and
+  // coerced by the backend; unset leaves the backend's configured defaults.
+  map<string, string> params = 7;
 }

 message VADRequest {
--- a/backend/python/chatterbox/backend.py
+++ b/backend/python/chatterbox/backend.py
@@ -37,6 +37,20 @@ def is_int(s):
    except ValueError:
        return False

+def coerce_param_value(value):
+    """Coerce a per-request TTSRequest.params value into a typed Python value.
+
+    Values are strings on the wire; the Chatterbox generate() kwargs expect
+    float/int/bool, matching how static YAML options are coerced at load time.
+
+    Checks run in this order: is_float, is_int, then a case-insensitive
+    "true"/"false" match. Non-string values, and strings that match none of
+    the checks, are returned unchanged.
+    """
+    # Already typed (not a wire string): nothing to coerce.
+    if not isinstance(value, str):
+        return value
+    # NOTE(review): is_float is tried first. If it accepts integer strings
+    # such as "3", those become 3.0 and the is_int branch below never fires --
+    # confirm against is_float's definition.
+    if is_float(value):
+        return float(value)
+    if is_int(value):
+        return int(value)
+    # Booleans are matched case-insensitively ("True", "FALSE", ...).
+    if value.lower() in ["true", "false"]:
+        return value.lower() == "true"
+    # Anything else stays a plain string.
+    return value
+
 def split_text_at_word_boundary(text, max_length=250):
    """
    Split text at word boundaries without truncating words.
@@ -191,6 +205,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            # add options to kwargs
            kwargs.update(self.options)

+            # Merge per-request params (TTSRequest.params), overriding the static
+            # YAML options. This exposes Chatterbox generation knobs (e.g.
+            # exaggeration, cfg_weight, temperature) per request. Values arrive as
+            # strings on the wire and are coerced to float/int/bool.
+            if hasattr(request, "params") and request.params:
+                for key, value in request.params.items():
+                    kwargs[key] = coerce_param_value(value)
+
            # Check if text exceeds 250 characters
            # (chatterbox does not support long text)
            # https://github.com/resemble-ai/chatterbox/issues/60
--- a/backend/python/qwen-tts/backend.py
+++ b/backend/python/qwen-tts/backend.py
@@ -47,6 +47,26 @@ def is_int(s):
        return False


+def coerce_param_value(value):
+    """Coerce a TTSRequest.params value into the most specific Python type.
+
+    The params map is string-typed on the wire, while the model generation
+    kwargs expect typed values. Candidates are tried in this order: bool,
+    int, float. A string that matches none of them is returned as given
+    (not stripped). Non-string values are returned unchanged.
+    """
+    # Already typed (not a wire string): nothing to coerce.
+    if not isinstance(value, str):
+        return value
+    # Booleans are matched on the trimmed, lowercased text.
+    lowered = value.strip().lower()
+    if lowered in ("true", "false"):
+        return lowered == "true"
+    # int is tried before float so that "3" stays an int instead of 3.0.
+    try:
+        return int(value)
+    except ValueError:
+        pass
+    # NOTE(review): float() also parses "nan" and "inf"; confirm whether such
+    # values should be allowed through to the model.
+    try:
+        return float(value)
+    except ValueError:
+        pass
+    return value
+
+
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24

 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
@@ -322,6 +342,19 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

        return backend_pb2.Result(message="Model loaded successfully", success=True)

+    def _effective_instruct(self, request):
+        """Resolve the instruction/style string to use for this request.
+
+        A non-empty per-request TTSRequest.instructions value wins; otherwise
+        the static YAML `instruct` option from self.options is used.
+
+        Returns:
+            The resolved instruction string. An empty string means
+            "no instruction".
+        """
+        # hasattr guards against request objects that lack the field --
+        # presumably stubs generated before `instructions` was added to the
+        # proto; TODO confirm.
+        req_instruct = (
+            request.instructions
+            if hasattr(request, "instructions") and request.instructions
+            else ""
+        )
+        if req_instruct:
+            return req_instruct
+        # `or ""` turns a falsy option value (e.g. None) into an empty string
+        # so the method always returns a str.
+        return self.options.get("instruct", "") or ""
+
    def _detect_mode(self, request):
        """Detect which mode to use based on request parameters."""
        # Priority: VoiceClone > VoiceDesign > CustomVoice
@@ -338,8 +371,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        if self.audio_path or self.voices:
            return "VoiceClone"

-        # VoiceDesign: instruct option is provided
-        if "instruct" in self.options and self.options["instruct"]:
+        # VoiceDesign: instruct provided per-request or via YAML option
+        if self._effective_instruct(request):
            return "VoiceDesign"

        # Default to CustomVoice
@@ -690,10 +723,20 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            if do_sample is not None:
                generation_kwargs["do_sample"] = do_sample

-            instruct = self.options.get("instruct", "")
+            # Prefer the per-request instruction (TTSRequest.instructions) over the
+            # static YAML `instruct` option. This lets clients set a different style
+            # (CustomVoice emotion) or designed voice (VoiceDesign) per request.
+            instruct = self._effective_instruct(request)
            if instruct is not None and instruct != "":
                generation_kwargs["instruct"] = instruct

+            # Merge any per-request backend-specific params (TTSRequest.params).
+            # Values arrive as strings on the wire; coerce to int/float/bool so the
+            # model receives the types it expects. These override YAML-derived kwargs.
+            if hasattr(request, "params") and request.params:
+                for key, value in request.params.items():
+                    generation_kwargs[key] = coerce_param_value(value)
+
            # Generate audio based on mode
            if mode == "VoiceClone":
                # VoiceClone mode
--- a/core/backend/ctx_propagation_test.go
+++ b/core/backend/ctx_propagation_test.go
@@ -123,14 +123,14 @@ var _ = Describe("X-LocalAI-Node ctx propagation contract", func() {
 	})

 	It("ModelTTS forwards the request context to the SmartRouter", func() {
-		_, _, err := backend.ModelTTS(reqCtx, "hello", "", "", loader, appCfg, modelCfg)
+		_, _, err := backend.ModelTTS(reqCtx, "hello", "", "", "", nil, loader, appCfg, modelCfg)
 		Expect(err).To(HaveOccurred())
 		Expect(err.Error()).To(ContainSubstring("router short-circuit (test)"))
 		stampViaRouterCtx()
 	})

 	It("ModelTTSStream forwards the request context to the SmartRouter", func() {
-		err := backend.ModelTTSStream(reqCtx, "hello", "", "", loader, appCfg, modelCfg, func([]byte) error { return nil })
+		err := backend.ModelTTSStream(reqCtx, "hello", "", "", "", nil, loader, appCfg, modelCfg, func([]byte) error { return nil })
 		Expect(err).To(HaveOccurred())
 		Expect(err.Error()).To(ContainSubstring("router short-circuit (test)"))
 		stampViaRouterCtx()
--- a/core/backend/tts.go
+++ b/core/backend/tts.go
@@ -20,11 +20,32 @@ import (
 	"github.com/mudler/LocalAI/pkg/utils"
 )

+// newTTSRequest assembles the gRPC TTSRequest from the per-request inputs.
+//
+// Instructions is an optional (pointer) field and is only attached when the
+// given string is non-empty, so an empty per-request instruction reaches the
+// backend as "unset" and the backend can fall back to its YAML option.
+// Language is always sent as a pointer to the given string, even when that
+// string is empty. params is forwarded as-is (nil when unset).
+func newTTSRequest(text, modelPath, voice, dst, language, instructions string, params map[string]string) *proto.TTSRequest {
+	req := &proto.TTSRequest{
+		Text:     text,
+		Model:    modelPath,
+		Voice:    voice,
+		Dst:      dst,
+		Language: &language,
+		Params:   params,
+	}
+	// Leave Instructions nil for an empty string rather than pointing at "".
+	if instructions != "" {
+		req.Instructions = &instructions
+	}
+	return req
+}
+
 func ModelTTS(
 	ctx context.Context,
 	text,
 	voice,
-	language string,
+	language,
+	instructions string,
+	params map[string]string,
 	loader *model.ModelLoader,
 	appConfig *config.ApplicationConfig,
 	modelConfig config.ModelConfig,
@@ -74,13 +95,9 @@ func ModelTTS(
 		startTime = time.Now()
 	}

-	res, err := ttsModel.TTS(ctx, &proto.TTSRequest{
-		Text:     text,
-		Model:    modelPath,
-		Voice:    voice,
-		Dst:      filePath,
-		Language: &language,
-	})
+	ttsRequest := newTTSRequest(text, modelPath, voice, filePath, language, instructions, params)
+
+	res, err := ttsModel.TTS(ctx, ttsRequest)

 	if appConfig.EnableTracing {
 		errStr := ""
@@ -128,7 +145,9 @@ func ModelTTSStream(
 	ctx context.Context,
 	text,
 	voice,
-	language string,
+	language,
+	instructions string,
+	params map[string]string,
 	loader *model.ModelLoader,
 	appConfig *config.ApplicationConfig,
 	modelConfig config.ModelConfig,
@@ -177,12 +196,10 @@ func ModelTTSStream(
 	var totalPCMBytes int
 	snippetCapped := false

-	err = ttsModel.TTSStream(ctx, &proto.TTSRequest{
-		Text:     text,
-		Model:    modelPath,
-		Voice:    voice,
-		Language: &language,
-	}, func(reply *proto.Reply) {
+	// Streaming TTS writes to the HTTP response, not a file, so dst is empty.
+	ttsRequest := newTTSRequest(text, modelPath, voice, "", language, instructions, params)
+
+	err = ttsModel.TTSStream(ctx, ttsRequest, func(reply *proto.Reply) {
 		// First message contains sample rate info
 		if !headerSent && len(reply.Message) > 0 {
 			var info map[string]any
--- a/core/backend/tts_test.go
+++ b/core/backend/tts_test.go
@@ -0,0 +1,42 @@
+package backend
+
+// Specs for the TTSRequest assembly that carries the per-request
+// instructions/params from the OpenAI `instructions` field (and the LocalAI
+// `params` extension) through to the gRPC boundary. Before this plumbing the
+// instruction value was dropped before reaching the backend; these specs pin
+// that it now survives, and that the empty case stays backward compatible.
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Specs for newTTSRequest: field pass-through, presence of the optional
+// Instructions pointer, and forwarding of the params map.
+var _ = Describe("newTTSRequest", func() {
+	It("attaches the instructions when a per-request value is set", func() {
+		req := newTTSRequest("hi", "/m", "alloy", "/out.wav", "en", "cheerful narrator", nil)
+		// Instructions is a pointer field: check presence as well as value.
+		Expect(req.Instructions).ToNot(BeNil())
+		Expect(req.GetInstructions()).To(Equal("cheerful narrator"))
+		Expect(req.GetText()).To(Equal("hi"))
+		Expect(req.GetVoice()).To(Equal("alloy"))
+		Expect(req.GetDst()).To(Equal("/out.wav"))
+		Expect(req.GetLanguage()).To(Equal("en"))
+	})
+
+	It("leaves instructions unset when empty so backends fall back to YAML", func() {
+		req := newTTSRequest("hi", "/m", "", "/out.wav", "", "", nil)
+		// The pointer itself must be nil, not a pointer to "".
+		Expect(req.Instructions).To(BeNil())
+		Expect(req.GetInstructions()).To(Equal(""))
+	})
+
+	It("forwards per-request params through to the backend", func() {
+		params := map[string]string{"exaggeration": "0.7", "cfg_weight": "0.3"}
+		req := newTTSRequest("hi", "/m", "", "/out.wav", "", "", params)
+		Expect(req.GetParams()).To(HaveKeyWithValue("exaggeration", "0.7"))
+		Expect(req.GetParams()).To(HaveKeyWithValue("cfg_weight", "0.3"))
+	})
+
+	It("leaves params nil when none are supplied", func() {
+		req := newTTSRequest("hi", "/m", "", "/out.wav", "", "", nil)
+		// A nil input map must stay nil rather than become an empty map.
+		Expect(req.GetParams()).To(BeNil())
+	})
+})
--- a/core/cli/tts.go
+++ b/core/cli/tts.go
@@ -62,7 +62,7 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
 	options.Backend = t.Backend
 	options.Model = t.Model

-	filePath, _, err := backend.ModelTTS(context.Background(), text, t.Voice, t.Language, ml, opts, options)
+	filePath, _, err := backend.ModelTTS(context.Background(), text, t.Voice, t.Language, "", nil, ml, opts, options)
 	if err != nil {
 		return err
 	}
--- a/core/http/endpoints/elevenlabs/tts.go
+++ b/core/http/endpoints/elevenlabs/tts.go
@@ -37,7 +37,7 @@ func TTSEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig

 		xlog.Debug("elevenlabs TTS request received", "modelName", input.ModelID)

-		filePath, _, err := backend.ModelTTS(c.Request().Context(), input.Text, voiceID, input.LanguageCode, ml, appConfig, *cfg)
+		filePath, _, err := backend.ModelTTS(c.Request().Context(), input.Text, voiceID, input.LanguageCode, "", nil, ml, appConfig, *cfg)
 		if err != nil {
 			return err
 		}
--- a/core/http/endpoints/localai/tts.go
+++ b/core/http/endpoints/localai/tts.go
@@ -59,7 +59,7 @@ func TTSEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig
 			c.Response().Header().Set("Connection", "keep-alive")

 			// Stream audio chunks as they're generated
-			err := backend.ModelTTSStream(c.Request().Context(), input.Input, cfg.Voice, cfg.Language, ml, appConfig, *cfg, func(audioChunk []byte) error {
+			err := backend.ModelTTSStream(c.Request().Context(), input.Input, cfg.Voice, cfg.Language, input.Instructions, input.Params, ml, appConfig, *cfg, func(audioChunk []byte) error {
 				_, writeErr := c.Response().Write(audioChunk)
 				if writeErr != nil {
 					return writeErr
@@ -75,7 +75,7 @@ func TTSEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig
 		}

 		// Non-streaming TTS (existing behavior)
-		filePath, _, err := backend.ModelTTS(c.Request().Context(), input.Input, cfg.Voice, cfg.Language, ml, appConfig, *cfg)
+		filePath, _, err := backend.ModelTTS(c.Request().Context(), input.Input, cfg.Voice, cfg.Language, input.Instructions, input.Params, ml, appConfig, *cfg)
 		if err != nil {
 			return err
 		}
--- a/core/http/endpoints/openai/realtime_model.go
+++ b/core/http/endpoints/openai/realtime_model.go
@@ -313,7 +313,7 @@ func newRealtimeDecisionID() string {
 }

 func (m *wrappedModel) TTS(ctx context.Context, text, voice, language string) (string, *proto.Result, error) {
-	return backend.ModelTTS(ctx, text, voice, language, m.modelLoader, m.appConfig, *m.TTSConfig)
+	return backend.ModelTTS(ctx, text, voice, language, "", nil, m.modelLoader, m.appConfig, *m.TTSConfig)
 }

 func (m *wrappedModel) PredictConfig() *config.ModelConfig {
--- a/core/schema/localai.go
+++ b/core/schema/localai.go
@@ -60,6 +60,14 @@ type TTSRequest struct {
 	Format   string `json:"response_format,omitempty" yaml:"response_format,omitempty"` // (optional) output format
 	Stream     bool   `json:"stream,omitempty" yaml:"stream,omitempty"`                   // (optional) enable streaming TTS
 	SampleRate int    `json:"sample_rate,omitempty" yaml:"sample_rate,omitempty"`         // (optional) desired output sample rate
+	// Instructions is a free-form, per-request style/voice description. It maps to
+	// the OpenAI `instructions` field and is forwarded to the backend so expressive
+	// TTS models (e.g. Qwen3-TTS CustomVoice/VoiceDesign) can vary tone or designed
+	// voice per request instead of only via the static YAML option.
+	Instructions string `json:"instructions,omitempty" yaml:"instructions,omitempty"`
+	// Params carries optional, backend-specific per-request generation parameters
+	// (LocalAI extension, e.g. Chatterbox exaggeration/cfg_weight/temperature).
+	Params map[string]string `json:"params,omitempty" yaml:"params,omitempty"`
 }

 // @Description VAD request body
--- a/docs/content/features/text-to-audio.md
+++ b/docs/content/features/text-to-audio.md
@@ -337,6 +337,37 @@ curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
   }' | aplay
 ```

+#### Per-request instructions
+
+Instead of (or in addition to) the static YAML `instruct` option, you can pass an
+`instructions` string per request. It maps to the OpenAI
+[`instructions`](https://platform.openai.com/docs/api-reference/audio/createSpeech) field
+and takes precedence over the YAML option when set, falling back to it when empty. This lets
+a single model config serve a different emotion (CustomVoice) or a different designed voice
+(VoiceDesign) on every request - useful for roleplay/narration clients that need many voices:
+
+```
+curl http://localhost:8080/v1/audio/speech -H "Content-Type: application/json" -d '{
+     "model": "qwen-tts-design",
+     "input": "Hello world, this is a test.",
+     "instructions": "A calm, low-pitched elderly storyteller with a warm tone."
+   }' | aplay
+```
+
+Backends that do not support style/voice instructions simply ignore the field.
+
+You can also pass backend-specific generation parameters per request via the LocalAI
+`params` extension (a string-to-string map; values are coerced to the backend's expected
+types). For example, with the Chatterbox backend:
+
+```
+curl http://localhost:8080/v1/audio/speech -H "Content-Type: application/json" -d '{
+     "model": "chatterbox",
+     "input": "Hello world, this is a test.",
+     "params": { "exaggeration": "0.7", "cfg_weight": "0.3", "temperature": "0.8" }
+   }' | aplay
+```
+
 #### Voice Clone Mode

 Voice Clone allows you to clone a voice from reference audio. Configure the model with an `AudioPath` and optional `ref_text`:
--- a/swagger/docs.go
+++ b/swagger/docs.go
@@ -5897,6 +5897,10 @@ const docTemplate = `{
                    "description": "text input",
                    "type": "string"
                },
+                "instructions": {
+                    "description": "Instructions is a free-form, per-request style/voice description. It maps to\nthe OpenAI ` + "`" + `instructions` + "`" + ` field and is forwarded to the backend so expressive\nTTS models (e.g. Qwen3-TTS CustomVoice/VoiceDesign) can vary tone or designed\nvoice per request instead of only via the static YAML option.",
+                    "type": "string"
+                },
                "language": {
                    "description": "(optional) language to use with TTS model",
                    "type": "string"
@@ -5904,6 +5908,13 @@ const docTemplate = `{
                "model": {
                    "type": "string"
                },
+                "params": {
+                    "description": "Params carries optional, backend-specific per-request generation parameters\n(LocalAI extension, e.g. Chatterbox exaggeration/cfg_weight/temperature).",
+                    "type": "object",
+                    "additionalProperties": {
+                        "type": "string"
+                    }
+                },
                "response_format": {
                    "description": "(optional) output format",
                    "type": "string"
--- a/swagger/swagger.json
+++ b/swagger/swagger.json
@@ -5894,6 +5894,10 @@
                    "description": "text input",
                    "type": "string"
                },
+                "instructions": {
+                    "description": "Instructions is a free-form, per-request style/voice description. It maps to\nthe OpenAI `instructions` field and is forwarded to the backend so expressive\nTTS models (e.g. Qwen3-TTS CustomVoice/VoiceDesign) can vary tone or designed\nvoice per request instead of only via the static YAML option.",
+                    "type": "string"
+                },
                "language": {
                    "description": "(optional) language to use with TTS model",
                    "type": "string"
@@ -5901,6 +5905,13 @@
                "model": {
                    "type": "string"
                },
+                "params": {
+                    "description": "Params carries optional, backend-specific per-request generation parameters\n(LocalAI extension, e.g. Chatterbox exaggeration/cfg_weight/temperature).",
+                    "type": "object",
+                    "additionalProperties": {
+                        "type": "string"
+                    }
+                },
                "response_format": {
                    "description": "(optional) output format",
                    "type": "string"
--- a/swagger/swagger.yaml
+++ b/swagger/swagger.yaml
@@ -1996,11 +1996,25 @@ definitions:
      input:
        description: text input
        type: string
+      instructions:
+        description: |-
+          Instructions is a free-form, per-request style/voice description. It maps to
+          the OpenAI `instructions` field and is forwarded to the backend so expressive
+          TTS models (e.g. Qwen3-TTS CustomVoice/VoiceDesign) can vary tone or designed
+          voice per request instead of only via the static YAML option.
+        type: string
      language:
        description: (optional) language to use with TTS model
        type: string
      model:
        type: string
+      params:
+        additionalProperties:
+          type: string
+        description: |-
+          Params carries optional, backend-specific per-request generation parameters
+          (LocalAI extension, e.g. Chatterbox exaggeration/cfg_weight/temperature).
+        type: object
      response_format:
        description: (optional) output format
        type: string