feat(tts): support per-request instructions and params (#10172)

The OpenAI-compatible TTS endpoint accepts an `instructions` field, but it
was silently dropped at the HTTP->gRPC boundary: neither schema.TTSRequest
nor the gRPC TTSRequest proto carried it, so backends could only read such a
value from static YAML options (identical for every request). This blocked
per-line emotion/style and, for Qwen3-TTS VoiceDesign, limited a model config
to a single designed voice.

Plumb a generic per-request instruction string end to end, plus an optional
backend-specific params map:

- proto: add `optional string instructions` and `map<string,string> params`
  to TTSRequest.
- schema: add Instructions (maps OpenAI `instructions`) and Params (LocalAI
  extension) to schema.TTSRequest.
- core: thread both through ModelTTS/ModelTTSStream via a newTTSRequest helper
  that attaches instructions only when non-empty (so backends can fall back to
  YAML when unset); forward them from the /v1/audio/speech handler.
- qwen-tts: prefer the per-request instruction over the YAML `instruct` option
  (used by both mode detection and generation) and merge per-request params.
- chatterbox: merge per-request params (coerced to float/int/bool) over YAML
  options into generate() kwargs.

Fully backward compatible: empty instructions fall back to the YAML option and
backends that don't support style/voice instructions ignore the field.

Closes #10164


Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
LocalAI [bot]
2026-06-04 11:45:02 +02:00
committed by GitHub
parent 55c0911c23
commit 27e63b9a78
15 changed files with 233 additions and 25 deletions

View File

@@ -537,6 +537,15 @@ message TTSRequest {
string dst = 3;
string voice = 4;
optional string language = 5;
// instructions is a free-form, per-request style/voice description (maps to
// the OpenAI `instructions` field). Backends that support expressive synthesis
// (e.g. Qwen3-TTS CustomVoice/VoiceDesign) prefer this over the static YAML
// option when set; backends that don't simply ignore it.
optional string instructions = 6;
// params carries optional, backend-specific per-request generation parameters
// (e.g. Chatterbox exaggeration/cfg_weight/temperature). Values are strings and
// coerced by the backend; unset leaves the backend's configured defaults.
map<string, string> params = 7;
}
message VADRequest {

View File

@@ -37,6 +37,20 @@ def is_int(s):
except ValueError:
return False
def coerce_param_value(value):
"""Coerce a TTSRequest.params value (string on the wire) to the type the
Chatterbox generate() kwargs expect (float/int/bool), matching how static
YAML options are coerced at load time. Non-string values pass through."""
if not isinstance(value, str):
return value
if is_float(value):
return float(value)
if is_int(value):
return int(value)
if value.lower() in ["true", "false"]:
return value.lower() == "true"
return value
def split_text_at_word_boundary(text, max_length=250):
"""
Split text at word boundaries without truncating words.
@@ -191,6 +205,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
# add options to kwargs
kwargs.update(self.options)
# Merge per-request params (TTSRequest.params), overriding the static
# YAML options. This exposes Chatterbox generation knobs (e.g.
# exaggeration, cfg_weight, temperature) per request. Values arrive as
# strings on the wire and are coerced to float/int/bool.
if hasattr(request, "params") and request.params:
for key, value in request.params.items():
kwargs[key] = coerce_param_value(value)
# Check if text exceeds 250 characters
# (chatterbox does not support long text)
# https://github.com/resemble-ai/chatterbox/issues/60

View File

@@ -47,6 +47,26 @@ def is_int(s):
return False
def coerce_param_value(value):
"""Coerce a string param value (from the TTSRequest.params map, which is
string-typed on the wire) into the most specific Python type the model
generation kwargs expect: bool, int, float, else the original string."""
if not isinstance(value, str):
return value
lowered = value.strip().lower()
if lowered in ("true", "false"):
return lowered == "true"
try:
return int(value)
except ValueError:
pass
try:
return float(value)
except ValueError:
pass
return value
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
@@ -322,6 +342,19 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
return backend_pb2.Result(message="Model loaded successfully", success=True)
def _effective_instruct(self, request):
"""Resolve the instruction/style string for this request, preferring the
per-request TTSRequest.instructions value and falling back to the static
YAML `instruct` option. Empty string means "no instruction"."""
req_instruct = (
request.instructions
if hasattr(request, "instructions") and request.instructions
else ""
)
if req_instruct:
return req_instruct
return self.options.get("instruct", "") or ""
def _detect_mode(self, request):
"""Detect which mode to use based on request parameters."""
# Priority: VoiceClone > VoiceDesign > CustomVoice
@@ -338,8 +371,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if self.audio_path or self.voices:
return "VoiceClone"
# VoiceDesign: instruct option is provided
if "instruct" in self.options and self.options["instruct"]:
# VoiceDesign: instruct provided per-request or via YAML option
if self._effective_instruct(request):
return "VoiceDesign"
# Default to CustomVoice
@@ -690,10 +723,20 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if do_sample is not None:
generation_kwargs["do_sample"] = do_sample
instruct = self.options.get("instruct", "")
# Prefer the per-request instruction (TTSRequest.instructions) over the
# static YAML `instruct` option. This lets clients set a different style
# (CustomVoice emotion) or designed voice (VoiceDesign) per request.
instruct = self._effective_instruct(request)
if instruct is not None and instruct != "":
generation_kwargs["instruct"] = instruct
# Merge any per-request backend-specific params (TTSRequest.params).
# Values arrive as strings on the wire; coerce to int/float/bool so the
# model receives the types it expects. These override YAML-derived kwargs.
if hasattr(request, "params") and request.params:
for key, value in request.params.items():
generation_kwargs[key] = coerce_param_value(value)
# Generate audio based on mode
if mode == "VoiceClone":
# VoiceClone mode

View File

@@ -123,14 +123,14 @@ var _ = Describe("X-LocalAI-Node ctx propagation contract", func() {
})
It("ModelTTS forwards the request context to the SmartRouter", func() {
_, _, err := backend.ModelTTS(reqCtx, "hello", "", "", loader, appCfg, modelCfg)
_, _, err := backend.ModelTTS(reqCtx, "hello", "", "", "", nil, loader, appCfg, modelCfg)
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("router short-circuit (test)"))
stampViaRouterCtx()
})
It("ModelTTSStream forwards the request context to the SmartRouter", func() {
err := backend.ModelTTSStream(reqCtx, "hello", "", "", loader, appCfg, modelCfg, func([]byte) error { return nil })
err := backend.ModelTTSStream(reqCtx, "hello", "", "", "", nil, loader, appCfg, modelCfg, func([]byte) error { return nil })
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("router short-circuit (test)"))
stampViaRouterCtx()

View File

@@ -20,11 +20,32 @@ import (
"github.com/mudler/LocalAI/pkg/utils"
)
// newTTSRequest assembles the gRPC TTSRequest from the per-request inputs. The
// optional instructions string is only attached when non-empty so backends can
// distinguish "no per-request instruction" (fall back to YAML) from an explicit
// empty one. params is forwarded as-is (nil when unset).
func newTTSRequest(text, modelPath, voice, dst, language, instructions string, params map[string]string) *proto.TTSRequest {
req := &proto.TTSRequest{
Text: text,
Model: modelPath,
Voice: voice,
Dst: dst,
Language: &language,
Params: params,
}
if instructions != "" {
req.Instructions = &instructions
}
return req
}
func ModelTTS(
ctx context.Context,
text,
voice,
language string,
language,
instructions string,
params map[string]string,
loader *model.ModelLoader,
appConfig *config.ApplicationConfig,
modelConfig config.ModelConfig,
@@ -74,13 +95,9 @@ func ModelTTS(
startTime = time.Now()
}
res, err := ttsModel.TTS(ctx, &proto.TTSRequest{
Text: text,
Model: modelPath,
Voice: voice,
Dst: filePath,
Language: &language,
})
ttsRequest := newTTSRequest(text, modelPath, voice, filePath, language, instructions, params)
res, err := ttsModel.TTS(ctx, ttsRequest)
if appConfig.EnableTracing {
errStr := ""
@@ -128,7 +145,9 @@ func ModelTTSStream(
ctx context.Context,
text,
voice,
language string,
language,
instructions string,
params map[string]string,
loader *model.ModelLoader,
appConfig *config.ApplicationConfig,
modelConfig config.ModelConfig,
@@ -177,12 +196,10 @@ func ModelTTSStream(
var totalPCMBytes int
snippetCapped := false
err = ttsModel.TTSStream(ctx, &proto.TTSRequest{
Text: text,
Model: modelPath,
Voice: voice,
Language: &language,
}, func(reply *proto.Reply) {
// Streaming TTS writes to the HTTP response, not a file, so dst is empty.
ttsRequest := newTTSRequest(text, modelPath, voice, "", language, instructions, params)
err = ttsModel.TTSStream(ctx, ttsRequest, func(reply *proto.Reply) {
// First message contains sample rate info
if !headerSent && len(reply.Message) > 0 {
var info map[string]any

42
core/backend/tts_test.go Normal file
View File

@@ -0,0 +1,42 @@
package backend
// Specs for the TTSRequest assembly that carries the per-request
// instructions/params from the OpenAI `instructions` field (and the LocalAI
// `params` extension) through to the gRPC boundary. Before this plumbing the
// instruction value was dropped before reaching the backend; these specs pin
// that it now survives, and that the empty case stays backward compatible.
import (
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
var _ = Describe("newTTSRequest", func() {
It("attaches the instructions when a per-request value is set", func() {
req := newTTSRequest("hi", "/m", "alloy", "/out.wav", "en", "cheerful narrator", nil)
Expect(req.Instructions).ToNot(BeNil())
Expect(req.GetInstructions()).To(Equal("cheerful narrator"))
Expect(req.GetText()).To(Equal("hi"))
Expect(req.GetVoice()).To(Equal("alloy"))
Expect(req.GetDst()).To(Equal("/out.wav"))
Expect(req.GetLanguage()).To(Equal("en"))
})
It("leaves instructions unset when empty so backends fall back to YAML", func() {
req := newTTSRequest("hi", "/m", "", "/out.wav", "", "", nil)
Expect(req.Instructions).To(BeNil())
Expect(req.GetInstructions()).To(Equal(""))
})
It("forwards per-request params through to the backend", func() {
params := map[string]string{"exaggeration": "0.7", "cfg_weight": "0.3"}
req := newTTSRequest("hi", "/m", "", "/out.wav", "", "", params)
Expect(req.GetParams()).To(HaveKeyWithValue("exaggeration", "0.7"))
Expect(req.GetParams()).To(HaveKeyWithValue("cfg_weight", "0.3"))
})
It("leaves params nil when none are supplied", func() {
req := newTTSRequest("hi", "/m", "", "/out.wav", "", "", nil)
Expect(req.GetParams()).To(BeNil())
})
})

View File

@@ -62,7 +62,7 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
options.Backend = t.Backend
options.Model = t.Model
filePath, _, err := backend.ModelTTS(context.Background(), text, t.Voice, t.Language, ml, opts, options)
filePath, _, err := backend.ModelTTS(context.Background(), text, t.Voice, t.Language, "", nil, ml, opts, options)
if err != nil {
return err
}

View File

@@ -37,7 +37,7 @@ func TTSEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig
xlog.Debug("elevenlabs TTS request received", "modelName", input.ModelID)
filePath, _, err := backend.ModelTTS(c.Request().Context(), input.Text, voiceID, input.LanguageCode, ml, appConfig, *cfg)
filePath, _, err := backend.ModelTTS(c.Request().Context(), input.Text, voiceID, input.LanguageCode, "", nil, ml, appConfig, *cfg)
if err != nil {
return err
}

View File

@@ -59,7 +59,7 @@ func TTSEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig
c.Response().Header().Set("Connection", "keep-alive")
// Stream audio chunks as they're generated
err := backend.ModelTTSStream(c.Request().Context(), input.Input, cfg.Voice, cfg.Language, ml, appConfig, *cfg, func(audioChunk []byte) error {
err := backend.ModelTTSStream(c.Request().Context(), input.Input, cfg.Voice, cfg.Language, input.Instructions, input.Params, ml, appConfig, *cfg, func(audioChunk []byte) error {
_, writeErr := c.Response().Write(audioChunk)
if writeErr != nil {
return writeErr
@@ -75,7 +75,7 @@ func TTSEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig
}
// Non-streaming TTS (existing behavior)
filePath, _, err := backend.ModelTTS(c.Request().Context(), input.Input, cfg.Voice, cfg.Language, ml, appConfig, *cfg)
filePath, _, err := backend.ModelTTS(c.Request().Context(), input.Input, cfg.Voice, cfg.Language, input.Instructions, input.Params, ml, appConfig, *cfg)
if err != nil {
return err
}

View File

@@ -313,7 +313,7 @@ func newRealtimeDecisionID() string {
}
func (m *wrappedModel) TTS(ctx context.Context, text, voice, language string) (string, *proto.Result, error) {
return backend.ModelTTS(ctx, text, voice, language, m.modelLoader, m.appConfig, *m.TTSConfig)
return backend.ModelTTS(ctx, text, voice, language, "", nil, m.modelLoader, m.appConfig, *m.TTSConfig)
}
func (m *wrappedModel) PredictConfig() *config.ModelConfig {

View File

@@ -60,6 +60,14 @@ type TTSRequest struct {
Format string `json:"response_format,omitempty" yaml:"response_format,omitempty"` // (optional) output format
Stream bool `json:"stream,omitempty" yaml:"stream,omitempty"` // (optional) enable streaming TTS
SampleRate int `json:"sample_rate,omitempty" yaml:"sample_rate,omitempty"` // (optional) desired output sample rate
// Instructions is a free-form, per-request style/voice description. It maps to
// the OpenAI `instructions` field and is forwarded to the backend so expressive
// TTS models (e.g. Qwen3-TTS CustomVoice/VoiceDesign) can vary tone or designed
// voice per request instead of only via the static YAML option.
Instructions string `json:"instructions,omitempty" yaml:"instructions,omitempty"`
// Params carries optional, backend-specific per-request generation parameters
// (LocalAI extension, e.g. Chatterbox exaggeration/cfg_weight/temperature).
Params map[string]string `json:"params,omitempty" yaml:"params,omitempty"`
}
// @Description VAD request body

View File

@@ -337,6 +337,37 @@ curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
}' | aplay
```
#### Per-request instructions
Instead of (or in addition to) the static YAML `instruct` option, you can pass an
`instructions` string per request. It maps to the OpenAI
[`instructions`](https://platform.openai.com/docs/api-reference/audio/createSpeech) field
and takes precedence over the YAML option when set, falling back to it when empty. This lets
a single model config serve a different emotion (CustomVoice) or a different designed voice
(VoiceDesign) on every request - useful for roleplay/narration clients that need many voices:
```
curl http://localhost:8080/v1/audio/speech -H "Content-Type: application/json" -d '{
"model": "qwen-tts-design",
"input": "Hello world, this is a test.",
"instructions": "A calm, low-pitched elderly storyteller with a warm tone."
}' | aplay
```
Backends that do not support style/voice instructions simply ignore the field.
You can also pass backend-specific generation parameters per request via the LocalAI
`params` extension (a string-to-string map; values are coerced to the backend's expected
types). For example, with the Chatterbox backend:
```
curl http://localhost:8080/v1/audio/speech -H "Content-Type: application/json" -d '{
"model": "chatterbox",
"input": "Hello world, this is a test.",
"params": { "exaggeration": "0.7", "cfg_weight": "0.3", "temperature": "0.8" }
}' | aplay
```
#### Voice Clone Mode
Voice Clone allows you to clone a voice from reference audio. Configure the model with an `AudioPath` and optional `ref_text`:

View File

@@ -5897,6 +5897,10 @@ const docTemplate = `{
"description": "text input",
"type": "string"
},
"instructions": {
"description": "Instructions is a free-form, per-request style/voice description. It maps to\nthe OpenAI ` + "`" + `instructions` + "`" + ` field and is forwarded to the backend so expressive\nTTS models (e.g. Qwen3-TTS CustomVoice/VoiceDesign) can vary tone or designed\nvoice per request instead of only via the static YAML option.",
"type": "string"
},
"language": {
"description": "(optional) language to use with TTS model",
"type": "string"
@@ -5904,6 +5908,13 @@ const docTemplate = `{
"model": {
"type": "string"
},
"params": {
"description": "Params carries optional, backend-specific per-request generation parameters\n(LocalAI extension, e.g. Chatterbox exaggeration/cfg_weight/temperature).",
"type": "object",
"additionalProperties": {
"type": "string"
}
},
"response_format": {
"description": "(optional) output format",
"type": "string"

View File

@@ -5894,6 +5894,10 @@
"description": "text input",
"type": "string"
},
"instructions": {
"description": "Instructions is a free-form, per-request style/voice description. It maps to\nthe OpenAI `instructions` field and is forwarded to the backend so expressive\nTTS models (e.g. Qwen3-TTS CustomVoice/VoiceDesign) can vary tone or designed\nvoice per request instead of only via the static YAML option.",
"type": "string"
},
"language": {
"description": "(optional) language to use with TTS model",
"type": "string"
@@ -5901,6 +5905,13 @@
"model": {
"type": "string"
},
"params": {
"description": "Params carries optional, backend-specific per-request generation parameters\n(LocalAI extension, e.g. Chatterbox exaggeration/cfg_weight/temperature).",
"type": "object",
"additionalProperties": {
"type": "string"
}
},
"response_format": {
"description": "(optional) output format",
"type": "string"

View File

@@ -1996,11 +1996,25 @@ definitions:
input:
description: text input
type: string
instructions:
description: |-
Instructions is a free-form, per-request style/voice description. It maps to
the OpenAI `instructions` field and is forwarded to the backend so expressive
TTS models (e.g. Qwen3-TTS CustomVoice/VoiceDesign) can vary tone or designed
voice per request instead of only via the static YAML option.
type: string
language:
description: (optional) language to use with TTS model
type: string
model:
type: string
params:
additionalProperties:
type: string
description: |-
Params carries optional, backend-specific per-request generation parameters
(LocalAI extension, e.g. Chatterbox exaggeration/cfg_weight/temperature).
type: object
response_format:
description: (optional) output format
type: string