diff --git a/backend/backend.proto b/backend/backend.proto
index 8a0c8e696..3dca83878 100644
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -537,6 +537,15 @@ message TTSRequest {
   string dst = 3;
   string voice = 4;
   optional string language = 5;
+  // instructions is a free-form, per-request style/voice description (maps to
+  // the OpenAI `instructions` field). Backends that support expressive synthesis
+  // (e.g. Qwen3-TTS CustomVoice/VoiceDesign) prefer this over the static YAML
+  // option when set; backends that don't simply ignore it.
+  optional string instructions = 6;
+  // params carries optional, backend-specific per-request generation parameters
+  // (e.g. Chatterbox exaggeration/cfg_weight/temperature). Values are strings and
+  // coerced by the backend; unset leaves the backend's configured defaults.
+  map<string, string> params = 7;
 }
 
 message VADRequest {
diff --git a/backend/python/chatterbox/backend.py b/backend/python/chatterbox/backend.py
index 4dffeb95e..434f0faa5 100644
--- a/backend/python/chatterbox/backend.py
+++ b/backend/python/chatterbox/backend.py
@@ -37,6 +37,20 @@ def is_int(s):
     except ValueError:
         return False
 
+def coerce_param_value(value):
+    """Coerce a TTSRequest.params value (string on the wire) to the type the
+    Chatterbox generate() kwargs expect: int, then float, then bool, else the
+    original string. Non-string values pass through unchanged."""
+    if not isinstance(value, str):
+        return value
+    if is_int(value):  # int first: float("5") also parses, so float-first shadows ints
+        return int(value)
+    if is_float(value):
+        return float(value)
+    if value.lower() in ["true", "false"]:
+        return value.lower() == "true"
+    return value
+
 def split_text_at_word_boundary(text, max_length=250):
     """
     Split text at word boundaries without truncating words.
@@ -191,6 +205,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             # add options to kwargs
             kwargs.update(self.options)
 
+            # Merge per-request params (TTSRequest.params), overriding the static
+            # YAML options.
This exposes Chatterbox generation knobs (e.g. + # exaggeration, cfg_weight, temperature) per request. Values arrive as + # strings on the wire and are coerced to float/int/bool. + if hasattr(request, "params") and request.params: + for key, value in request.params.items(): + kwargs[key] = coerce_param_value(value) + # Check if text exceeds 250 characters # (chatterbox does not support long text) # https://github.com/resemble-ai/chatterbox/issues/60 diff --git a/backend/python/qwen-tts/backend.py b/backend/python/qwen-tts/backend.py index f24533966..c73155008 100644 --- a/backend/python/qwen-tts/backend.py +++ b/backend/python/qwen-tts/backend.py @@ -47,6 +47,26 @@ def is_int(s): return False +def coerce_param_value(value): + """Coerce a string param value (from the TTSRequest.params map, which is + string-typed on the wire) into the most specific Python type the model + generation kwargs expect: bool, int, float, else the original string.""" + if not isinstance(value, str): + return value + lowered = value.strip().lower() + if lowered in ("true", "false"): + return lowered == "true" + try: + return int(value) + except ValueError: + pass + try: + return float(value) + except ValueError: + pass + return value + + _ONE_DAY_IN_SECONDS = 60 * 60 * 24 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1 @@ -322,6 +342,19 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): return backend_pb2.Result(message="Model loaded successfully", success=True) + def _effective_instruct(self, request): + """Resolve the instruction/style string for this request, preferring the + per-request TTSRequest.instructions value and falling back to the static + YAML `instruct` option. 
Empty string means "no instruction".""" + req_instruct = ( + request.instructions + if hasattr(request, "instructions") and request.instructions + else "" + ) + if req_instruct: + return req_instruct + return self.options.get("instruct", "") or "" + def _detect_mode(self, request): """Detect which mode to use based on request parameters.""" # Priority: VoiceClone > VoiceDesign > CustomVoice @@ -338,8 +371,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): if self.audio_path or self.voices: return "VoiceClone" - # VoiceDesign: instruct option is provided - if "instruct" in self.options and self.options["instruct"]: + # VoiceDesign: instruct provided per-request or via YAML option + if self._effective_instruct(request): return "VoiceDesign" # Default to CustomVoice @@ -690,10 +723,20 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): if do_sample is not None: generation_kwargs["do_sample"] = do_sample - instruct = self.options.get("instruct", "") + # Prefer the per-request instruction (TTSRequest.instructions) over the + # static YAML `instruct` option. This lets clients set a different style + # (CustomVoice emotion) or designed voice (VoiceDesign) per request. + instruct = self._effective_instruct(request) if instruct is not None and instruct != "": generation_kwargs["instruct"] = instruct + # Merge any per-request backend-specific params (TTSRequest.params). + # Values arrive as strings on the wire; coerce to int/float/bool so the + # model receives the types it expects. These override YAML-derived kwargs. 
+ if hasattr(request, "params") and request.params: + for key, value in request.params.items(): + generation_kwargs[key] = coerce_param_value(value) + # Generate audio based on mode if mode == "VoiceClone": # VoiceClone mode diff --git a/core/backend/ctx_propagation_test.go b/core/backend/ctx_propagation_test.go index f95549ce9..34f269aa3 100644 --- a/core/backend/ctx_propagation_test.go +++ b/core/backend/ctx_propagation_test.go @@ -123,14 +123,14 @@ var _ = Describe("X-LocalAI-Node ctx propagation contract", func() { }) It("ModelTTS forwards the request context to the SmartRouter", func() { - _, _, err := backend.ModelTTS(reqCtx, "hello", "", "", loader, appCfg, modelCfg) + _, _, err := backend.ModelTTS(reqCtx, "hello", "", "", "", nil, loader, appCfg, modelCfg) Expect(err).To(HaveOccurred()) Expect(err.Error()).To(ContainSubstring("router short-circuit (test)")) stampViaRouterCtx() }) It("ModelTTSStream forwards the request context to the SmartRouter", func() { - err := backend.ModelTTSStream(reqCtx, "hello", "", "", loader, appCfg, modelCfg, func([]byte) error { return nil }) + err := backend.ModelTTSStream(reqCtx, "hello", "", "", "", nil, loader, appCfg, modelCfg, func([]byte) error { return nil }) Expect(err).To(HaveOccurred()) Expect(err.Error()).To(ContainSubstring("router short-circuit (test)")) stampViaRouterCtx() diff --git a/core/backend/tts.go b/core/backend/tts.go index 3ddc159d8..2b49149ae 100644 --- a/core/backend/tts.go +++ b/core/backend/tts.go @@ -20,11 +20,32 @@ import ( "github.com/mudler/LocalAI/pkg/utils" ) +// newTTSRequest assembles the gRPC TTSRequest from the per-request inputs. The +// optional instructions string is only attached when non-empty so backends can +// distinguish "no per-request instruction" (fall back to YAML) from an explicit +// empty one. params is forwarded as-is (nil when unset). 
+func newTTSRequest(text, modelPath, voice, dst, language, instructions string, params map[string]string) *proto.TTSRequest { + req := &proto.TTSRequest{ + Text: text, + Model: modelPath, + Voice: voice, + Dst: dst, + Language: &language, + Params: params, + } + if instructions != "" { + req.Instructions = &instructions + } + return req +} + func ModelTTS( ctx context.Context, text, voice, - language string, + language, + instructions string, + params map[string]string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, modelConfig config.ModelConfig, @@ -74,13 +95,9 @@ func ModelTTS( startTime = time.Now() } - res, err := ttsModel.TTS(ctx, &proto.TTSRequest{ - Text: text, - Model: modelPath, - Voice: voice, - Dst: filePath, - Language: &language, - }) + ttsRequest := newTTSRequest(text, modelPath, voice, filePath, language, instructions, params) + + res, err := ttsModel.TTS(ctx, ttsRequest) if appConfig.EnableTracing { errStr := "" @@ -128,7 +145,9 @@ func ModelTTSStream( ctx context.Context, text, voice, - language string, + language, + instructions string, + params map[string]string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, modelConfig config.ModelConfig, @@ -177,12 +196,10 @@ func ModelTTSStream( var totalPCMBytes int snippetCapped := false - err = ttsModel.TTSStream(ctx, &proto.TTSRequest{ - Text: text, - Model: modelPath, - Voice: voice, - Language: &language, - }, func(reply *proto.Reply) { + // Streaming TTS writes to the HTTP response, not a file, so dst is empty. 
+ ttsRequest := newTTSRequest(text, modelPath, voice, "", language, instructions, params) + + err = ttsModel.TTSStream(ctx, ttsRequest, func(reply *proto.Reply) { // First message contains sample rate info if !headerSent && len(reply.Message) > 0 { var info map[string]any diff --git a/core/backend/tts_test.go b/core/backend/tts_test.go new file mode 100644 index 000000000..8c3f35598 --- /dev/null +++ b/core/backend/tts_test.go @@ -0,0 +1,42 @@ +package backend + +// Specs for the TTSRequest assembly that carries the per-request +// instructions/params from the OpenAI `instructions` field (and the LocalAI +// `params` extension) through to the gRPC boundary. Before this plumbing the +// instruction value was dropped before reaching the backend; these specs pin +// that it now survives, and that the empty case stays backward compatible. + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("newTTSRequest", func() { + It("attaches the instructions when a per-request value is set", func() { + req := newTTSRequest("hi", "/m", "alloy", "/out.wav", "en", "cheerful narrator", nil) + Expect(req.Instructions).ToNot(BeNil()) + Expect(req.GetInstructions()).To(Equal("cheerful narrator")) + Expect(req.GetText()).To(Equal("hi")) + Expect(req.GetVoice()).To(Equal("alloy")) + Expect(req.GetDst()).To(Equal("/out.wav")) + Expect(req.GetLanguage()).To(Equal("en")) + }) + + It("leaves instructions unset when empty so backends fall back to YAML", func() { + req := newTTSRequest("hi", "/m", "", "/out.wav", "", "", nil) + Expect(req.Instructions).To(BeNil()) + Expect(req.GetInstructions()).To(Equal("")) + }) + + It("forwards per-request params through to the backend", func() { + params := map[string]string{"exaggeration": "0.7", "cfg_weight": "0.3"} + req := newTTSRequest("hi", "/m", "", "/out.wav", "", "", params) + Expect(req.GetParams()).To(HaveKeyWithValue("exaggeration", "0.7")) + Expect(req.GetParams()).To(HaveKeyWithValue("cfg_weight", 
"0.3")) + }) + + It("leaves params nil when none are supplied", func() { + req := newTTSRequest("hi", "/m", "", "/out.wav", "", "", nil) + Expect(req.GetParams()).To(BeNil()) + }) +}) diff --git a/core/cli/tts.go b/core/cli/tts.go index 0f7b8bc6c..6c3aa6146 100644 --- a/core/cli/tts.go +++ b/core/cli/tts.go @@ -62,7 +62,7 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error { options.Backend = t.Backend options.Model = t.Model - filePath, _, err := backend.ModelTTS(context.Background(), text, t.Voice, t.Language, ml, opts, options) + filePath, _, err := backend.ModelTTS(context.Background(), text, t.Voice, t.Language, "", nil, ml, opts, options) if err != nil { return err } diff --git a/core/http/endpoints/elevenlabs/tts.go b/core/http/endpoints/elevenlabs/tts.go index 110ae292a..c668eea83 100644 --- a/core/http/endpoints/elevenlabs/tts.go +++ b/core/http/endpoints/elevenlabs/tts.go @@ -37,7 +37,7 @@ func TTSEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig xlog.Debug("elevenlabs TTS request received", "modelName", input.ModelID) - filePath, _, err := backend.ModelTTS(c.Request().Context(), input.Text, voiceID, input.LanguageCode, ml, appConfig, *cfg) + filePath, _, err := backend.ModelTTS(c.Request().Context(), input.Text, voiceID, input.LanguageCode, "", nil, ml, appConfig, *cfg) if err != nil { return err } diff --git a/core/http/endpoints/localai/tts.go b/core/http/endpoints/localai/tts.go index fe9199c24..4f2a7324c 100644 --- a/core/http/endpoints/localai/tts.go +++ b/core/http/endpoints/localai/tts.go @@ -59,7 +59,7 @@ func TTSEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig c.Response().Header().Set("Connection", "keep-alive") // Stream audio chunks as they're generated - err := backend.ModelTTSStream(c.Request().Context(), input.Input, cfg.Voice, cfg.Language, ml, appConfig, *cfg, func(audioChunk []byte) error { + err := backend.ModelTTSStream(c.Request().Context(), input.Input, cfg.Voice, cfg.Language, 
input.Instructions, input.Params, ml, appConfig, *cfg, func(audioChunk []byte) error { _, writeErr := c.Response().Write(audioChunk) if writeErr != nil { return writeErr @@ -75,7 +75,7 @@ func TTSEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig } // Non-streaming TTS (existing behavior) - filePath, _, err := backend.ModelTTS(c.Request().Context(), input.Input, cfg.Voice, cfg.Language, ml, appConfig, *cfg) + filePath, _, err := backend.ModelTTS(c.Request().Context(), input.Input, cfg.Voice, cfg.Language, input.Instructions, input.Params, ml, appConfig, *cfg) if err != nil { return err } diff --git a/core/http/endpoints/openai/realtime_model.go b/core/http/endpoints/openai/realtime_model.go index 9ec7e3d6b..afb93201b 100644 --- a/core/http/endpoints/openai/realtime_model.go +++ b/core/http/endpoints/openai/realtime_model.go @@ -313,7 +313,7 @@ func newRealtimeDecisionID() string { } func (m *wrappedModel) TTS(ctx context.Context, text, voice, language string) (string, *proto.Result, error) { - return backend.ModelTTS(ctx, text, voice, language, m.modelLoader, m.appConfig, *m.TTSConfig) + return backend.ModelTTS(ctx, text, voice, language, "", nil, m.modelLoader, m.appConfig, *m.TTSConfig) } func (m *wrappedModel) PredictConfig() *config.ModelConfig { diff --git a/core/schema/localai.go b/core/schema/localai.go index 8704f8ad8..8bb431e35 100644 --- a/core/schema/localai.go +++ b/core/schema/localai.go @@ -60,6 +60,14 @@ type TTSRequest struct { Format string `json:"response_format,omitempty" yaml:"response_format,omitempty"` // (optional) output format Stream bool `json:"stream,omitempty" yaml:"stream,omitempty"` // (optional) enable streaming TTS SampleRate int `json:"sample_rate,omitempty" yaml:"sample_rate,omitempty"` // (optional) desired output sample rate + // Instructions is a free-form, per-request style/voice description. It maps to + // the OpenAI `instructions` field and is forwarded to the backend so expressive + // TTS models (e.g. 
Qwen3-TTS CustomVoice/VoiceDesign) can vary tone or designed + // voice per request instead of only via the static YAML option. + Instructions string `json:"instructions,omitempty" yaml:"instructions,omitempty"` + // Params carries optional, backend-specific per-request generation parameters + // (LocalAI extension, e.g. Chatterbox exaggeration/cfg_weight/temperature). + Params map[string]string `json:"params,omitempty" yaml:"params,omitempty"` } // @Description VAD request body diff --git a/docs/content/features/text-to-audio.md b/docs/content/features/text-to-audio.md index 62f745137..7399e01d1 100644 --- a/docs/content/features/text-to-audio.md +++ b/docs/content/features/text-to-audio.md @@ -337,6 +337,37 @@ curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{ }' | aplay ``` +#### Per-request instructions + +Instead of (or in addition to) the static YAML `instruct` option, you can pass an +`instructions` string per request. It maps to the OpenAI +[`instructions`](https://platform.openai.com/docs/api-reference/audio/createSpeech) field +and takes precedence over the YAML option when set, falling back to it when empty. This lets +a single model config serve a different emotion (CustomVoice) or a different designed voice +(VoiceDesign) on every request - useful for roleplay/narration clients that need many voices: + +``` +curl http://localhost:8080/v1/audio/speech -H "Content-Type: application/json" -d '{ + "model": "qwen-tts-design", + "input": "Hello world, this is a test.", + "instructions": "A calm, low-pitched elderly storyteller with a warm tone." + }' | aplay +``` + +Backends that do not support style/voice instructions simply ignore the field. + +You can also pass backend-specific generation parameters per request via the LocalAI +`params` extension (a string-to-string map; values are coerced to the backend's expected +types). 
For example, with the Chatterbox backend: + +``` +curl http://localhost:8080/v1/audio/speech -H "Content-Type: application/json" -d '{ + "model": "chatterbox", + "input": "Hello world, this is a test.", + "params": { "exaggeration": "0.7", "cfg_weight": "0.3", "temperature": "0.8" } + }' | aplay +``` + #### Voice Clone Mode Voice Clone allows you to clone a voice from reference audio. Configure the model with an `AudioPath` and optional `ref_text`: diff --git a/swagger/docs.go b/swagger/docs.go index c5d4471a4..8da5bd402 100644 --- a/swagger/docs.go +++ b/swagger/docs.go @@ -5897,6 +5897,10 @@ const docTemplate = `{ "description": "text input", "type": "string" }, + "instructions": { + "description": "Instructions is a free-form, per-request style/voice description. It maps to\nthe OpenAI ` + "`" + `instructions` + "`" + ` field and is forwarded to the backend so expressive\nTTS models (e.g. Qwen3-TTS CustomVoice/VoiceDesign) can vary tone or designed\nvoice per request instead of only via the static YAML option.", + "type": "string" + }, "language": { "description": "(optional) language to use with TTS model", "type": "string" @@ -5904,6 +5908,13 @@ const docTemplate = `{ "model": { "type": "string" }, + "params": { + "description": "Params carries optional, backend-specific per-request generation parameters\n(LocalAI extension, e.g. Chatterbox exaggeration/cfg_weight/temperature).", + "type": "object", + "additionalProperties": { + "type": "string" + } + }, "response_format": { "description": "(optional) output format", "type": "string" diff --git a/swagger/swagger.json b/swagger/swagger.json index 1729898e9..7863cf235 100644 --- a/swagger/swagger.json +++ b/swagger/swagger.json @@ -5894,6 +5894,10 @@ "description": "text input", "type": "string" }, + "instructions": { + "description": "Instructions is a free-form, per-request style/voice description. It maps to\nthe OpenAI `instructions` field and is forwarded to the backend so expressive\nTTS models (e.g. 
Qwen3-TTS CustomVoice/VoiceDesign) can vary tone or designed\nvoice per request instead of only via the static YAML option.", + "type": "string" + }, "language": { "description": "(optional) language to use with TTS model", "type": "string" @@ -5901,6 +5905,13 @@ "model": { "type": "string" }, + "params": { + "description": "Params carries optional, backend-specific per-request generation parameters\n(LocalAI extension, e.g. Chatterbox exaggeration/cfg_weight/temperature).", + "type": "object", + "additionalProperties": { + "type": "string" + } + }, "response_format": { "description": "(optional) output format", "type": "string" diff --git a/swagger/swagger.yaml b/swagger/swagger.yaml index 565094380..320d45436 100644 --- a/swagger/swagger.yaml +++ b/swagger/swagger.yaml @@ -1996,11 +1996,25 @@ definitions: input: description: text input type: string + instructions: + description: |- + Instructions is a free-form, per-request style/voice description. It maps to + the OpenAI `instructions` field and is forwarded to the backend so expressive + TTS models (e.g. Qwen3-TTS CustomVoice/VoiceDesign) can vary tone or designed + voice per request instead of only via the static YAML option. + type: string language: description: (optional) language to use with TTS model type: string model: type: string + params: + additionalProperties: + type: string + description: |- + Params carries optional, backend-specific per-request generation parameters + (LocalAI extension, e.g. Chatterbox exaggeration/cfg_weight/temperature). + type: object response_format: description: (optional) output format type: string