mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-16 12:49:08 -04:00
The OpenAI-compatible TTS endpoint accepts an `instructions` field, but it was silently dropped at the HTTP->gRPC boundary: neither schema.TTSRequest nor the gRPC TTSRequest proto carried it, so backends could only read such a value from static YAML options (identical for every request). This blocked per-line emotion/style and, for Qwen3-TTS VoiceDesign, limited a model config to a single designed voice. Plumb a generic per-request instruction string end to end, plus an optional backend-specific params map: - proto: add `optional string instructions` and `map<string,string> params` to TTSRequest. - schema: add Instructions (maps OpenAI `instructions`) and Params (LocalAI extension) to schema.TTSRequest. - core: thread both through ModelTTS/ModelTTSStream via a newTTSRequest helper that attaches instructions only when non-empty (so backends can fall back to YAML when unset); forward them from the /v1/audio/speech handler. - qwen-tts: prefer the per-request instruction over the YAML `instruct` option (used by both mode detection and generation) and merge per-request params. - chatterbox: merge per-request params (coerced to float/int/bool) over YAML options into generate() kwargs. Fully backward compatible: empty instructions fall back to the YAML option and backends that don't support style/voice instructions ignore the field. Closes #10164 Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
51 lines
1.7 KiB
Go
51 lines
1.7 KiB
Go
package elevenlabs
|
|
|
|
import (
|
|
"path/filepath"
|
|
|
|
"github.com/labstack/echo/v4"
|
|
"github.com/mudler/LocalAI/core/backend"
|
|
"github.com/mudler/LocalAI/core/config"
|
|
"github.com/mudler/LocalAI/core/http/middleware"
|
|
"github.com/mudler/LocalAI/core/schema"
|
|
"github.com/mudler/LocalAI/pkg/audio"
|
|
"github.com/mudler/LocalAI/pkg/model"
|
|
"github.com/mudler/xlog"
|
|
)
|
|
|
|
// TTSEndpoint is the OpenAI Speech API endpoint https://platform.openai.com/docs/api-reference/audio/createSpeech
|
|
// @Summary Generates audio from the input text.
|
|
// @Tags audio
|
|
// @Param voice-id path string true "Account ID"
|
|
// @Param request body schema.TTSRequest true "query params"
|
|
// @Success 200 {string} binary "Response"
|
|
// @Router /v1/text-to-speech/{voice-id} [post]
|
|
func TTSEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
|
|
return func(c echo.Context) error {
|
|
|
|
voiceID := c.Param("voice-id")
|
|
|
|
input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.ElevenLabsTTSRequest)
|
|
if !ok || input.ModelID == "" {
|
|
return echo.ErrBadRequest
|
|
}
|
|
|
|
cfg, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
|
|
if !ok || cfg == nil {
|
|
return echo.ErrBadRequest
|
|
}
|
|
|
|
xlog.Debug("elevenlabs TTS request received", "modelName", input.ModelID)
|
|
|
|
filePath, _, err := backend.ModelTTS(c.Request().Context(), input.Text, voiceID, input.LanguageCode, "", nil, ml, appConfig, *cfg)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
filePath, contentType := audio.NormalizeAudioFile(filePath)
|
|
if contentType != "" {
|
|
c.Response().Header().Set("Content-Type", contentType)
|
|
}
|
|
return c.Attachment(filePath, filepath.Base(filePath))
|
|
}
|
|
}
|