Files
LocalAI/core/cli/tts.go
LocalAI [bot] 27e63b9a78 feat(tts): support per-request instructions and params (#10172)
The OpenAI-compatible TTS endpoint accepts an `instructions` field, but it
was silently dropped at the HTTP->gRPC boundary: neither schema.TTSRequest
nor the gRPC TTSRequest proto carried it, so backends could only read such a
value from static YAML options (identical for every request). This blocked
per-line emotion/style and, for Qwen3-TTS VoiceDesign, limited a model config
to a single designed voice.

Plumb a generic per-request instruction string end to end, plus an optional
backend-specific params map:

- proto: add `optional string instructions` and `map<string,string> params`
  to TTSRequest.
- schema: add Instructions (maps OpenAI `instructions`) and Params (LocalAI
  extension) to schema.TTSRequest.
- core: thread both through ModelTTS/ModelTTSStream via a newTTSRequest helper
  that attaches instructions only when non-empty (so backends can fall back to
  YAML when unset); forward them from the /v1/audio/speech handler.
- qwen-tts: prefer the per-request instruction over the YAML `instruct` option
  (used by both mode detection and generation) and merge per-request params.
- chatterbox: merge per-request params (coerced to float/int/bool) over YAML
  options into generate() kwargs.

Fully backward compatible: empty instructions fall back to the YAML option and
backends that don't support style/voice instructions ignore the field.

Closes #10164


Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-04 11:45:02 +02:00

79 lines
2.0 KiB
Go

package cli
import (
"context"
"fmt"
"os"
"path/filepath"
"strings"
"github.com/mudler/LocalAI/core/backend"
cliContext "github.com/mudler/LocalAI/core/cli/context"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/pkg/model"
"github.com/mudler/LocalAI/pkg/system"
"github.com/mudler/xlog"
)
type TTSCMD struct {
Text []string `arg:""`
Backend string `short:"b" default:"piper" help:"Backend to run the TTS model"`
Model string `short:"m" required:"" help:"Model name to run the TTS"`
Voice string `short:"v" help:"Voice name to run the TTS"`
Language string `short:"l" help:"Language to use with the TTS"`
OutputFile string `short:"o" type:"path" help:"The path to write the output wav file"`
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
}
func (t *TTSCMD) Run(ctx *cliContext.Context) error {
outputFile := t.OutputFile
outputDir := os.TempDir()
if outputFile != "" {
outputDir = filepath.Dir(outputFile)
}
text := strings.Join(t.Text, " ")
systemState, err := system.GetSystemState(
system.WithModelPath(t.ModelsPath),
)
if err != nil {
return err
}
opts := &config.ApplicationConfig{
SystemState: systemState,
Context: context.Background(),
GeneratedContentDir: outputDir,
}
ml := model.NewModelLoader(systemState)
defer func() {
err := ml.StopAllGRPC()
if err != nil {
xlog.Error("unable to stop all grpc processes", "error", err)
}
}()
options := config.ModelConfig{}
options.SetDefaults()
options.Backend = t.Backend
options.Model = t.Model
filePath, _, err := backend.ModelTTS(context.Background(), text, t.Voice, t.Language, "", nil, ml, opts, options)
if err != nil {
return err
}
if outputFile != "" {
if err := os.Rename(filePath, outputFile); err != nil {
return err
}
fmt.Printf("Generate file %s\n", outputFile)
} else {
fmt.Printf("Generate file %s\n", filePath)
}
return nil
}