mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-05 07:16:10 -04:00
The OpenAI-compatible TTS endpoint accepts an `instructions` field, but it was silently dropped at the HTTP->gRPC boundary: neither schema.TTSRequest nor the gRPC TTSRequest proto carried it, so backends could only read such a value from static YAML options (identical for every request). This blocked per-line emotion/style and, for Qwen3-TTS VoiceDesign, limited a model config to a single designed voice.

Plumb a generic per-request instruction string end to end, plus an optional backend-specific params map:

- proto: add `optional string instructions` and `map<string,string> params` to TTSRequest.
- schema: add Instructions (maps OpenAI `instructions`) and Params (LocalAI extension) to schema.TTSRequest.
- core: thread both through ModelTTS/ModelTTSStream via a newTTSRequest helper that attaches instructions only when non-empty (so backends can fall back to YAML when unset); forward them from the /v1/audio/speech handler.
- qwen-tts: prefer the per-request instruction over the YAML `instruct` option (used by both mode detection and generation) and merge per-request params.
- chatterbox: merge per-request params (coerced to float/int/bool) over YAML options into generate() kwargs.

Fully backward compatible: empty instructions fall back to the YAML option and backends that don't support style/voice instructions ignore the field.

Closes #10164

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
79 lines
2.0 KiB
Go
79 lines
2.0 KiB
Go
package cli
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"github.com/mudler/LocalAI/core/backend"
|
|
cliContext "github.com/mudler/LocalAI/core/cli/context"
|
|
"github.com/mudler/LocalAI/core/config"
|
|
"github.com/mudler/LocalAI/pkg/model"
|
|
"github.com/mudler/LocalAI/pkg/system"
|
|
"github.com/mudler/xlog"
|
|
)
|
|
|
|
// TTSCMD holds the arguments and flags of the text-to-speech CLI command.
// The struct tags configure the command-line parser (flag short names,
// defaults, help text); the field comments below restate what those tags say.
type TTSCMD struct {
	// Text is the positional input; Run joins the elements with single spaces
	// to form the text that is synthesized.
	Text []string `arg:""`

	// Backend names the backend used to run the TTS model (default "piper").
	Backend string `short:"b" default:"piper" help:"Backend to run the TTS model"`

	// Model is the required name of the model to run.
	Model string `short:"m" required:"" help:"Model name to run the TTS"`

	// Voice is the optional voice name forwarded to the backend.
	Voice string `short:"v" help:"Voice name to run the TTS"`

	// Language is the optional language forwarded to the backend.
	Language string `short:"l" help:"Language to use with the TTS"`

	// OutputFile is the destination of the generated wav file. When empty, Run
	// leaves the file where the backend produced it and prints that path.
	OutputFile string `short:"o" type:"path" help:"The path to write the output wav file"`

	// ModelsPath is the directory containing the models used for inferencing.
	ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
}
|
|
|
|
func (t *TTSCMD) Run(ctx *cliContext.Context) error {
|
|
outputFile := t.OutputFile
|
|
outputDir := os.TempDir()
|
|
if outputFile != "" {
|
|
outputDir = filepath.Dir(outputFile)
|
|
}
|
|
|
|
text := strings.Join(t.Text, " ")
|
|
|
|
systemState, err := system.GetSystemState(
|
|
system.WithModelPath(t.ModelsPath),
|
|
)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
opts := &config.ApplicationConfig{
|
|
SystemState: systemState,
|
|
Context: context.Background(),
|
|
GeneratedContentDir: outputDir,
|
|
}
|
|
|
|
ml := model.NewModelLoader(systemState)
|
|
|
|
defer func() {
|
|
err := ml.StopAllGRPC()
|
|
if err != nil {
|
|
xlog.Error("unable to stop all grpc processes", "error", err)
|
|
}
|
|
}()
|
|
|
|
options := config.ModelConfig{}
|
|
options.SetDefaults()
|
|
options.Backend = t.Backend
|
|
options.Model = t.Model
|
|
|
|
filePath, _, err := backend.ModelTTS(context.Background(), text, t.Voice, t.Language, "", nil, ml, opts, options)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if outputFile != "" {
|
|
if err := os.Rename(filePath, outputFile); err != nil {
|
|
return err
|
|
}
|
|
fmt.Printf("Generate file %s\n", outputFile)
|
|
} else {
|
|
fmt.Printf("Generate file %s\n", filePath)
|
|
}
|
|
return nil
|
|
}
|