Add sample_rate support to TTS API via post-processing resampling (#8650)

* Initial plan

* Add TTS sample_rate support via AudioResample post-processing

Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-06-03 05:51:53 -04:00 · 2026-02-25 16:36:27 +01:00
parent c4783a0a05
commit 3ac7301f31
3 changed files with 25 additions and 1 deletions
--- a/core/http/endpoints/localai/tts.go
+++ b/core/http/endpoints/localai/tts.go
@@ -79,6 +79,14 @@ func TTSEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig
 			return err
 		}

+		// Resample to requested sample rate if specified
+		if input.SampleRate > 0 {
+			filePath, err = utils.AudioResample(filePath, input.SampleRate)
+			if err != nil {
+				return err
+			}
+		}
+
 		// Convert generated file to target format
 		filePath, err = utils.AudioConvert(filePath, input.Format)
 		if err != nil {
--- a/core/schema/localai.go
+++ b/core/schema/localai.go
@@ -53,7 +53,8 @@ type TTSRequest struct {
 	Backend  string `json:"backend" yaml:"backend"`
 	Language string `json:"language,omitempty" yaml:"language,omitempty"`               // (optional) language to use with TTS model
 	Format   string `json:"response_format,omitempty" yaml:"response_format,omitempty"` // (optional) output format
-	Stream   bool   `json:"stream,omitempty" yaml:"stream,omitempty"`                   // (optional) enable streaming TTS
+	Stream     bool   `json:"stream,omitempty" yaml:"stream,omitempty"`                         // (optional) enable streaming TTS
+	SampleRate int    `json:"sample_rate,omitempty" yaml:"sample_rate,omitempty"`               // (optional) desired output sample rate
 }

 // @Description VAD request body
--- a/pkg/utils/ffmpeg.go
+++ b/pkg/utils/ffmpeg.go
@@ -42,6 +42,21 @@ func AudioToWav(src, dst string) error {
 	return nil
 }

+// AudioResample resamples src to sampleRate with ffmpeg and returns the new file's path; if sampleRate <= 0 it returns src unchanged.
+// The output path is src with any trailing ".wav" removed plus "_<sampleRate>hz.wav", so it always differs from src.
+func AudioResample(src string, sampleRate int) (string, error) {
+	if sampleRate <= 0 {
+		return src, nil
+	}
+	// Swap only a trailing ".wav": a first-match replace could rewrite a directory name or leave dst equal to src.
+	dst := fmt.Sprintf("%s_%dhz.wav", strings.TrimSuffix(src, ".wav"), sampleRate)
+	out, err := ffmpegCommand([]string{"-y", "-i", src, "-ar", fmt.Sprintf("%d", sampleRate), dst})
+	if err != nil {
+		return "", fmt.Errorf("error resampling audio: %w out: %s", err, out)
+	}
+	return dst, nil
+}
+
 // AudioConvert converts generated wav file from tts to other output formats.
 // TODO: handle pcm to have 100% parity of supported format from OpenAI
 func AudioConvert(src string, format string) (string, error) {