Add sample_rate support to TTS API via post-processing resampling (#8650)

* Initial plan

* Add TTS sample_rate support via AudioResample post-processing

Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
This commit is contained in:
Copilot
2026-02-25 16:36:27 +01:00
committed by GitHub
parent c4783a0a05
commit 3ac7301f31
3 changed files with 25 additions and 1 deletions

View File

@@ -79,6 +79,14 @@ func TTSEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig
return err
}
// Resample to requested sample rate if specified
if input.SampleRate > 0 {
filePath, err = utils.AudioResample(filePath, input.SampleRate)
if err != nil {
return err
}
}
// Convert generated file to target format
filePath, err = utils.AudioConvert(filePath, input.Format)
if err != nil {

View File

@@ -53,7 +53,8 @@ type TTSRequest struct {
Backend string `json:"backend" yaml:"backend"`
Language string `json:"language,omitempty" yaml:"language,omitempty"` // (optional) language to use with TTS model
Format string `json:"response_format,omitempty" yaml:"response_format,omitempty"` // (optional) output format
Stream bool `json:"stream,omitempty" yaml:"stream,omitempty"` // (optional) enable streaming TTS
Stream bool `json:"stream,omitempty" yaml:"stream,omitempty"` // (optional) enable streaming TTS
SampleRate int `json:"sample_rate,omitempty" yaml:"sample_rate,omitempty"` // (optional) desired output sample rate
}
// @Description VAD request body

View File

@@ -42,6 +42,21 @@ func AudioToWav(src, dst string) error {
return nil
}
// AudioResample resamples an audio file to the given sample rate using ffmpeg.
//
// The resampled audio is written to a new file derived from src: a trailing
// ".wav" extension (if present) is removed and "_<sampleRate>hz.wav" is
// appended, e.g. "out.wav" at 16000 becomes "out_16000hz.wav". The derived
// path is therefore always different from src.
//
// If sampleRate <= 0, it is a no-op and returns src unchanged.
// On failure it returns an empty path and an error that wraps the ffmpeg
// error and includes the command output.
func AudioResample(src string, sampleRate int) (string, error) {
	if sampleRate <= 0 {
		return src, nil
	}

	// Only touch the trailing extension. Replacing the first ".wav" found
	// anywhere in the path would corrupt paths whose directories contain
	// ".wav", and would leave dst equal to src when src has no ".wav" at all.
	dst := strings.TrimSuffix(src, ".wav") + fmt.Sprintf("_%dhz.wav", sampleRate)

	// Arguments: -y, input file, -ar <rate> (requested sample rate), output file.
	commandArgs := []string{"-y", "-i", src, "-ar", fmt.Sprintf("%d", sampleRate), dst}
	out, err := ffmpegCommand(commandArgs)
	if err != nil {
		return "", fmt.Errorf("error resampling audio: %w out: %s", err, out)
	}
	return dst, nil
}
// AudioConvert converts generated wav file from tts to other output formats.
// TODO: handle pcm to reach 100% parity with the output formats supported by OpenAI
func AudioConvert(src string, format string) (string, error) {