From 3ac7301f312e87e4990bf9309c115242617d1c19 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Wed, 25 Feb 2026 16:36:27 +0100 Subject: [PATCH] Add `sample_rate` support to TTS API via post-processing resampling (#8650) * Initial plan * Add TTS sample_rate support via AudioResample post-processing Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- core/http/endpoints/localai/tts.go | 8 ++++++++ core/schema/localai.go | 3 ++- pkg/utils/ffmpeg.go | 15 +++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/core/http/endpoints/localai/tts.go b/core/http/endpoints/localai/tts.go index 4e25cb138..49bba5528 100644 --- a/core/http/endpoints/localai/tts.go +++ b/core/http/endpoints/localai/tts.go @@ -79,6 +79,14 @@ func TTSEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig return err } + // Resample to requested sample rate if specified + if input.SampleRate > 0 { + filePath, err = utils.AudioResample(filePath, input.SampleRate) + if err != nil { + return err + } + } + // Convert generated file to target format filePath, err = utils.AudioConvert(filePath, input.Format) if err != nil { diff --git a/core/schema/localai.go b/core/schema/localai.go index 62373a5cc..7ccf2bb32 100644 --- a/core/schema/localai.go +++ b/core/schema/localai.go @@ -53,7 +53,8 @@ type TTSRequest struct { Backend string `json:"backend" yaml:"backend"` Language string `json:"language,omitempty" yaml:"language,omitempty"` // (optional) language to use with TTS model Format string `json:"response_format,omitempty" yaml:"response_format,omitempty"` // (optional) output format - Stream bool `json:"stream,omitempty" yaml:"stream,omitempty"` // (optional) enable streaming TTS + Stream bool `json:"stream,omitempty" yaml:"stream,omitempty"` // (optional) 
enable streaming TTS
+	SampleRate int `json:"sample_rate,omitempty" yaml:"sample_rate,omitempty"` // (optional) desired output sample rate
 }
 
 // @Description VAD request body
diff --git a/pkg/utils/ffmpeg.go b/pkg/utils/ffmpeg.go
index 061017bb5..c2783dbce 100644
--- a/pkg/utils/ffmpeg.go
+++ b/pkg/utils/ffmpeg.go
@@ -42,6 +42,33 @@ func AudioToWav(src, dst string) error {
 	return nil
 }
 
+// AudioResample resamples an audio file to the given sample rate using ffmpeg.
+//
+// The output path is derived from src by replacing a trailing ".wav" with
+// "_<sampleRate>hz.wav" (for example "out.wav" becomes "out_16000hz.wav").
+// Only the suffix is rewritten, so a ".wav" occurring earlier in the path is
+// left untouched, and a src without that extension still gets a distinct
+// output path rather than asking ffmpeg to read and write the same file.
+//
+// If sampleRate <= 0, it is a no-op and returns src unchanged. On failure it
+// returns an empty path and an error wrapping the ffmpegCommand error along
+// with the output that call returned.
+func AudioResample(src string, sampleRate int) (string, error) {
+	if sampleRate <= 0 {
+		return src, nil
+	}
+	// Anchor the rename to the end of the path: strings.Replace would rewrite
+	// the first ".wav" it finds, which may sit inside a directory name.
+	dst := strings.TrimSuffix(src, ".wav") + fmt.Sprintf("_%dhz.wav", sampleRate)
+	// -y overwrites an existing dst; -ar sets the output sample rate.
+	commandArgs := []string{"-y", "-i", src, "-ar", fmt.Sprintf("%d", sampleRate), dst}
+	out, err := ffmpegCommand(commandArgs)
+	if err != nil {
+		return "", fmt.Errorf("error resampling audio: %w out: %s", err, out)
+	}
+	return dst, nil
+}
+
 // AudioConvert converts generated wav file from tts to other output formats.
 // TODO: handle pcm to have 100% parity of supported format from OpenAI
 func AudioConvert(src string, format string) (string, error) {