mirror of
https://github.com/mudler/LocalAI.git
synced 2026-02-26 19:58:58 -05:00
Add sample_rate support to TTS API via post-processing resampling (#8650)
* Initial plan
* Add TTS sample_rate support via AudioResample post-processing

Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
This commit is contained in:
@@ -79,6 +79,14 @@ func TTSEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig
|
||||
return err
|
||||
}
|
||||
|
||||
// Resample to requested sample rate if specified
|
||||
if input.SampleRate > 0 {
|
||||
filePath, err = utils.AudioResample(filePath, input.SampleRate)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
// Convert generated file to target format
|
||||
filePath, err = utils.AudioConvert(filePath, input.Format)
|
||||
if err != nil {
|
||||
|
||||
@@ -53,7 +53,8 @@ type TTSRequest struct {
|
||||
Backend string `json:"backend" yaml:"backend"`
|
||||
Language string `json:"language,omitempty" yaml:"language,omitempty"` // (optional) language to use with TTS model
|
||||
Format string `json:"response_format,omitempty" yaml:"response_format,omitempty"` // (optional) output format
|
||||
Stream bool `json:"stream,omitempty" yaml:"stream,omitempty"` // (optional) enable streaming TTS
|
||||
Stream bool `json:"stream,omitempty" yaml:"stream,omitempty"` // (optional) enable streaming TTS
|
||||
SampleRate int `json:"sample_rate,omitempty" yaml:"sample_rate,omitempty"` // (optional) desired output sample rate
|
||||
}
|
||||
|
||||
// @Description VAD request body
|
||||
|
||||
@@ -42,6 +42,21 @@ func AudioToWav(src, dst string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// AudioResample resamples an audio file to the given sample rate using ffmpeg.
|
||||
// If sampleRate <= 0, it is a no-op and returns src unchanged.
|
||||
func AudioResample(src string, sampleRate int) (string, error) {
|
||||
if sampleRate <= 0 {
|
||||
return src, nil
|
||||
}
|
||||
dst := strings.Replace(src, ".wav", fmt.Sprintf("_%dhz.wav", sampleRate), 1)
|
||||
commandArgs := []string{"-y", "-i", src, "-ar", fmt.Sprintf("%d", sampleRate), dst}
|
||||
out, err := ffmpegCommand(commandArgs)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("error resampling audio: %w out: %s", err, out)
|
||||
}
|
||||
return dst, nil
|
||||
}
|
||||
|
||||
// AudioConvert converts generated wav file from tts to other output formats.
|
||||
// TODO: handle pcm to reach 100% parity with the output formats supported by OpenAI
|
||||
func AudioConvert(src string, format string) (string, error) {
|
||||
|
||||
Reference in New Issue
Block a user