feat(api): Add transcribe response format request parameter & adjust STT backends (#8318)

* WIP response format implementation for audio transcriptions

(cherry picked from commit e271dd764bbc13846accf3beb8b6522153aa276f)
Signed-off-by: Andres Smith <andressmithdev@pm.me>

* Rework transcript response_format and add more formats

(cherry picked from commit 6a93a8f63e2ee5726bca2980b0c9cf4ef8b7aeb8)
Signed-off-by: Andres Smith <andressmithdev@pm.me>

* Add test and replace go-openai package with official openai go client

(cherry picked from commit f25d1a04e46526429c89db4c739e1e65942ca893)
Signed-off-by: Andres Smith <andressmithdev@pm.me>

* Fix faster-whisper backend and refactor transcription formatting to also work on CLI

Signed-off-by: Andres Smith <andressmithdev@pm.me>
(cherry picked from commit 69a93977d5e113eb7172bd85a0f918592d3d2168)
Signed-off-by: Andres Smith <andressmithdev@pm.me>

---------

Signed-off-by: Andres Smith <andressmithdev@pm.me>
Co-authored-by: nanoandrew4 <nanoandrew4@gmail.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
This commit is contained in:
Andres
2026-02-01 17:33:17 +01:00
committed by GitHub
parent 397f7f0862
commit b6459ddd57
18 changed files with 353 additions and 184 deletions

View File

@@ -0,0 +1,41 @@
package format
import (
"fmt"
"strings"
"time"
"github.com/mudler/LocalAI/core/schema"
)
// TranscriptionResponse renders the segments of tr as one string in the
// requested response format.
//
// Layouts:
//   - LRC: a "[by:LocalAI]" / "[re:LocalAI]" header, then one
//     "[mm:ss.xx] text" line per segment keyed on the segment start time
//     (xx is hundredths of a second).
//   - SRT: cues numbered from 1 with "start --> end" timestamps that use ','
//     before the milliseconds.
//   - VTT: a "WEBVTT" header, then cues whose timestamps use '.' before the
//     milliseconds.
//   - Text, and any other value: the text of each segment, each preceded by
//     a newline.
//
// Segment text is whitespace-trimmed before it is written. A nil tr is treated
// as a transcription without segments, so only the format header (if any) is
// returned.
func TranscriptionResponse(tr *schema.TranscriptionResult, resFmt schema.TranscriptionResponseFormatType) string {
	// A Builder avoids re-copying the whole output on every appended segment.
	var b strings.Builder

	// Formats that require a file-level header get it before any segment.
	switch resFmt {
	case schema.TranscriptionResponseFormatLrc:
		b.WriteString("[by:LocalAI]\n[re:LocalAI]\n")
	case schema.TranscriptionResponseFormatVtt:
		b.WriteString("WEBVTT")
	}

	if tr == nil {
		return b.String()
	}

	// Writes to a strings.Builder never fail, so the Fprintf results below are
	// intentionally not checked.
	for i, s := range tr.Segments {
		text := strings.TrimSpace(s.Text)

		switch resFmt {
		case schema.TranscriptionResponseFormatLrc:
			m := s.Start.Milliseconds()
			// LRC line tags are [mm:ss.xx]: a dot, not a colon, separates the
			// seconds from the hundredths.
			fmt.Fprintf(&b, "\n[%02d:%02d.%02d] %s", m/60000, (m/1000)%60, (m%1000)/10, text)
		case schema.TranscriptionResponseFormatSrt:
			fmt.Fprintf(&b, "\n\n%d\n%s --> %s\n%s", i+1, durationStr(s.Start, ','), durationStr(s.End, ','), text)
		case schema.TranscriptionResponseFormatVtt:
			fmt.Fprintf(&b, "\n\n%s --> %s\n%s\n", durationStr(s.Start, '.'), durationStr(s.End, '.'), text)
		default:
			// Plain text (schema.TranscriptionResponseFormatText) and any
			// unrecognised format share this branch.
			b.WriteString("\n")
			b.WriteString(text)
		}
	}

	return b.String()
}
// durationStr formats d as a subtitle timestamp "HH:MM:SS<sep>mmm", where
// millisSeparator is the rune written between the seconds and the
// milliseconds (',' for SRT, '.' for WebVTT at the call sites in this file).
//
// All fields are derived from the whole-millisecond count of d. Minutes and
// seconds wrap at 60; hours are not wrapped, so durations of 100 hours or more
// simply widen the first field.
func durationStr(d time.Duration, millisSeparator rune) string {
	ms := d.Milliseconds()

	hours := ms / 3600000
	// Reduce modulo 60 so whole hours are not counted again in the minutes
	// field (75 minutes must print as 01:15, not 01:75).
	minutes := (ms / 60000) % 60
	seconds := (ms / 1000) % 60
	millis := ms % 1000

	return fmt.Sprintf("%02d:%02d:%02d%c%03d", hours, minutes, seconds, millisSeparator, millis)
}

View File

@@ -17,7 +17,7 @@ const (
LLamaCPP = "llama-cpp"
)
var Aliases map[string]string = map[string]string{
var Aliases = map[string]string{
"go-llama": LLamaCPP,
"llama": LLamaCPP,
"embedded-store": LocalStoreBackend,
@@ -29,7 +29,7 @@ var Aliases map[string]string = map[string]string{
"stablediffusion": StableDiffusionGGMLBackend,
}
var TypeAlias map[string]string = map[string]string{
var TypeAlias = map[string]string{
"sentencetransformers": "SentenceTransformer",
"huggingface-embeddings": "SentenceTransformer",
"mamba": "Mamba",
@@ -75,7 +75,7 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
// Check if the backend is provided as external
if uri, ok := ml.GetAllExternalBackends(o)[backend]; ok {
xlog.Debug("Loading external backend", "uri", uri)
// check if uri is a file or a address
// check if uri is a file or an address
if fi, err := os.Stat(uri); err == nil {
xlog.Debug("external backend is file", "file", fi)
serverAddress, err := getFreeAddress()