feat(api): Add transcribe response format request parameter & adjust STT backends (#8318)

* WIP response format implementation for audio transcriptions

  (cherry picked from commit e271dd764bbc13846accf3beb8b6522153aa276f)
  Signed-off-by: Andres Smith <andressmithdev@pm.me>

* Rework transcript response_format and add more formats

  (cherry picked from commit 6a93a8f63e2ee5726bca2980b0c9cf4ef8b7aeb8)
  Signed-off-by: Andres Smith <andressmithdev@pm.me>

* Add test and replace go-openai package with official openai go client

  (cherry picked from commit f25d1a04e46526429c89db4c739e1e65942ca893)
  Signed-off-by: Andres Smith <andressmithdev@pm.me>

* Fix faster-whisper backend and refactor transcription formatting to also work on CLI

  Signed-off-by: Andres Smith <andressmithdev@pm.me>
  (cherry picked from commit 69a93977d5e113eb7172bd85a0f918592d3d2168)
  Signed-off-by: Andres Smith <andressmithdev@pm.me>

---------

Signed-off-by: Andres Smith <andressmithdev@pm.me>
Co-authored-by: nanoandrew4 <nanoandrew4@gmail.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2026-02-05 04:02:45 -05:00 · 2026-02-01 17:33:17 +01:00
parent 397f7f0862
commit b6459ddd57
18 changed files with 353 additions and 184 deletions
--- a/core/cli/transcript.go
+++ b/core/cli/transcript.go
@@ -2,32 +2,42 @@ package cli

 import (
 	"context"
+	"encoding/json"
 	"errors"
 	"fmt"
+	"strings"

 	"github.com/mudler/LocalAI/core/backend"
 	cliContext "github.com/mudler/LocalAI/core/cli/context"
 	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/gallery"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/pkg/format"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/system"
 	"github.com/mudler/xlog"
 )

 type TranscriptCMD struct {
-	Filename string `arg:""`
+	Filename string `arg:"" name:"file" help:"Audio file to transcribe" type:"path"`

-	Backend    string `short:"b" default:"whisper" help:"Backend to run the transcription model"`
-	Model      string `short:"m" required:"" help:"Model name to run the TTS"`
-	Language   string `short:"l" help:"Language of the audio file"`
-	Translate  bool   `short:"c" help:"Translate the transcription to english"`
-	Diarize    bool   `short:"d" help:"Mark speaker turns"`
-	Threads    int    `short:"t" default:"1" help:"Number of threads used for parallel computation"`
-	ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
-	Prompt     string `short:"p" help:"Previous transcribed text or words that hint at what the model should expect"`
+	Backend          string                                 `short:"b" default:"whisper" help:"Backend to run the transcription model"`
+	Model            string                                 `short:"m" required:"" help:"Model name to run the TTS"`
+	Language         string                                 `short:"l" help:"Language of the audio file"`
+	Translate        bool                                   `short:"c" help:"Translate the transcription to English"`
+	Diarize          bool                                   `short:"d" help:"Mark speaker turns"`
+	Threads          int                                    `short:"t" default:"1" help:"Number of threads used for parallel computation"`
+	BackendsPath     string                                 `env:"LOCALAI_BACKENDS_PATH,BACKENDS_PATH" type:"path" default:"${basepath}/backends" help:"Path containing backends used for inferencing" group:"storage"`
+	ModelsPath       string                                 `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
+	BackendGalleries string                                 `env:"LOCALAI_BACKEND_GALLERIES,BACKEND_GALLERIES" help:"JSON list of backend galleries" group:"backends" default:"${backends}"`
+	Prompt           string                                 `short:"p" help:"Previous transcribed text or words that hint at what the model should expect"`
+	ResponseFormat   schema.TranscriptionResponseFormatType `short:"f" default:"" help:"Response format for Whisper models, can be one of (txt, lrc, srt, vtt, json, json_verbose)"`
+	PrettyPrint      bool                                   `help:"Used with response_format json or json_verbose for pretty printing"`
 }

 func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
 	systemState, err := system.GetSystemState(
+		system.WithBackendPath(t.BackendsPath),
 		system.WithModelPath(t.ModelsPath),
 	)
 	if err != nil {
@@ -40,6 +50,11 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {

 	cl := config.NewModelConfigLoader(t.ModelsPath)
 	ml := model.NewModelLoader(systemState)
+
+	if err := gallery.RegisterBackends(systemState, ml); err != nil {
+		xlog.Error("error registering external backends", "error", err)
+	}
+
 	if err := cl.LoadModelConfigsFromPath(t.ModelsPath); err != nil {
 		return err
 	}
@@ -62,8 +77,29 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
 	if err != nil {
 		return err
 	}
-	for _, segment := range tr.Segments {
-		fmt.Println(segment.Start.String(), "-", segment.Text)
+
+	switch t.ResponseFormat {
+	case schema.TranscriptionResponseFormatLrc, schema.TranscriptionResponseFormatSrt, schema.TranscriptionResponseFormatVtt, schema.TranscriptionResponseFormatText:
+		fmt.Println(format.TranscriptionResponse(tr, t.ResponseFormat))
+	case schema.TranscriptionResponseFormatJson:
+		tr.Segments = nil
+		fallthrough
+	case schema.TranscriptionResponseFormatJsonVerbose:
+		var mtr []byte
+		var err error
+		if t.PrettyPrint {
+			mtr, err = json.MarshalIndent(tr, "", "    ")
+		} else {
+			mtr, err = json.Marshal(tr)
+		}
+		if err != nil {
+			return err
+		}
+		fmt.Println(string(mtr))
+	default:
+		for _, segment := range tr.Segments {
+			fmt.Println(segment.Start.String(), "-", strings.TrimSpace(segment.Text))
+		}
 	}
 	return nil
 }