Files
LocalAI/core/cli/transcript.go
Andres b6459ddd57 feat(api): Add transcribe response format request parameter & adjust STT backends (#8318)
* WIP response format implementation for audio transcriptions

(cherry picked from commit e271dd764bbc13846accf3beb8b6522153aa276f)
Signed-off-by: Andres Smith <andressmithdev@pm.me>

* Rework transcript response_format and add more formats

(cherry picked from commit 6a93a8f63e2ee5726bca2980b0c9cf4ef8b7aeb8)
Signed-off-by: Andres Smith <andressmithdev@pm.me>

* Add test and replace go-openai package with official openai go client

(cherry picked from commit f25d1a04e46526429c89db4c739e1e65942ca893)
Signed-off-by: Andres Smith <andressmithdev@pm.me>

* Fix faster-whisper backend and refactor transcription formatting to also work on CLI

Signed-off-by: Andres Smith <andressmithdev@pm.me>
(cherry picked from commit 69a93977d5e113eb7172bd85a0f918592d3d2168)
Signed-off-by: Andres Smith <andressmithdev@pm.me>

---------

Signed-off-by: Andres Smith <andressmithdev@pm.me>
Co-authored-by: nanoandrew4 <nanoandrew4@gmail.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2026-02-01 17:33:17 +01:00

106 lines
4.0 KiB
Go

package cli
import (
"context"
"encoding/json"
"errors"
"fmt"
"strings"
"github.com/mudler/LocalAI/core/backend"
cliContext "github.com/mudler/LocalAI/core/cli/context"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/gallery"
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/pkg/format"
"github.com/mudler/LocalAI/pkg/model"
"github.com/mudler/LocalAI/pkg/system"
"github.com/mudler/xlog"
)
type TranscriptCMD struct {
Filename string `arg:"" name:"file" help:"Audio file to transcribe" type:"path"`
Backend string `short:"b" default:"whisper" help:"Backend to run the transcription model"`
Model string `short:"m" required:"" help:"Model name to run the TTS"`
Language string `short:"l" help:"Language of the audio file"`
Translate bool `short:"c" help:"Translate the transcription to English"`
Diarize bool `short:"d" help:"Mark speaker turns"`
Threads int `short:"t" default:"1" help:"Number of threads used for parallel computation"`
BackendsPath string `env:"LOCALAI_BACKENDS_PATH,BACKENDS_PATH" type:"path" default:"${basepath}/backends" help:"Path containing backends used for inferencing" group:"storage"`
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
BackendGalleries string `env:"LOCALAI_BACKEND_GALLERIES,BACKEND_GALLERIES" help:"JSON list of backend galleries" group:"backends" default:"${backends}"`
Prompt string `short:"p" help:"Previous transcribed text or words that hint at what the model should expect"`
ResponseFormat schema.TranscriptionResponseFormatType `short:"f" default:"" help:"Response format for Whisper models, can be one of (txt, lrc, srt, vtt, json, json_verbose)"`
PrettyPrint bool `help:"Used with response_format json or json_verbose for pretty printing"`
}
func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
systemState, err := system.GetSystemState(
system.WithBackendPath(t.BackendsPath),
system.WithModelPath(t.ModelsPath),
)
if err != nil {
return err
}
opts := &config.ApplicationConfig{
SystemState: systemState,
Context: context.Background(),
}
cl := config.NewModelConfigLoader(t.ModelsPath)
ml := model.NewModelLoader(systemState)
if err := gallery.RegisterBackends(systemState, ml); err != nil {
xlog.Error("error registering external backends", "error", err)
}
if err := cl.LoadModelConfigsFromPath(t.ModelsPath); err != nil {
return err
}
c, exists := cl.GetModelConfig(t.Model)
if !exists {
return errors.New("model not found")
}
c.Threads = &t.Threads
defer func() {
err := ml.StopAllGRPC()
if err != nil {
xlog.Error("unable to stop all grpc processes", "error", err)
}
}()
tr, err := backend.ModelTranscription(t.Filename, t.Language, t.Translate, t.Diarize, t.Prompt, ml, c, opts)
if err != nil {
return err
}
switch t.ResponseFormat {
case schema.TranscriptionResponseFormatLrc, schema.TranscriptionResponseFormatSrt, schema.TranscriptionResponseFormatVtt, schema.TranscriptionResponseFormatText:
fmt.Println(format.TranscriptionResponse(tr, t.ResponseFormat))
case schema.TranscriptionResponseFormatJson:
tr.Segments = nil
fallthrough
case schema.TranscriptionResponseFormatJsonVerbose:
var mtr []byte
var err error
if t.PrettyPrint {
mtr, err = json.MarshalIndent(tr, "", " ")
} else {
mtr, err = json.Marshal(tr)
}
if err != nil {
return err
}
fmt.Println(string(mtr))
default:
for _, segment := range tr.Segments {
fmt.Println(segment.Start.String(), "-", strings.TrimSpace(segment.Text))
}
}
return nil
}