mirror of
https://github.com/mudler/LocalAI.git
synced 2026-02-05 04:02:45 -05:00
feat(api): Add transcribe response format request parameter & adjust STT backends (#8318)
* WIP response format implementation for audio transcriptions
  (cherry picked from commit e271dd764bbc13846accf3beb8b6522153aa276f)
  Signed-off-by: Andres Smith <andressmithdev@pm.me>

* Rework transcript response_format and add more formats
  (cherry picked from commit 6a93a8f63e2ee5726bca2980b0c9cf4ef8b7aeb8)
  Signed-off-by: Andres Smith <andressmithdev@pm.me>

* Add test and replace go-openai package with official openai go client
  (cherry picked from commit f25d1a04e46526429c89db4c739e1e65942ca893)
  Signed-off-by: Andres Smith <andressmithdev@pm.me>

* Fix faster-whisper backend and refactor transcription formatting to also work on CLI
  Signed-off-by: Andres Smith <andressmithdev@pm.me>
  (cherry picked from commit 69a93977d5e113eb7172bd85a0f918592d3d2168)
  Signed-off-by: Andres Smith <andressmithdev@pm.me>

---------

Signed-off-by: Andres Smith <andressmithdev@pm.me>
Co-authored-by: nanoandrew4 <nanoandrew4@gmail.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
This commit is contained in:
@@ -2,32 +2,42 @@ package cli
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/mudler/LocalAI/core/backend"
|
||||
cliContext "github.com/mudler/LocalAI/core/cli/context"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/gallery"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
"github.com/mudler/LocalAI/pkg/format"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
"github.com/mudler/LocalAI/pkg/system"
|
||||
"github.com/mudler/xlog"
|
||||
)
|
||||
|
||||
type TranscriptCMD struct {
|
||||
Filename string `arg:""`
|
||||
Filename string `arg:"" name:"file" help:"Audio file to transcribe" type:"path"`
|
||||
|
||||
Backend string `short:"b" default:"whisper" help:"Backend to run the transcription model"`
|
||||
Model string `short:"m" required:"" help:"Model name to run the TTS"`
|
||||
Language string `short:"l" help:"Language of the audio file"`
|
||||
Translate bool `short:"c" help:"Translate the transcription to english"`
|
||||
Diarize bool `short:"d" help:"Mark speaker turns"`
|
||||
Threads int `short:"t" default:"1" help:"Number of threads used for parallel computation"`
|
||||
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
|
||||
Prompt string `short:"p" help:"Previous transcribed text or words that hint at what the model should expect"`
|
||||
Backend string `short:"b" default:"whisper" help:"Backend to run the transcription model"`
|
||||
Model string `short:"m" required:"" help:"Model name to run the TTS"`
|
||||
Language string `short:"l" help:"Language of the audio file"`
|
||||
Translate bool `short:"c" help:"Translate the transcription to English"`
|
||||
Diarize bool `short:"d" help:"Mark speaker turns"`
|
||||
Threads int `short:"t" default:"1" help:"Number of threads used for parallel computation"`
|
||||
BackendsPath string `env:"LOCALAI_BACKENDS_PATH,BACKENDS_PATH" type:"path" default:"${basepath}/backends" help:"Path containing backends used for inferencing" group:"storage"`
|
||||
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
|
||||
BackendGalleries string `env:"LOCALAI_BACKEND_GALLERIES,BACKEND_GALLERIES" help:"JSON list of backend galleries" group:"backends" default:"${backends}"`
|
||||
Prompt string `short:"p" help:"Previous transcribed text or words that hint at what the model should expect"`
|
||||
ResponseFormat schema.TranscriptionResponseFormatType `short:"f" default:"" help:"Response format for Whisper models, can be one of (txt, lrc, srt, vtt, json, json_verbose)"`
|
||||
PrettyPrint bool `help:"Used with response_format json or json_verbose for pretty printing"`
|
||||
}
|
||||
|
||||
func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
|
||||
systemState, err := system.GetSystemState(
|
||||
system.WithBackendPath(t.BackendsPath),
|
||||
system.WithModelPath(t.ModelsPath),
|
||||
)
|
||||
if err != nil {
|
||||
@@ -40,6 +50,11 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
|
||||
|
||||
cl := config.NewModelConfigLoader(t.ModelsPath)
|
||||
ml := model.NewModelLoader(systemState)
|
||||
|
||||
if err := gallery.RegisterBackends(systemState, ml); err != nil {
|
||||
xlog.Error("error registering external backends", "error", err)
|
||||
}
|
||||
|
||||
if err := cl.LoadModelConfigsFromPath(t.ModelsPath); err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -62,8 +77,29 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, segment := range tr.Segments {
|
||||
fmt.Println(segment.Start.String(), "-", segment.Text)
|
||||
|
||||
switch t.ResponseFormat {
|
||||
case schema.TranscriptionResponseFormatLrc, schema.TranscriptionResponseFormatSrt, schema.TranscriptionResponseFormatVtt, schema.TranscriptionResponseFormatText:
|
||||
fmt.Println(format.TranscriptionResponse(tr, t.ResponseFormat))
|
||||
case schema.TranscriptionResponseFormatJson:
|
||||
tr.Segments = nil
|
||||
fallthrough
|
||||
case schema.TranscriptionResponseFormatJsonVerbose:
|
||||
var mtr []byte
|
||||
var err error
|
||||
if t.PrettyPrint {
|
||||
mtr, err = json.MarshalIndent(tr, "", " ")
|
||||
} else {
|
||||
mtr, err = json.Marshal(tr)
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Println(string(mtr))
|
||||
default:
|
||||
for _, segment := range tr.Segments {
|
||||
fmt.Println(segment.Start.String(), "-", strings.TrimSpace(segment.Text))
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user