mirror of
https://github.com/mudler/LocalAI.git
synced 2026-02-03 03:02:38 -05:00
* WIP response format implementation for audio transcriptions (cherry picked from commit e271dd764bbc13846accf3beb8b6522153aa276f) Signed-off-by: Andres Smith <andressmithdev@pm.me> * Rework transcript response_format and add more formats (cherry picked from commit 6a93a8f63e2ee5726bca2980b0c9cf4ef8b7aeb8) Signed-off-by: Andres Smith <andressmithdev@pm.me> * Add test and replace go-openai package with official openai go client (cherry picked from commit f25d1a04e46526429c89db4c739e1e65942ca893) Signed-off-by: Andres Smith <andressmithdev@pm.me> * Fix faster-whisper backend and refactor transcription formatting to also work on CLI Signed-off-by: Andres Smith <andressmithdev@pm.me> (cherry picked from commit 69a93977d5e113eb7172bd85a0f918592d3d2168) Signed-off-by: Andres Smith <andressmithdev@pm.me> --------- Signed-off-by: Andres Smith <andressmithdev@pm.me> Co-authored-by: nanoandrew4 <nanoandrew4@gmail.com> Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
106 lines
4.0 KiB
Go
106 lines
4.0 KiB
Go
package cli
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"strings"
|
|
|
|
"github.com/mudler/LocalAI/core/backend"
|
|
cliContext "github.com/mudler/LocalAI/core/cli/context"
|
|
"github.com/mudler/LocalAI/core/config"
|
|
"github.com/mudler/LocalAI/core/gallery"
|
|
"github.com/mudler/LocalAI/core/schema"
|
|
"github.com/mudler/LocalAI/pkg/format"
|
|
"github.com/mudler/LocalAI/pkg/model"
|
|
"github.com/mudler/LocalAI/pkg/system"
|
|
"github.com/mudler/xlog"
|
|
)
|
|
|
|
type TranscriptCMD struct {
|
|
Filename string `arg:"" name:"file" help:"Audio file to transcribe" type:"path"`
|
|
|
|
Backend string `short:"b" default:"whisper" help:"Backend to run the transcription model"`
|
|
Model string `short:"m" required:"" help:"Model name to run the TTS"`
|
|
Language string `short:"l" help:"Language of the audio file"`
|
|
Translate bool `short:"c" help:"Translate the transcription to English"`
|
|
Diarize bool `short:"d" help:"Mark speaker turns"`
|
|
Threads int `short:"t" default:"1" help:"Number of threads used for parallel computation"`
|
|
BackendsPath string `env:"LOCALAI_BACKENDS_PATH,BACKENDS_PATH" type:"path" default:"${basepath}/backends" help:"Path containing backends used for inferencing" group:"storage"`
|
|
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
|
|
BackendGalleries string `env:"LOCALAI_BACKEND_GALLERIES,BACKEND_GALLERIES" help:"JSON list of backend galleries" group:"backends" default:"${backends}"`
|
|
Prompt string `short:"p" help:"Previous transcribed text or words that hint at what the model should expect"`
|
|
ResponseFormat schema.TranscriptionResponseFormatType `short:"f" default:"" help:"Response format for Whisper models, can be one of (txt, lrc, srt, vtt, json, json_verbose)"`
|
|
PrettyPrint bool `help:"Used with response_format json or json_verbose for pretty printing"`
|
|
}
|
|
|
|
func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
|
|
systemState, err := system.GetSystemState(
|
|
system.WithBackendPath(t.BackendsPath),
|
|
system.WithModelPath(t.ModelsPath),
|
|
)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
opts := &config.ApplicationConfig{
|
|
SystemState: systemState,
|
|
Context: context.Background(),
|
|
}
|
|
|
|
cl := config.NewModelConfigLoader(t.ModelsPath)
|
|
ml := model.NewModelLoader(systemState)
|
|
|
|
if err := gallery.RegisterBackends(systemState, ml); err != nil {
|
|
xlog.Error("error registering external backends", "error", err)
|
|
}
|
|
|
|
if err := cl.LoadModelConfigsFromPath(t.ModelsPath); err != nil {
|
|
return err
|
|
}
|
|
|
|
c, exists := cl.GetModelConfig(t.Model)
|
|
if !exists {
|
|
return errors.New("model not found")
|
|
}
|
|
|
|
c.Threads = &t.Threads
|
|
|
|
defer func() {
|
|
err := ml.StopAllGRPC()
|
|
if err != nil {
|
|
xlog.Error("unable to stop all grpc processes", "error", err)
|
|
}
|
|
}()
|
|
|
|
tr, err := backend.ModelTranscription(t.Filename, t.Language, t.Translate, t.Diarize, t.Prompt, ml, c, opts)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
switch t.ResponseFormat {
|
|
case schema.TranscriptionResponseFormatLrc, schema.TranscriptionResponseFormatSrt, schema.TranscriptionResponseFormatVtt, schema.TranscriptionResponseFormatText:
|
|
fmt.Println(format.TranscriptionResponse(tr, t.ResponseFormat))
|
|
case schema.TranscriptionResponseFormatJson:
|
|
tr.Segments = nil
|
|
fallthrough
|
|
case schema.TranscriptionResponseFormatJsonVerbose:
|
|
var mtr []byte
|
|
var err error
|
|
if t.PrettyPrint {
|
|
mtr, err = json.MarshalIndent(tr, "", " ")
|
|
} else {
|
|
mtr, err = json.Marshal(tr)
|
|
}
|
|
if err != nil {
|
|
return err
|
|
}
|
|
fmt.Println(string(mtr))
|
|
default:
|
|
for _, segment := range tr.Segments {
|
|
fmt.Println(segment.Start.String(), "-", strings.TrimSpace(segment.Text))
|
|
}
|
|
}
|
|
return nil
|
|
}
|