mirror of
https://github.com/mudler/LocalAI.git
synced 2026-04-30 03:55:58 -04:00
feat(backend): Add Sherpa ONNX backend and Omnilingual ASR Adds a new Go backend wrapping sherpa-onnx via purego (no cgo). Same approach as opus/stablediffusion-ggml/whisper — a thin C shim (csrc/shim.c + shim.h → libsherpa-shim.so) wraps the bits purego can't reach directly: nested struct config writes, result-struct field reads, and the streaming TTS callback trampoline. The Go side uses opaque uintptr handles and purego.NewCallback for the TTS callback. Supports: - VAD via sherpa-onnx's Silero VAD - Offline ASR: Whisper, Paraformer, SenseVoice, Omnilingual CTC - Online/streaming ASR: zipformer transducer with endpoint detection (AudioTranscriptionStream emits delta events during decode) - Offline TTS: VITS (LJS, etc.) - Streaming TTS: sherpa-onnx's callback API → PCM chunks on a channel, prefixed by a streaming WAV header Gallery entries: omnilingual-0.3b-ctc-q8-sherpa (1600-language offline ASR), streaming-zipformer-en-sherpa (low-latency streaming ASR), silero-vad-sherpa, vits-ljs-sherpa. E2E coverage: tests/e2e-backends for offline + streaming ASR, tests/e2e for the full realtime pipeline (VAD + STT + TTS). Assisted-by: claude-opus-4-7-1M [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com>
143 lines
4.1 KiB
Go
143 lines
4.1 KiB
Go
package utils
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"os/exec"
|
|
"strings"
|
|
|
|
laudio "github.com/mudler/LocalAI/pkg/audio"
|
|
|
|
"github.com/go-audio/wav"
|
|
)
|
|
|
|
// ffmpegCommand runs the ffmpeg binary with the given arguments and returns
// everything it wrote to stdout and stderr as a single string, together with
// the error (if any) reported by the process.
//
// The child is started with an explicitly empty, non-nil environment, so it
// does not inherit the parent's environment variables.
func ffmpegCommand(args []string) (string, error) {
	// The executable name is a fixed literal on purpose: it lets security
	// scanners see that only ffmpeg can ever be launched from here.
	ffmpeg := exec.Command("ffmpeg", args...)
	ffmpeg.Env = []string{}

	output, runErr := ffmpeg.CombinedOutput()
	return string(output), runErr
}
|
|
|
|
// AudioToWav converts audio to wav for transcribe (16 kHz mono s16le).
|
|
// WAV files already in the target format are passed through directly;
|
|
// everything else is converted via ffmpeg.
|
|
//
|
|
// The pass-through uses a hardlink (with a Copy fallback for cross-fs
|
|
// src/dst) rather than Rename — callers may invoke this twice against
|
|
// the same fixture (e.g. once for AudioTranscription and once for
|
|
// AudioTranscriptionStream) and expect the original file to remain.
|
|
func AudioToWav(src, dst string) error {
|
|
if strings.HasSuffix(src, ".wav") && isTargetWav(src) {
|
|
return passthroughWAV(src, dst)
|
|
}
|
|
return convertWithFFmpeg(src, dst)
|
|
}
|
|
|
|
// passthroughWAV makes the bytes of src available at dst without converting
// them. It first tries to hardlink dst to src and falls back to a plain copy
// when linking fails.
//
// If dst already refers to the same file as src (for example a hardlink left
// behind by an earlier call with the same arguments) there is nothing to do
// and nil is returned. Opening dst for writing in that situation would
// truncate the shared file and destroy src.
//
// Errors from opening/stat-ing src, creating dst, copying, or closing dst are
// returned wrapped with a short description of the failing step.
func passthroughWAV(src, dst string) (err error) {
	if linkErr := os.Link(src, dst); linkErr == nil {
		return nil
	}
	// Fallback: copy. Hardlink fails across filesystems (e.g. src on a
	// read-only mount, dst in /tmp) or when the destination already
	// exists — both are fine; just copy bytes.
	in, err := os.Open(src)
	if err != nil {
		return fmt.Errorf("open src: %w", err)
	}
	defer in.Close()

	srcInfo, err := in.Stat()
	if err != nil {
		return fmt.Errorf("stat src: %w", err)
	}
	// os.Create truncates its target. When dst is the very same file as src
	// that would wipe the source before a single byte is copied, so treat
	// "already the same file" as success.
	if dstInfo, statErr := os.Stat(dst); statErr == nil && os.SameFile(srcInfo, dstInfo) {
		return nil
	}

	out, err := os.Create(dst)
	if err != nil {
		return fmt.Errorf("create dst: %w", err)
	}
	// Write errors can surface only at Close; report them unless an earlier
	// error is already being returned.
	defer func() {
		if closeErr := out.Close(); closeErr != nil && err == nil {
			err = fmt.Errorf("close dst: %w", closeErr)
		}
	}()

	if _, err = io.Copy(out, in); err != nil {
		return fmt.Errorf("copy: %w", err)
	}
	return nil
}
|
|
|
|
// isTargetWav returns true when src is a valid WAV already in the
|
|
// target format (16 kHz, mono, 16-bit PCM).
|
|
func isTargetWav(src string) bool {
|
|
f, err := os.Open(src)
|
|
if err != nil {
|
|
return false
|
|
}
|
|
defer f.Close()
|
|
|
|
dec := wav.NewDecoder(f)
|
|
if !dec.IsValidFile() {
|
|
return false
|
|
}
|
|
return dec.BitDepth == 16 && dec.NumChans == 1 && dec.SampleRate == 16000
|
|
}
|
|
|
|
func convertWithFFmpeg(src, dst string) error {
|
|
commandArgs := []string{"-i", src, "-format", "s16le", "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", dst}
|
|
out, err := ffmpegCommand(commandArgs)
|
|
if err != nil {
|
|
return fmt.Errorf("error: %w out: %s", err, out)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// AudioResample resamples an audio file to the given sample rate using ffmpeg.
|
|
// If sampleRate <= 0, it is a no-op and returns src unchanged.
|
|
func AudioResample(src string, sampleRate int) (string, error) {
|
|
if sampleRate <= 0 {
|
|
return src, nil
|
|
}
|
|
dst := strings.Replace(src, ".wav", fmt.Sprintf("_%dhz.wav", sampleRate), 1)
|
|
commandArgs := []string{"-y", "-i", src, "-ar", fmt.Sprintf("%d", sampleRate), dst}
|
|
out, err := ffmpegCommand(commandArgs)
|
|
if err != nil {
|
|
return "", fmt.Errorf("error resampling audio: %w out: %s", err, out)
|
|
}
|
|
return dst, nil
|
|
}
|
|
|
|
// AudioConvert converts generated wav file from tts to other output formats.
|
|
// TODO: handle pcm to have 100% parity of supported format from OpenAI
|
|
func AudioConvert(src string, format string) (string, error) {
|
|
extension := ""
|
|
// compute file extension from format, default to wav
|
|
switch format {
|
|
case "opus":
|
|
extension = ".ogg"
|
|
case "mp3", "aac", "flac":
|
|
extension = fmt.Sprintf(".%s", format)
|
|
default:
|
|
extension = ".wav"
|
|
}
|
|
|
|
// if .wav, do nothing
|
|
if extension == ".wav" {
|
|
return src, nil
|
|
}
|
|
|
|
// naive conversion based on default values and target extension of file
|
|
dst := strings.Replace(src, ".wav", extension, -1)
|
|
commandArgs := []string{"-y", "-i", src, "-vn", dst}
|
|
out, err := ffmpegCommand(commandArgs)
|
|
if err != nil {
|
|
return "", fmt.Errorf("error: %w out: %s", err, out)
|
|
}
|
|
return dst, nil
|
|
}
|
|
|
|
// WriteWav16kFromReader reads all PCM data from r and writes a 16 kHz mono
|
|
// 16-bit WAV to w. Useful when the PCM length is not known in advance.
|
|
func WriteWav16kFromReader(w io.Writer, r io.Reader) error {
|
|
pcm, err := io.ReadAll(r)
|
|
if err != nil {
|
|
return fmt.Errorf("read pcm: %w", err)
|
|
}
|
|
hdr := laudio.NewWAVHeader(uint32(len(pcm)))
|
|
if err := hdr.Write(w); err != nil {
|
|
return fmt.Errorf("write wav header: %w", err)
|
|
}
|
|
_, err = w.Write(pcm)
|
|
return err
|
|
}
|