// LocalAI/backend/go/omnivoice-cpp/audio.go

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"os"
	"runtime"

	"github.com/go-audio/audio"
	"github.com/go-audio/wav"
)

// omnivoiceSampleRate is the sample rate, in Hz, stamped into every WAV this
// file produces (the streaming header and the finalized files).
const omnivoiceSampleRate = 24000

// wavHeader24k builds the 44-byte RIFF/WAVE preamble for a mono, 16-bit PCM
// stream sampled at omnivoiceSampleRate.
//
// The stream length is not known when the header is produced, so both the RIFF
// chunk size and the data chunk size carry the placeholder 0xFFFFFFFF. The
// header is meant to be sent as the first TTSStream chunk so the HTTP layer
// receives a self-describing WAV (the gRPC TTSStream path never sets Message,
// so the backend owns the header - see core/backend/tts.go:ModelTTSStream).
func wavHeader24k() []byte {
	const (
		headerSize    = 44
		unknownLength = 0xFFFFFFFF
		fmtChunkSize  = 16
		formatPCM     = 1
		channels      = 1
		bitsPerSample = 16
		frameBytes    = channels * bitsPerSample / 8
	)

	hdr := bytes.NewBuffer(make([]byte, 0, headerSize))

	// Writes into a bytes.Buffer never return an error, so the small emitters
	// below do not propagate one.
	var scratch [4]byte
	tag := func(id string) { hdr.WriteString(id) }
	u16 := func(v uint16) {
		binary.LittleEndian.PutUint16(scratch[:2], v)
		hdr.Write(scratch[:2])
	}
	u32 := func(v uint32) {
		binary.LittleEndian.PutUint32(scratch[:], v)
		hdr.Write(scratch[:])
	}

	// RIFF container.
	tag("RIFF")
	u32(unknownLength)
	tag("WAVE")

	// Format chunk.
	tag("fmt ")
	u32(fmtChunkSize)
	u16(formatPCM)
	u16(channels)
	u32(omnivoiceSampleRate)
	u32(omnivoiceSampleRate * frameBytes) // byte rate
	u16(frameBytes)                       // block align
	u16(bitsPerSample)

	// Data chunk header; the samples follow in later stream chunks.
	tag("data")
	u32(unknownLength)

	return hdr.Bytes()
}

// floatToPCM16LE encodes float samples as little-endian signed 16-bit PCM.
//
// Each sample is clamped to [-1, 1] (so +/-Inf saturate), scaled by 32767 and
// truncated toward zero. A NaN sample is encoded as silence (0): NaN fails
// every ordered comparison, so it would otherwise bypass the clamp and reach a
// float-to-integer conversion whose result the Go spec leaves
// implementation-dependent.
//
// The returned slice holds two bytes per input sample, low byte first. Empty
// or nil input yields an empty, non-nil slice.
func floatToPCM16LE(samples []float32) []byte {
	out := make([]byte, len(samples)*2)
	for i, s := range samples {
		var v int16
		switch {
		case s != s:
			// NaN is the only value unequal to itself; keep v at 0 (silence).
		case s >= 1:
			v = 32767
		case s <= -1:
			v = -32767
		default:
			v = int16(s * 32767)
		}
		out[2*i] = byte(v)
		out[2*i+1] = byte(v >> 8)
	}
	return out
}

// writeWAV24k writes samples as a finalized 24 kHz mono 16-bit WAV at dst.
func writeWAV24k(dst string, samples []float32) error {
	f, err := os.Create(dst)
	if err != nil {
		return fmt.Errorf("omnivoice: create %q: %w", dst, err)
	}
	enc := wav.NewEncoder(f, omnivoiceSampleRate, 16, 1, 1)
	ints := make([]int, len(samples))
	for i, s := range samples {
		if s > 1 {
			s = 1
		} else if s < -1 {
			s = -1
		}
		ints[i] = int(s * 32767)
	}
	b := &audio.IntBuffer{
		Format:         &audio.Format{NumChannels: 1, SampleRate: omnivoiceSampleRate},
		Data:           ints,
		SourceBitDepth: 16,
	}
	if err := enc.Write(b); err != nil {
		_ = enc.Close()
		_ = f.Close()
		return fmt.Errorf("omnivoice: encode WAV: %w", err)
	}
	if err := enc.Close(); err != nil {
		_ = f.Close()
		return fmt.Errorf("omnivoice: finalize WAV: %w", err)
	}
	return f.Close()
}

// readWAVAsFloat loads the WAV file at path and returns it as a mono float32
// signal for use as reference audio.
//
// Multi-channel input is downmixed by averaging the channels of each frame,
// and every frame is divided by 2^(bitDepth-1) to normalize it. A source bit
// depth reported as 0 is treated as 16, and a channel count below 1 as 1.
// No resampling is performed: OmniVoice expects 24 kHz, so callers should
// supply 24 kHz reference clips.
//
// Open and decode failures are returned wrapped with the offending path.
func readWAVAsFloat(path string) ([]float32, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("omnivoice: open ref %q: %w", path, err)
	}
	// The handle is read-only, so a failed close cannot lose data.
	defer func() { _ = file.Close() }()

	pcm, err := wav.NewDecoder(file).FullPCMBuffer()
	if err != nil {
		return nil, fmt.Errorf("omnivoice: decode ref %q: %w", path, err)
	}

	channels := int(pcm.Format.NumChannels)
	if channels < 1 {
		channels = 1
	}
	depth := int(pcm.SourceBitDepth)
	if depth == 0 {
		depth = 16
	}
	// Magnitude of the most negative sample at this bit depth.
	fullScale := float32(int64(1) << uint(depth-1))

	// Data is consumed as interleaved frames of `channels` samples; a trailing
	// partial frame is dropped by the integer division.
	mono := make([]float32, len(pcm.Data)/channels)
	for i := range mono {
		frame := pcm.Data[i*channels : (i+1)*channels]
		sum := 0
		for _, v := range frame {
			sum += v
		}
		mono[i] = float32(sum) / float32(channels) / fullScale
	}
	return mono, nil
}

// runtimeKeepAlive forwards v to runtime.KeepAlive, which keeps v reachable at
// least until this call executes. It is intended for the reference-audio slice
// whose backing pointer is handed to a C call: invoking this after the call
// stops the GC from reclaiming the slice while C is still using it.
func runtimeKeepAlive(v any) {
	runtime.KeepAlive(v)
}