package main import ( "encoding/binary" "fmt" "io" "math" "math/rand/v2" "os" "os/exec" "path/filepath" "sync" "testing" "time" pb "github.com/mudler/LocalAI/pkg/grpc/proto" "github.com/mudler/LocalAI/pkg/sound" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" "github.com/pion/rtp" "github.com/pion/webrtc/v4" ) func TestOpusBackend(t *testing.T) { RegisterFailHandler(Fail) RunSpecs(t, "Opus Backend Suite") } // --- helpers --- func generateSineWave(freq float64, sampleRate, numSamples int) []int16 { out := make([]int16, numSamples) for i := range out { t := float64(i) / float64(sampleRate) out[i] = int16(math.MaxInt16 / 2 * math.Sin(2*math.Pi*freq*t)) } return out } func computeRMS(samples []int16) float64 { if len(samples) == 0 { return 0 } var sum float64 for _, s := range samples { v := float64(s) sum += v * v } return math.Sqrt(sum / float64(len(samples))) } func estimateFrequency(samples []int16, sampleRate int) float64 { if len(samples) < 2 { return 0 } crossings := 0 for i := 1; i < len(samples); i++ { if (samples[i-1] >= 0 && samples[i] < 0) || (samples[i-1] < 0 && samples[i] >= 0) { crossings++ } } duration := float64(len(samples)) / float64(sampleRate) return float64(crossings) / (2 * duration) } // encodeDecodeRoundtrip uses the Opus backend to encode PCM and decode all // resulting frames, returning the concatenated decoded samples. func encodeDecodeRoundtrip(o *Opus, pcmBytes []byte, sampleRate int) []int16 { encResult, err := o.AudioEncode(&pb.AudioEncodeRequest{ PcmData: pcmBytes, SampleRate: int32(sampleRate), Channels: 1, }) Expect(err).ToNot(HaveOccurred(), "AudioEncode") if len(encResult.Frames) == 0 { return nil } decResult, err := o.AudioDecode(&pb.AudioDecodeRequest{ Frames: encResult.Frames, }) Expect(err).ToNot(HaveOccurred(), "AudioDecode") return sound.BytesToInt16sLE(decResult.PcmData) } func extractOpusFramesFromOgg(data []byte) [][]byte { var frames [][]byte pos := 0 pageNum := 0 for pos+27 <= len(data) { Expect(string(data[pos:pos+4])).To(Equal("OggS"), fmt.Sprintf("invalid Ogg page at offset %d", pos)) nSegments := int(data[pos+26]) if pos+27+nSegments > len(data) { break } segTable := data[pos+27 : pos+27+nSegments] dataStart := pos + 27 + nSegments var totalDataSize int for _, s := range segTable { totalDataSize += int(s) } if dataStart+totalDataSize > len(data) { break } if pageNum >= 2 { pageData := data[dataStart : dataStart+totalDataSize] offset := 0 var packet []byte for _, segSize := range segTable { packet = append(packet, pageData[offset:offset+int(segSize)]...) offset += int(segSize) if segSize < 255 { if len(packet) > 0 { frameCopy := make([]byte, len(packet)) copy(frameCopy, packet) frames = append(frames, frameCopy) } packet = nil } } if len(packet) > 0 { frameCopy := make([]byte, len(packet)) copy(frameCopy, packet) frames = append(frames, frameCopy) } } pos = dataStart + totalDataSize pageNum++ } return frames } func parseTestWAV(data []byte) (pcm []byte, sampleRate int) { if len(data) < 44 || string(data[0:4]) != "RIFF" { return data, 0 } pos := 12 sr := int(binary.LittleEndian.Uint32(data[24:28])) for pos+8 <= len(data) { id := string(data[pos : pos+4]) sz := int(binary.LittleEndian.Uint32(data[pos+4 : pos+8])) if id == "data" { end := pos + 8 + sz if end > len(data) { end = len(data) } return data[pos+8 : end], sr } pos += 8 + sz if sz%2 != 0 { pos++ } } return data[44:], sr } func writeOggOpus(path string, frames [][]byte, sampleRate, channels int) error { f, err := os.Create(path) if err != nil { return err } defer f.Close() serial := uint32(0x4C6F6341) // "LocA" var pageSeq uint32 const preSkip = 312 opusHead := make([]byte, 19) copy(opusHead[0:8], "OpusHead") opusHead[8] = 1 opusHead[9] = byte(channels) binary.LittleEndian.PutUint16(opusHead[10:12], uint16(preSkip)) binary.LittleEndian.PutUint32(opusHead[12:16], uint32(sampleRate)) binary.LittleEndian.PutUint16(opusHead[16:18], 0) opusHead[18] = 0 if err := writeOggPage(f, serial, pageSeq, 0, 0x02, [][]byte{opusHead}); err != nil { return err } pageSeq++ opusTags := make([]byte, 16) copy(opusTags[0:8], "OpusTags") binary.LittleEndian.PutUint32(opusTags[8:12], 0) binary.LittleEndian.PutUint32(opusTags[12:16], 0) if err := writeOggPage(f, serial, pageSeq, 0, 0x00, [][]byte{opusTags}); err != nil { return err } pageSeq++ var granulePos uint64 for i, frame := range frames { granulePos += 960 headerType := byte(0x00) if i == len(frames)-1 { headerType = 0x04 } if err := writeOggPage(f, serial, pageSeq, granulePos, headerType, [][]byte{frame}); err != nil { return err } pageSeq++ } return nil } func writeOggPage(w io.Writer, serial, pageSeq uint32, granulePos uint64, headerType byte, packets [][]byte) error { var segments []byte var pageData []byte for _, pkt := range packets { remaining := len(pkt) for remaining >= 255 { segments = append(segments, 255) remaining -= 255 } segments = append(segments, byte(remaining)) pageData = append(pageData, pkt...) } hdr := make([]byte, 27+len(segments)) copy(hdr[0:4], "OggS") hdr[4] = 0 hdr[5] = headerType binary.LittleEndian.PutUint64(hdr[6:14], granulePos) binary.LittleEndian.PutUint32(hdr[14:18], serial) binary.LittleEndian.PutUint32(hdr[18:22], pageSeq) hdr[26] = byte(len(segments)) copy(hdr[27:], segments) crc := oggCRC32(hdr, pageData) binary.LittleEndian.PutUint32(hdr[22:26], crc) if _, err := w.Write(hdr); err != nil { return err } _, err := w.Write(pageData) return err } func oggCRC32(header, data []byte) uint32 { var crc uint32 for _, b := range header { crc = (crc << 8) ^ oggCRCTable[byte(crc>>24)^b] } for _, b := range data { crc = (crc << 8) ^ oggCRCTable[byte(crc>>24)^b] } return crc } var oggCRCTable = func() [256]uint32 { var t [256]uint32 for i := range 256 { r := uint32(i) << 24 for range 8 { if r&0x80000000 != 0 { r = (r << 1) ^ 0x04C11DB7 } else { r <<= 1 } } t[i] = r } return t }() func goertzel(samples []int16, targetFreq float64, sampleRate int) float64 { N := len(samples) if N == 0 { return 0 } k := 0.5 + float64(N)*targetFreq/float64(sampleRate) w := 2 * math.Pi * k / float64(N) coeff := 2 * math.Cos(w) var s1, s2 float64 for _, sample := range samples { s0 := float64(sample) + coeff*s1 - s2 s2 = s1 s1 = s0 } return s1*s1 + s2*s2 - coeff*s1*s2 } func computeTHD(samples []int16, fundamentalHz float64, sampleRate, numHarmonics int) float64 { fundPower := goertzel(samples, fundamentalHz, sampleRate) if fundPower <= 0 { return 0 } var harmonicSum float64 for h := 2; h <= numHarmonics; h++ { harmonicSum += goertzel(samples, fundamentalHz*float64(h), sampleRate) } return math.Sqrt(harmonicSum/fundPower) * 100 } // --- Opus specs --- var _ = Describe("Opus", func() { var o *Opus BeforeEach(func() { o = &Opus{} Expect(o.Load(&pb.ModelOptions{})).To(Succeed()) }) It("decodes Chrome-like VoIP frames", func() { enc, err := NewEncoder(48000, 1, ApplicationVoIP) Expect(err).ToNot(HaveOccurred()) defer enc.Close() Expect(enc.SetBitrate(32000)).To(Succeed()) Expect(enc.SetComplexity(5)).To(Succeed()) sine := generateSineWave(440, 48000, 48000) packet := make([]byte, 4000) var opusFrames [][]byte for offset := 0; offset+opusFrameSize <= len(sine); offset += opusFrameSize { frame := sine[offset : offset+opusFrameSize] n, err := enc.Encode(frame, opusFrameSize, packet) Expect(err).ToNot(HaveOccurred(), "VoIP encode") out := make([]byte, n) copy(out, packet[:n]) opusFrames = append(opusFrames, out) } result, err := o.AudioDecode(&pb.AudioDecodeRequest{Frames: opusFrames}) Expect(err).ToNot(HaveOccurred()) allDecoded := sound.BytesToInt16sLE(result.PcmData) Expect(allDecoded).ToNot(BeEmpty(), "no decoded samples from VoIP encoder") skip := min(len(allDecoded)/4, 48000*100/1000) tail := allDecoded[skip:] rms := computeRMS(tail) GinkgoWriter.Printf("VoIP/SILK roundtrip: %d decoded samples, RMS=%.1f\n", len(allDecoded), rms) Expect(rms).To(BeNumerically(">=", 50), "VoIP decoded RMS is too low; SILK decoder may be broken") }) It("decodes stereo-encoded Opus with a mono decoder", func() { enc, err := NewEncoder(48000, 2, ApplicationVoIP) Expect(err).ToNot(HaveOccurred()) defer enc.Close() Expect(enc.SetBitrate(32000)).To(Succeed()) mono := generateSineWave(440, 48000, 48000) stereo := make([]int16, len(mono)*2) for i, s := range mono { stereo[i*2] = s stereo[i*2+1] = s } packet := make([]byte, 4000) var opusFrames [][]byte for offset := 0; offset+opusFrameSize*2 <= len(stereo); offset += opusFrameSize * 2 { frame := stereo[offset : offset+opusFrameSize*2] n, err := enc.Encode(frame, opusFrameSize, packet) Expect(err).ToNot(HaveOccurred(), "Stereo encode") out := make([]byte, n) copy(out, packet[:n]) opusFrames = append(opusFrames, out) } result, err := o.AudioDecode(&pb.AudioDecodeRequest{Frames: opusFrames}) Expect(err).ToNot(HaveOccurred()) allDecoded := sound.BytesToInt16sLE(result.PcmData) Expect(allDecoded).ToNot(BeEmpty(), "no decoded samples from stereo encoder") skip := min(len(allDecoded)/4, 48000*100/1000) tail := allDecoded[skip:] rms := computeRMS(tail) GinkgoWriter.Printf("Stereo->Mono: %d decoded samples, RMS=%.1f\n", len(allDecoded), rms) Expect(rms).To(BeNumerically(">=", 50), "Stereo->Mono decoded RMS is too low") }) Describe("decoding libopus-encoded audio", func() { var ffmpegPath string var tmpDir string var pcmPath string var sine []int16 BeforeEach(func() { var err error ffmpegPath, err = exec.LookPath("ffmpeg") if err != nil { Skip("ffmpeg not found") } tmpDir = GinkgoT().TempDir() sine = generateSineWave(440, 48000, 48000) pcmBytes := sound.Int16toBytesLE(sine) pcmPath = filepath.Join(tmpDir, "input.raw") Expect(os.WriteFile(pcmPath, pcmBytes, 0644)).To(Succeed()) }) for _, tc := range []struct { name string bitrate string app string }{ {"voip_32k", "32000", "voip"}, {"voip_64k", "64000", "voip"}, {"audio_64k", "64000", "audio"}, {"audio_128k", "128000", "audio"}, } { tc := tc It(tc.name, func() { oggPath := filepath.Join(tmpDir, fmt.Sprintf("libopus_%s_%s.ogg", tc.app, tc.bitrate)) cmd := exec.Command(ffmpegPath, "-y", "-f", "s16le", "-ar", "48000", "-ac", "1", "-i", pcmPath, "-c:a", "libopus", "-b:a", tc.bitrate, "-application", tc.app, "-frame_duration", "20", "-vbr", "on", oggPath, ) out, err := cmd.CombinedOutput() Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("ffmpeg encode: %s", out)) oggData, err := os.ReadFile(oggPath) Expect(err).ToNot(HaveOccurred()) opusFrames := extractOpusFramesFromOgg(oggData) Expect(opusFrames).ToNot(BeEmpty(), "no Opus frames extracted from Ogg container") GinkgoWriter.Printf("Extracted %d Opus frames from libopus encoder (first frame %d bytes)\n", len(opusFrames), len(opusFrames[0])) result, err := o.AudioDecode(&pb.AudioDecodeRequest{Frames: opusFrames}) Expect(err).ToNot(HaveOccurred()) allDecoded := sound.BytesToInt16sLE(result.PcmData) Expect(allDecoded).ToNot(BeEmpty(), "no decoded samples from libopus-encoded Opus") skip := min(len(allDecoded)/4, 48000*100/1000) tail := allDecoded[skip:] rms := computeRMS(tail) freq := estimateFrequency(tail, 48000) GinkgoWriter.Printf("libopus->opus-go: %d decoded samples, RMS=%.1f, freq≈%.0f Hz\n", len(allDecoded), rms, freq) Expect(rms).To(BeNumerically(">=", 50), "RMS is too low — opus-go cannot decode libopus output") Expect(freq).To(BeNumerically("~", 440, 30), fmt.Sprintf("frequency %.0f Hz deviates from expected 440 Hz", freq)) }) } }) It("roundtrips at 48kHz", func() { sine := generateSineWave(440, 48000, 48000) pcmBytes := sound.Int16toBytesLE(sine) decoded := encodeDecodeRoundtrip(o, pcmBytes, 48000) Expect(decoded).ToNot(BeEmpty()) decodedSR := 48000 skipDecoded := decodedSR * 50 / 1000 if skipDecoded > len(decoded)/2 { skipDecoded = len(decoded) / 4 } tail := decoded[skipDecoded:] rms := computeRMS(tail) GinkgoWriter.Printf("48kHz roundtrip: %d decoded samples, RMS=%.1f\n", len(decoded), rms) Expect(rms).To(BeNumerically(">=", 50), "decoded audio RMS is too low; signal appears silent") }) It("roundtrips at 16kHz", func() { sine16k := generateSineWave(440, 16000, 16000) pcmBytes := sound.Int16toBytesLE(sine16k) decoded := encodeDecodeRoundtrip(o, pcmBytes, 16000) Expect(decoded).ToNot(BeEmpty()) decoded16k := sound.ResampleInt16(decoded, 48000, 16000) skip := min(len(decoded16k)/4, 16000*50/1000) tail := decoded16k[skip:] rms := computeRMS(tail) GinkgoWriter.Printf("16kHz roundtrip: %d decoded@48k -> %d resampled@16k, RMS=%.1f\n", len(decoded), len(decoded16k), rms) Expect(rms).To(BeNumerically(">=", 50), "decoded audio RMS is too low; signal appears silent") }) It("returns empty frames for empty input", func() { result, err := o.AudioEncode(&pb.AudioEncodeRequest{ PcmData: []byte{}, SampleRate: 48000, Channels: 1, }) Expect(err).ToNot(HaveOccurred()) Expect(result.Frames).To(BeEmpty()) }) It("silently drops sub-frame input", func() { sine := generateSineWave(440, 48000, 500) // < 960 pcmBytes := sound.Int16toBytesLE(sine) result, err := o.AudioEncode(&pb.AudioEncodeRequest{ PcmData: pcmBytes, SampleRate: 48000, Channels: 1, }) Expect(err).ToNot(HaveOccurred()) Expect(result.Frames).To(BeEmpty(), fmt.Sprintf("expected 0 frames for %d samples (< 960)", len(sine))) }) It("encodes multiple frames", func() { sine := generateSineWave(440, 48000, 2880) // exactly 3 frames pcmBytes := sound.Int16toBytesLE(sine) result, err := o.AudioEncode(&pb.AudioEncodeRequest{ PcmData: pcmBytes, SampleRate: 48000, Channels: 1, }) Expect(err).ToNot(HaveOccurred()) Expect(result.Frames).To(HaveLen(3)) }) It("produces expected decoded frame size", func() { sine := generateSineWave(440, 48000, 960) pcmBytes := sound.Int16toBytesLE(sine) encResult, err := o.AudioEncode(&pb.AudioEncodeRequest{ PcmData: pcmBytes, SampleRate: 48000, Channels: 1, }) Expect(err).ToNot(HaveOccurred()) Expect(encResult.Frames).To(HaveLen(1)) decResult, err := o.AudioDecode(&pb.AudioDecodeRequest{ Frames: encResult.Frames, }) Expect(err).ToNot(HaveOccurred()) decoded := sound.BytesToInt16sLE(decResult.PcmData) GinkgoWriter.Printf("Encoder input: 960 samples (20ms @ 48kHz)\n") GinkgoWriter.Printf("Decoder output: %d samples (%.1fms @ 48kHz)\n", len(decoded), float64(len(decoded))/48.0) Expect(len(decoded)).To(SatisfyAny(Equal(960), Equal(480)), fmt.Sprintf("unexpected decoded frame size %d", len(decoded))) }) It("handles the full WebRTC output path", func() { sine16k := generateSineWave(440, 16000, 16000) pcmBytes := sound.Int16toBytesLE(sine16k) decoded := encodeDecodeRoundtrip(o, pcmBytes, 16000) Expect(decoded).ToNot(BeEmpty()) rms := computeRMS(decoded) GinkgoWriter.Printf("WebRTC output path: %d decoded samples at 48kHz, RMS=%.1f\n", len(decoded), rms) Expect(rms).To(BeNumerically(">=", 50), "decoded audio RMS is too low") }) It("handles the full WebRTC input path", func() { sine48k := generateSineWave(440, 48000, 48000) pcmBytes := sound.Int16toBytesLE(sine48k) decoded48k := encodeDecodeRoundtrip(o, pcmBytes, 48000) Expect(decoded48k).ToNot(BeEmpty()) step24k := sound.ResampleInt16(decoded48k, 48000, 24000) webrtcPath := sound.ResampleInt16(step24k, 24000, 16000) rms := computeRMS(webrtcPath) GinkgoWriter.Printf("WebRTC input path: %d decoded@48k -> %d@24k -> %d@16k, RMS=%.1f\n", len(decoded48k), len(step24k), len(webrtcPath), rms) Expect(rms).To(BeNumerically(">=", 50), "WebRTC input path signal lost in pipeline") }) Context("bug documentation", func() { It("documents trailing sample loss", func() { sine := generateSineWave(440, 48000, 1000) pcmBytes := sound.Int16toBytesLE(sine) result, err := o.AudioEncode(&pb.AudioEncodeRequest{ PcmData: pcmBytes, SampleRate: 48000, Channels: 1, }) Expect(err).ToNot(HaveOccurred()) Expect(result.Frames).To(HaveLen(1)) decResult, err := o.AudioDecode(&pb.AudioDecodeRequest{Frames: result.Frames}) Expect(err).ToNot(HaveOccurred()) decoded := sound.BytesToInt16sLE(decResult.PcmData) GinkgoWriter.Printf("Input: 1000 samples, Encoded: 1 frame, Decoded: %d samples (40 samples lost)\n", len(decoded)) Expect(len(decoded)).To(BeNumerically("<=", 960), fmt.Sprintf("decoded more samples (%d) than the encoder consumed (960)", len(decoded))) }) It("documents TTS sample rate mismatch", func() { sine24k := generateSineWave(440, 24000, 24000) pcmBytes := sound.Int16toBytesLE(sine24k) decodedBug := encodeDecodeRoundtrip(o, pcmBytes, 16000) decodedCorrect := encodeDecodeRoundtrip(o, pcmBytes, 24000) skipBug := min(len(decodedBug)/4, 48000*100/1000) skipCorrect := min(len(decodedCorrect)/4, 48000*100/1000) bugTail := decodedBug[skipBug:] correctTail := decodedCorrect[skipCorrect:] bugFreq := estimateFrequency(bugTail, 48000) correctFreq := estimateFrequency(correctTail, 48000) GinkgoWriter.Printf("Bug path: %d decoded samples, freq≈%.0f Hz (expected ~660 Hz = 440*1.5)\n", len(decodedBug), bugFreq) GinkgoWriter.Printf("Correct path: %d decoded samples, freq≈%.0f Hz (expected ~440 Hz)\n", len(decodedCorrect), correctFreq) if len(decodedBug) > 0 && len(decodedCorrect) > 0 { ratio := float64(len(decodedBug)) / float64(len(decodedCorrect)) GinkgoWriter.Printf("Sample count ratio (bug/correct): %.2f (expected ~1.5)\n", ratio) Expect(ratio).To(BeNumerically(">=", 1.1), "expected bug path to produce significantly more samples due to wrong resample ratio") } }) }) Context("batch boundary discontinuity", func() { // These tests simulate the exact production pipeline: // Browser encodes → RTP → batch 15 frames (300ms) → decode → resample 48k→16k → append // They test both with and without persistent decoders to verify // that the session_id persistent decoder path works correctly. It("batched decode+resample with persistent decoder matches one-shot", func() { // Encode 3 seconds of 440Hz at 48kHz — enough for 10 batches sine := generateSineWave(440, 48000, 48000*3) pcmBytes := sound.Int16toBytesLE(sine) encResult, err := o.AudioEncode(&pb.AudioEncodeRequest{ PcmData: pcmBytes, SampleRate: 48000, Channels: 1, }) Expect(err).ToNot(HaveOccurred()) GinkgoWriter.Printf("Encoded %d frames (%.0fms)\n", len(encResult.Frames), float64(len(encResult.Frames))*20.0) // Ground truth: decode ALL frames with one decoder, resample in one shot decAll, err := o.AudioDecode(&pb.AudioDecodeRequest{ Frames: encResult.Frames, Options: map[string]string{"session_id": "ground-truth"}, }) Expect(err).ToNot(HaveOccurred()) allSamples := sound.BytesToInt16sLE(decAll.PcmData) oneShotResampled := sound.ResampleInt16(allSamples, 48000, 16000) // Production path: decode in 15-frame batches with persistent decoder, // resample each batch independently, concatenate const framesPerBatch = 15 sessionID := "batch-test" var batchedResampled []int16 batchCount := 0 for i := 0; i < len(encResult.Frames); i += framesPerBatch { end := min(i+framesPerBatch, len(encResult.Frames)) decBatch, err := o.AudioDecode(&pb.AudioDecodeRequest{ Frames: encResult.Frames[i:end], Options: map[string]string{"session_id": sessionID}, }) Expect(err).ToNot(HaveOccurred()) batchSamples := sound.BytesToInt16sLE(decBatch.PcmData) batchResampled := sound.ResampleInt16(batchSamples, 48000, 16000) batchedResampled = append(batchedResampled, batchResampled...) batchCount++ } GinkgoWriter.Printf("Decoded in %d batches, oneshot=%d samples, batched=%d samples\n", batchCount, len(oneShotResampled), len(batchedResampled)) // Skip codec startup transient (first 100ms) skip := 16000 * 100 / 1000 oneShotTail := oneShotResampled[skip:] batchedTail := batchedResampled[skip:] minLen := min(len(oneShotTail), len(batchedTail)) // With persistent decoder, batched decode should be nearly identical // to one-shot (only difference is resampler batch boundaries). var maxDiff float64 var sumDiffSq float64 for i := range minLen { diff := math.Abs(float64(oneShotTail[i]) - float64(batchedTail[i])) if diff > maxDiff { maxDiff = diff } sumDiffSq += diff * diff } rmsDiff := math.Sqrt(sumDiffSq / float64(minLen)) GinkgoWriter.Printf("Persistent decoder: maxDiff=%.0f, rmsDiff=%.1f\n", maxDiff, rmsDiff) // Tight threshold: with persistent decoder and fixed resampler, // the output should be very close to one-shot Expect(maxDiff).To(BeNumerically("<", 500), "persistent decoder batched path diverges too much from one-shot") Expect(rmsDiff).To(BeNumerically("<", 50), "RMS deviation too high between batched and one-shot") }) It("fresh decoder per batch produces worse quality than persistent", func() { // This test proves the value of persistent decoders by showing // that fresh decoders produce larger deviations at batch boundaries. sine := generateSineWave(440, 48000, 48000*2) pcmBytes := sound.Int16toBytesLE(sine) encResult, err := o.AudioEncode(&pb.AudioEncodeRequest{ PcmData: pcmBytes, SampleRate: 48000, Channels: 1, }) Expect(err).ToNot(HaveOccurred()) // Ground truth: one-shot decode decAll, err := o.AudioDecode(&pb.AudioDecodeRequest{ Frames: encResult.Frames, Options: map[string]string{"session_id": "ref"}, }) Expect(err).ToNot(HaveOccurred()) refSamples := sound.BytesToInt16sLE(decAll.PcmData) const framesPerBatch = 15 // Path A: persistent decoder var persistentSamples []int16 for i := 0; i < len(encResult.Frames); i += framesPerBatch { end := min(i+framesPerBatch, len(encResult.Frames)) dec, err := o.AudioDecode(&pb.AudioDecodeRequest{ Frames: encResult.Frames[i:end], Options: map[string]string{"session_id": "persistent"}, }) Expect(err).ToNot(HaveOccurred()) persistentSamples = append(persistentSamples, sound.BytesToInt16sLE(dec.PcmData)...) } // Path B: fresh decoder per batch (no session_id) var freshSamples []int16 for i := 0; i < len(encResult.Frames); i += framesPerBatch { end := min(i+framesPerBatch, len(encResult.Frames)) dec, err := o.AudioDecode(&pb.AudioDecodeRequest{ Frames: encResult.Frames[i:end], }) Expect(err).ToNot(HaveOccurred()) freshSamples = append(freshSamples, sound.BytesToInt16sLE(dec.PcmData)...) } // Compare both to reference skip := 48000 * 100 / 1000 refTail := refSamples[skip:] persistentTail := persistentSamples[skip:] freshTail := freshSamples[skip:] minLen := min(len(refTail), min(len(persistentTail), len(freshTail))) var persistentMaxDiff, freshMaxDiff float64 for i := range minLen { pd := math.Abs(float64(refTail[i]) - float64(persistentTail[i])) fd := math.Abs(float64(refTail[i]) - float64(freshTail[i])) if pd > persistentMaxDiff { persistentMaxDiff = pd } if fd > freshMaxDiff { freshMaxDiff = fd } } GinkgoWriter.Printf("vs reference: persistent maxDiff=%.0f, fresh maxDiff=%.0f\n", persistentMaxDiff, freshMaxDiff) // Persistent decoder should be closer to reference than fresh Expect(persistentMaxDiff).To(BeNumerically("<=", freshMaxDiff), "persistent decoder should match reference at least as well as fresh decoder") }) It("checks for PCM discontinuities at batch boundaries", func() { // Encode 2 seconds, decode in batches, resample, and check // for anomalous jumps at the exact batch boundaries in the output sine := generateSineWave(440, 48000, 48000*2) pcmBytes := sound.Int16toBytesLE(sine) encResult, err := o.AudioEncode(&pb.AudioEncodeRequest{ PcmData: pcmBytes, SampleRate: 48000, Channels: 1, }) Expect(err).ToNot(HaveOccurred()) const framesPerBatch = 15 sessionID := "boundary-check" var batchedOutput []int16 var batchBoundaries []int // indices where batch boundaries fall in output for i := 0; i < len(encResult.Frames); i += framesPerBatch { end := min(i+framesPerBatch, len(encResult.Frames)) dec, err := o.AudioDecode(&pb.AudioDecodeRequest{ Frames: encResult.Frames[i:end], Options: map[string]string{"session_id": sessionID}, }) Expect(err).ToNot(HaveOccurred()) batchSamples := sound.BytesToInt16sLE(dec.PcmData) batchResampled := sound.ResampleInt16(batchSamples, 48000, 16000) if len(batchedOutput) > 0 { batchBoundaries = append(batchBoundaries, len(batchedOutput)) } batchedOutput = append(batchedOutput, batchResampled...) } GinkgoWriter.Printf("Output: %d samples, %d batch boundaries\n", len(batchedOutput), len(batchBoundaries)) // For each batch boundary, check if the sample-to-sample jump // is anomalously large compared to neighboring deltas for bIdx, boundary := range batchBoundaries { if boundary < 10 || boundary+10 >= len(batchedOutput) { continue } jump := math.Abs(float64(batchedOutput[boundary]) - float64(batchedOutput[boundary-1])) // Compute average delta in the 20-sample neighborhood (excluding boundary) var avgDelta float64 count := 0 for i := boundary - 10; i < boundary+10; i++ { if i == boundary-1 || i == boundary { continue } if i+1 < len(batchedOutput) { avgDelta += math.Abs(float64(batchedOutput[i+1]) - float64(batchedOutput[i])) count++ } } if count > 0 { avgDelta /= float64(count) } ratio := 0.0 if avgDelta > 0 { ratio = jump / avgDelta } GinkgoWriter.Printf("Boundary %d (idx %d): jump=%.0f, avg_delta=%.0f, ratio=%.1f\n", bIdx, boundary, jump, avgDelta, ratio) // The boundary jump should not be more than 5x the average // (with codec artifacts, some variation is expected) Expect(jump).To(BeNumerically("<=", avgDelta*5+1), fmt.Sprintf("discontinuity at batch boundary %d: jump=%.0f vs avg=%.0f (ratio=%.1f)", bIdx, jump, avgDelta, ratio)) } }) It("maintains sine wave phase continuity across batches", func() { sine := generateSineWave(440, 48000, 48000*2) // 2 seconds pcmBytes := sound.Int16toBytesLE(sine) encResult, err := o.AudioEncode(&pb.AudioEncodeRequest{ PcmData: pcmBytes, SampleRate: 48000, Channels: 1, }) Expect(err).ToNot(HaveOccurred()) // Decode in batches with persistent decoder, resample each const framesPerBatch = 15 sessionID := "phase-test" var fullOutput []int16 for i := 0; i < len(encResult.Frames); i += framesPerBatch { end := min(i+framesPerBatch, len(encResult.Frames)) dec, err := o.AudioDecode(&pb.AudioDecodeRequest{ Frames: encResult.Frames[i:end], Options: map[string]string{"session_id": sessionID}, }) Expect(err).ToNot(HaveOccurred()) samples := sound.BytesToInt16sLE(dec.PcmData) resampled := sound.ResampleInt16(samples, 48000, 16000) fullOutput = append(fullOutput, resampled...) } // Check zero-crossing regularity after startup transient skip := 16000 * 200 / 1000 // skip first 200ms tail := fullOutput[skip:] var crossingPositions []int for i := 1; i < len(tail); i++ { if (tail[i-1] >= 0 && tail[i] < 0) || (tail[i-1] < 0 && tail[i] >= 0) { crossingPositions = append(crossingPositions, i) } } Expect(crossingPositions).ToNot(BeEmpty(), "no zero crossings found") var intervals []float64 for i := 1; i < len(crossingPositions); i++ { intervals = append(intervals, float64(crossingPositions[i]-crossingPositions[i-1])) } var sum float64 for _, v := range intervals { sum += v } mean := sum / float64(len(intervals)) var variance float64 for _, v := range intervals { d := v - mean variance += d * d } stddev := math.Sqrt(variance / float64(len(intervals))) GinkgoWriter.Printf("Zero-crossing intervals: mean=%.2f stddev=%.2f CV=%.3f (expected period ~%.1f)\n", mean, stddev, stddev/mean, 16000.0/440.0/2.0) Expect(stddev/mean).To(BeNumerically("<", 0.15), fmt.Sprintf("irregular zero crossings suggest discontinuity: CV=%.3f", stddev/mean)) // Also check frequency is correct freq := estimateFrequency(tail, 16000) GinkgoWriter.Printf("Estimated frequency: %.0f Hz (expected 440)\n", freq) Expect(freq).To(BeNumerically("~", 440, 20)) }) It("produces identical resampled output for batched vs one-shot resample", func() { // Isolate the resampler from the codec: decode once, then compare // one-shot resample vs batched resample of the same PCM. sine := generateSineWave(440, 48000, 48000*3) pcmBytes := sound.Int16toBytesLE(sine) encResult, err := o.AudioEncode(&pb.AudioEncodeRequest{ PcmData: pcmBytes, SampleRate: 48000, Channels: 1, }) Expect(err).ToNot(HaveOccurred()) decResult, err := o.AudioDecode(&pb.AudioDecodeRequest{ Frames: encResult.Frames, Options: map[string]string{"session_id": "resample-test"}, }) Expect(err).ToNot(HaveOccurred()) allSamples := sound.BytesToInt16sLE(decResult.PcmData) // One-shot resample oneShot := sound.ResampleInt16(allSamples, 48000, 16000) // Batched resample (300ms chunks at 48kHz = 14400 samples) batchSize := 48000 * 300 / 1000 var batched []int16 for offset := 0; offset < len(allSamples); offset += batchSize { end := min(offset+batchSize, len(allSamples)) chunk := sound.ResampleInt16(allSamples[offset:end], 48000, 16000) batched = append(batched, chunk...) } Expect(len(batched)).To(Equal(len(oneShot)), fmt.Sprintf("length mismatch: batched=%d oneshot=%d", len(batched), len(oneShot))) // Every sample must be identical — the resampler is deterministic var maxDiff float64 for i := range len(oneShot) { diff := math.Abs(float64(oneShot[i]) - float64(batched[i])) if diff > maxDiff { maxDiff = diff } } GinkgoWriter.Printf("Resample-only: batched vs one-shot maxDiff=%.0f\n", maxDiff) Expect(maxDiff).To(BeNumerically("==", 0), "batched resample should produce identical output to one-shot resample") }) It("writes WAV files for manual inspection", func() { // This test writes WAV files of the batched vs one-shot pipeline // so you can visually/audibly inspect for discontinuities. tmpDir := GinkgoT().TempDir() sine := generateSineWave(440, 48000, 48000*3) // 3 seconds pcmBytes := sound.Int16toBytesLE(sine) encResult, err := o.AudioEncode(&pb.AudioEncodeRequest{ PcmData: pcmBytes, SampleRate: 48000, Channels: 1, }) Expect(err).ToNot(HaveOccurred()) // One-shot path (reference) decAll, err := o.AudioDecode(&pb.AudioDecodeRequest{ Frames: encResult.Frames, Options: map[string]string{"session_id": "wav-ref"}, }) Expect(err).ToNot(HaveOccurred()) refSamples := sound.BytesToInt16sLE(decAll.PcmData) refResampled := sound.ResampleInt16(refSamples, 48000, 16000) // Batched path (production simulation) const framesPerBatch = 15 var batchedResampled []int16 for i := 0; i < len(encResult.Frames); i += framesPerBatch { end := min(i+framesPerBatch, len(encResult.Frames)) dec, err := o.AudioDecode(&pb.AudioDecodeRequest{ Frames: encResult.Frames[i:end], Options: map[string]string{"session_id": "wav-batched"}, }) Expect(err).ToNot(HaveOccurred()) samples := sound.BytesToInt16sLE(dec.PcmData) resampled := sound.ResampleInt16(samples, 48000, 16000) batchedResampled = append(batchedResampled, resampled...) } // Write WAV files writeWAV := func(path string, samples []int16, sampleRate int) { dataLen := len(samples) * 2 hdr := make([]byte, 44) copy(hdr[0:4], "RIFF") binary.LittleEndian.PutUint32(hdr[4:8], uint32(36+dataLen)) copy(hdr[8:12], "WAVE") copy(hdr[12:16], "fmt ") binary.LittleEndian.PutUint32(hdr[16:20], 16) // chunk size binary.LittleEndian.PutUint16(hdr[20:22], 1) // PCM binary.LittleEndian.PutUint16(hdr[22:24], 1) // mono binary.LittleEndian.PutUint32(hdr[24:28], uint32(sampleRate)) // sample rate binary.LittleEndian.PutUint32(hdr[28:32], uint32(sampleRate*2)) // byte rate binary.LittleEndian.PutUint16(hdr[32:34], 2) // block align binary.LittleEndian.PutUint16(hdr[34:36], 16) // bits per sample copy(hdr[36:40], "data") binary.LittleEndian.PutUint32(hdr[40:44], uint32(dataLen)) f, err := os.Create(path) Expect(err).ToNot(HaveOccurred()) defer f.Close() _, err = f.Write(hdr) Expect(err).ToNot(HaveOccurred()) _, err = f.Write(sound.Int16toBytesLE(samples)) Expect(err).ToNot(HaveOccurred()) } refPath := filepath.Join(tmpDir, "oneshot_16k.wav") batchedPath := filepath.Join(tmpDir, "batched_16k.wav") writeWAV(refPath, refResampled, 16000) writeWAV(batchedPath, batchedResampled, 16000) GinkgoWriter.Printf("WAV files written for manual inspection:\n") GinkgoWriter.Printf(" Reference: %s\n", refPath) GinkgoWriter.Printf(" Batched: %s\n", batchedPath) GinkgoWriter.Printf(" Ref samples: %d, Batched samples: %d\n", len(refResampled), len(batchedResampled)) }) }) It("produces frames decodable by ffmpeg (cross-library compat)", func() { ffmpegPath, err := exec.LookPath("ffmpeg") if err != nil { Skip("ffmpeg not found") } sine := generateSineWave(440, 48000, 48000) pcmBytes := sound.Int16toBytesLE(sine) result, err := o.AudioEncode(&pb.AudioEncodeRequest{ PcmData: pcmBytes, SampleRate: 48000, Channels: 1, }) Expect(err).ToNot(HaveOccurred()) Expect(result.Frames).ToNot(BeEmpty()) GinkgoWriter.Printf("opus-go produced %d frames (first frame %d bytes)\n", len(result.Frames), len(result.Frames[0])) tmpDir := GinkgoT().TempDir() oggPath := filepath.Join(tmpDir, "opus_go_output.ogg") Expect(writeOggOpus(oggPath, result.Frames, 48000, 1)).To(Succeed()) decodedWavPath := filepath.Join(tmpDir, "ffmpeg_decoded.wav") cmd := exec.Command(ffmpegPath, "-y", "-i", oggPath, "-ar", "48000", "-ac", "1", "-c:a", "pcm_s16le", decodedWavPath) out, err := cmd.CombinedOutput() Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("ffmpeg failed to decode opus-go output: %s", out)) decodedData, err := os.ReadFile(decodedWavPath) Expect(err).ToNot(HaveOccurred()) decodedPCM, sr := parseTestWAV(decodedData) Expect(sr).ToNot(BeZero(), "ffmpeg output has no WAV header") decodedSamples := sound.BytesToInt16sLE(decodedPCM) skip := min(len(decodedSamples)/4, sr*100/1000) if skip >= len(decodedSamples) { skip = 0 } tail := decodedSamples[skip:] rms := computeRMS(tail) GinkgoWriter.Printf("ffmpeg decoded opus-go output: %d samples at %dHz, RMS=%.1f\n", len(decodedSamples), sr, rms) Expect(rms).To(BeNumerically(">=", 50), "ffmpeg decoded RMS is too low — opus-go frames are likely incompatible with standard decoders") }) It("delivers audio through a full WebRTC pipeline", func() { const ( toneFreq = 440.0 toneSampleRate = 24000 toneDuration = 1 toneAmplitude = 16000 toneNumSamples = toneSampleRate * toneDuration ) pcm := make([]byte, toneNumSamples*2) for i := range toneNumSamples { sample := int16(toneAmplitude * math.Sin(2*math.Pi*toneFreq*float64(i)/float64(toneSampleRate))) binary.LittleEndian.PutUint16(pcm[i*2:], uint16(sample)) } encResult, err := o.AudioEncode(&pb.AudioEncodeRequest{ PcmData: pcm, SampleRate: toneSampleRate, Channels: 1, }) Expect(err).ToNot(HaveOccurred()) Expect(encResult.Frames).ToNot(BeEmpty()) GinkgoWriter.Printf("Encoded %d Opus frames from %d PCM samples at %dHz\n", len(encResult.Frames), toneNumSamples, toneSampleRate) // Create sender PeerConnection senderME := &webrtc.MediaEngine{} Expect(senderME.RegisterDefaultCodecs()).To(Succeed()) senderAPI := webrtc.NewAPI(webrtc.WithMediaEngine(senderME)) senderPC, err := senderAPI.NewPeerConnection(webrtc.Configuration{}) Expect(err).ToNot(HaveOccurred()) defer senderPC.Close() audioTrack, err := webrtc.NewTrackLocalStaticRTP( webrtc.RTPCodecCapability{ MimeType: webrtc.MimeTypeOpus, ClockRate: 48000, Channels: 2, }, "audio", "test", ) Expect(err).ToNot(HaveOccurred()) rtpSender, err := senderPC.AddTrack(audioTrack) Expect(err).ToNot(HaveOccurred()) go func() { buf := make([]byte, 1500) for { if _, _, err := rtpSender.Read(buf); err != nil { return } } }() // Create receiver PeerConnection receiverME := &webrtc.MediaEngine{} Expect(receiverME.RegisterDefaultCodecs()).To(Succeed()) receiverAPI := webrtc.NewAPI(webrtc.WithMediaEngine(receiverME)) receiverPC, err := receiverAPI.NewPeerConnection(webrtc.Configuration{}) Expect(err).ToNot(HaveOccurred()) defer receiverPC.Close() type receivedPacket struct { seqNum uint16 timestamp uint32 marker bool payload []byte } var ( receivedMu sync.Mutex receivedPackets []receivedPacket trackDone = make(chan struct{}) ) receiverPC.OnTrack(func(track *webrtc.TrackRemote, receiver *webrtc.RTPReceiver) { defer close(trackDone) for { pkt, _, err := track.ReadRTP() if err != nil { return } payload := make([]byte, len(pkt.Payload)) copy(payload, pkt.Payload) receivedMu.Lock() receivedPackets = append(receivedPackets, receivedPacket{ seqNum: pkt.Header.SequenceNumber, timestamp: pkt.Header.Timestamp, marker: pkt.Header.Marker, payload: payload, }) receivedMu.Unlock() } }) // Exchange SDP offer, err := senderPC.CreateOffer(nil) Expect(err).ToNot(HaveOccurred()) Expect(senderPC.SetLocalDescription(offer)).To(Succeed()) senderGatherDone := webrtc.GatheringCompletePromise(senderPC) Eventually(senderGatherDone, 5*time.Second).Should(BeClosed()) Expect(receiverPC.SetRemoteDescription(*senderPC.LocalDescription())).To(Succeed()) answer, err := receiverPC.CreateAnswer(nil) Expect(err).ToNot(HaveOccurred()) Expect(receiverPC.SetLocalDescription(answer)).To(Succeed()) receiverGatherDone := webrtc.GatheringCompletePromise(receiverPC) Eventually(receiverGatherDone, 5*time.Second).Should(BeClosed()) Expect(senderPC.SetRemoteDescription(*receiverPC.LocalDescription())).To(Succeed()) // Wait for connection connected := make(chan struct{}) senderPC.OnConnectionStateChange(func(s webrtc.PeerConnectionState) { if s == webrtc.PeerConnectionStateConnected { select { case <-connected: default: close(connected) } } }) Eventually(connected, 5*time.Second).Should(BeClosed()) // Send test tone via RTP const samplesPerFrame = 960 seqNum := uint16(rand.UintN(65536)) timestamp := rand.Uint32() marker := true ticker := time.NewTicker(20 * time.Millisecond) defer ticker.Stop() for i, frame := range encResult.Frames { pkt := &rtp.Packet{ Header: rtp.Header{ Version: 2, Marker: marker, SequenceNumber: seqNum, Timestamp: timestamp, }, Payload: frame, } seqNum++ timestamp += samplesPerFrame marker = false Expect(audioTrack.WriteRTP(pkt)).To(Succeed(), fmt.Sprintf("WriteRTP frame %d", i)) if i < len(encResult.Frames)-1 { <-ticker.C } } // Wait for packets to arrive time.Sleep(500 * time.Millisecond) senderPC.Close() select { case <-trackDone: case <-time.After(2 * time.Second): } // Decode received Opus frames via the backend receivedMu.Lock() pkts := make([]receivedPacket, len(receivedPackets)) copy(pkts, receivedPackets) receivedMu.Unlock() Expect(pkts).ToNot(BeEmpty(), "no RTP packets received") var receivedFrames [][]byte for _, pkt := range pkts { receivedFrames = append(receivedFrames, pkt.payload) } decResult, err := o.AudioDecode(&pb.AudioDecodeRequest{Frames: receivedFrames}) Expect(err).ToNot(HaveOccurred()) allDecoded := sound.BytesToInt16sLE(decResult.PcmData) Expect(allDecoded).ToNot(BeEmpty(), "no decoded samples") // Analyse RTP packet delivery frameLoss := len(encResult.Frames) - len(pkts) seqGaps := 0 for i := 1; i < len(pkts); i++ { expected := pkts[i-1].seqNum + 1 if pkts[i].seqNum != expected { seqGaps++ } } markerCount := 0 for _, pkt := range pkts { if pkt.marker { markerCount++ } } GinkgoWriter.Println("── RTP Delivery ──") GinkgoWriter.Printf(" Frames sent: %d\n", len(encResult.Frames)) GinkgoWriter.Printf(" Packets recv: %d\n", len(pkts)) GinkgoWriter.Printf(" Frame loss: %d\n", frameLoss) GinkgoWriter.Printf(" Sequence gaps: %d\n", seqGaps) GinkgoWriter.Printf(" Marker packets: %d (expect 1)\n", markerCount) // Audio quality metrics skip := 48000 * 100 / 1000 if skip > len(allDecoded)/2 { skip = len(allDecoded) / 4 } tail := allDecoded[skip:] rms := computeRMS(tail) freq := estimateFrequency(tail, 48000) thd := computeTHD(tail, toneFreq, 48000, 10) GinkgoWriter.Println("── Audio Quality ──") GinkgoWriter.Printf(" Decoded samples: %d (%.1f ms at 48kHz)\n", len(allDecoded), float64(len(allDecoded))/48.0) GinkgoWriter.Printf(" RMS level: %.1f\n", rms) GinkgoWriter.Printf(" Peak frequency: %.0f Hz (expected %.0f Hz)\n", freq, toneFreq) GinkgoWriter.Printf(" THD (h2-h10): %.1f%%\n", thd) Expect(frameLoss).To(BeZero(), "lost frames in localhost transport") Expect(seqGaps).To(BeZero(), "sequence number gaps detected") Expect(markerCount).To(Equal(1), "expected exactly 1 marker packet") Expect(rms).To(BeNumerically(">=", 50), "signal appears silent or severely attenuated") Expect(freq).To(BeNumerically("~", toneFreq, 20), fmt.Sprintf("peak frequency %.0f Hz deviates from expected", freq)) Expect(thd).To(BeNumerically("<", 50), "signal is severely distorted") }) })