diff --git a/backend/go/localvqe/Makefile b/backend/go/localvqe/Makefile
index b607288fc..7b66e9371 100644
--- a/backend/go/localvqe/Makefile
+++ b/backend/go/localvqe/Makefile
@@ -9,7 +9,7 @@ JOBS?=$(shell nproc --ignore=1)
 # LocalVQE upstream version pin. Bump to a specific commit when picking up
 # a new release; `main` works for development but is not reproducible.
 LOCALVQE_REPO?=https://github.com/localai-org/LocalVQE
-LOCALVQE_VERSION?=72bfb4c6
+LOCALVQE_VERSION?=b0f0378a450e87c871b85689554801601ca56d98
 
 # LocalVQE handles CPU feature selection internally (it ships the multiple
 # libggml-cpu-*.so variants and its loader picks the best one at runtime
@@ -27,7 +27,8 @@ endif
 
 # LocalVQE upstream supports CPU + Vulkan only. Other BUILD_TYPE values
 # fall through to the default CPU build — Vulkan is already as fast as the
-# specialised GPU paths would be on this 1.3 M-parameter model.
+# specialised GPU paths would be on these small (1.3 M–4.8 M parameter)
+# models.
 ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DGGML_VULKAN=ON -DLOCALVQE_VULKAN=ON
 else ifeq ($(OS),Darwin)
diff --git a/backend/go/localvqe/golocalvqe.go b/backend/go/localvqe/golocalvqe.go
index b0575c3be..5d7c862d5 100644
--- a/backend/go/localvqe/golocalvqe.go
+++ b/backend/go/localvqe/golocalvqe.go
@@ -3,7 +3,6 @@ package main
 import (
 	"encoding/binary"
 	"fmt"
-	"io"
 	"os"
 	"path/filepath"
 	"runtime"
@@ -11,6 +10,7 @@ import (
 	"strings"
 	"unsafe"
 
+	"github.com/go-audio/wav"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/xlog"
@@ -46,24 +46,24 @@ const (
 // through the options builder (CppOptionsNew + setters + CppNewWithOptions)
 // — the bare localvqe_new path doesn't expose backend / device selection.
 var (
-	CppOptionsNew           func() uintptr
-	CppOptionsFree          func(opts uintptr)
-	CppOptionsSetModelPath  func(opts uintptr, modelPath string) int32
-	CppOptionsSetBackend    func(opts uintptr, backend string) int32
-	CppOptionsSetDevice     func(opts uintptr, device int32) int32
-	CppNewWithOptions       func(opts uintptr) uintptr
-	CppFree                 func(ctx uintptr)
-	CppProcessF32           func(ctx uintptr, mic, ref uintptr, nSamples int32, out uintptr) int32
-	CppProcessS16           func(ctx uintptr, mic, ref uintptr, nSamples int32, out uintptr) int32
-	CppProcessFrameF32      func(ctx uintptr, mic, ref uintptr, hopSamples int32, out uintptr) int32
-	CppProcessFrameS16      func(ctx uintptr, mic, ref uintptr, hopSamples int32, out uintptr) int32
-	CppReset                func(ctx uintptr)
-	CppLastError            func(ctx uintptr) string
-	CppSampleRate           func(ctx uintptr) int32
-	CppHopLength            func(ctx uintptr) int32
-	CppFFTSize              func(ctx uintptr) int32
-	CppSetNoiseGate         func(ctx uintptr, enabled int32, thresholdDBFS float32) int32
-	CppGetNoiseGate         func(ctx uintptr, enabledOut, thresholdDBFSOut uintptr) int32
+	CppOptionsNew          func() uintptr
+	CppOptionsFree         func(opts uintptr)
+	CppOptionsSetModelPath func(opts uintptr, modelPath string) int32
+	CppOptionsSetBackend   func(opts uintptr, backend string) int32
+	CppOptionsSetDevice    func(opts uintptr, device int32) int32
+	CppNewWithOptions      func(opts uintptr) uintptr
+	CppFree                func(ctx uintptr)
+	CppProcessF32          func(ctx uintptr, mic, ref uintptr, nSamples int32, out uintptr) int32
+	CppProcessS16          func(ctx uintptr, mic, ref uintptr, nSamples int32, out uintptr) int32
+	CppProcessFrameF32     func(ctx uintptr, mic, ref uintptr, hopSamples int32, out uintptr) int32
+	CppProcessFrameS16     func(ctx uintptr, mic, ref uintptr, hopSamples int32, out uintptr) int32
+	CppReset               func(ctx uintptr)
+	CppLastError           func(ctx uintptr) string
+	CppSampleRate          func(ctx uintptr) int32
+	CppHopLength           func(ctx uintptr) int32
+	CppFFTSize             func(ctx uintptr) int32
+	CppSetNoiseGate        func(ctx uintptr, enabled int32, thresholdDBFS float32) int32
+	CppGetNoiseGate        func(ctx uintptr, enabledOut, thresholdDBFSOut uintptr) int32
 )
 
 // LocalVQE speaks gRPC against LocalVQE's flat C ABI. The streaming
@@ -490,11 +490,14 @@ func (v *LocalVQE) applyStreamConfig(cfg *pb.AudioTransformStreamConfig) error {
 
 // ---- WAV I/O ----------------------------------------------------------
 //
-// Minimal mono PCM WAV reader/writer. Only handles the subset LocalVQE
-// cares about (mono, 16-bit signed, no extensible chunks). For broader
-// audio support the HTTP layer's `audio.NormalizeAudioFile` already
-// converts arbitrary input to a canonical WAV before we see it; this
-// reader just decodes the canonical shape.
+// Reader/writer for the mono 16-bit PCM shape LocalVQE works with. Decoding
+// goes through the shared go-audio/wav decoder (as the whisper and parakeet
+// backends do) so RIFF chunk walking is handled robustly — an 18/40-byte
+// extensible `fmt ` chunk, or JUNK/bext/LIST metadata before or after `data`
+// (e.g. ffmpeg's trailing "Lavf" tag), is skipped rather than spliced into
+// the PCM stream as an audible click. The HTTP layer normalises arbitrary
+// input to WAV before we see it, but that WAV is ffmpeg output and is not
+// guaranteed to be the canonical 44-byte layout.
 
 func readMonoWAVf32(path string) ([]float32, int, error) {
 	f, err := os.Open(path)
@@ -502,35 +505,26 @@ func readMonoWAVf32(path string) ([]float32, int, error) {
 		return nil, 0, err
 	}
 	defer func() { _ = f.Close() }()
-	header := make([]byte, 44)
-	if _, err := io.ReadFull(f, header); err != nil {
-		return nil, 0, err
+
+	buf, err := wav.NewDecoder(f).FullPCMBuffer()
+	if err != nil {
+		return nil, 0, fmt.Errorf("decode WAV: %w", err)
 	}
-	if string(header[0:4]) != "RIFF" || string(header[8:12]) != "WAVE" {
+	if buf == nil || buf.Format == nil {
 		return nil, 0, fmt.Errorf("not a WAV file")
 	}
-	channels := binary.LittleEndian.Uint16(header[22:24])
-	sampleRate := binary.LittleEndian.Uint32(header[24:28])
-	bitsPerSample := binary.LittleEndian.Uint16(header[34:36])
-
-	if channels != 1 {
-		return nil, 0, fmt.Errorf("only mono WAV supported (got %d channels)", channels)
+	if buf.Format.NumChannels != 1 {
+		return nil, 0, fmt.Errorf("only mono WAV supported (got %d channels)", buf.Format.NumChannels)
 	}
-	if bitsPerSample != 16 {
-		return nil, 0, fmt.Errorf("only 16-bit PCM supported (got %d bits)", bitsPerSample)
+	if buf.SourceBitDepth != 16 {
+		return nil, 0, fmt.Errorf("only 16-bit PCM supported (got %d bits)", buf.SourceBitDepth)
 	}
-
-	rest, err := io.ReadAll(f)
-	if err != nil {
-		return nil, 0, err
+	if len(buf.Data) == 0 {
+		return nil, 0, fmt.Errorf("WAV has no audio data")
 	}
-	n := len(rest) / 2
-	out := make([]float32, n)
-	for i := 0; i < n; i++ {
-		s := int16(binary.LittleEndian.Uint16(rest[i*2 : i*2+2]))
-		out[i] = float32(s) / 32768.0
-	}
-	return out, int(sampleRate), nil
+	// AsFloat32Buffer normalises by 2^(bitDepth-1) == /32768 for 16-bit,
+	// matching the model's expected [-1, 1) input range.
+	return buf.AsFloat32Buffer().Data, buf.Format.SampleRate, nil
 }
 
 func writeMonoWAVf32(path string, samples []float32, sampleRate int) error {
@@ -546,13 +540,13 @@ func writeMonoWAVf32(path string, samples []float32, sampleRate int) error {
 	binary.LittleEndian.PutUint32(header[4:8], 36+dataLen)
 	copy(header[8:12], []byte("WAVE"))
 	copy(header[12:16], []byte("fmt "))
-	binary.LittleEndian.PutUint32(header[16:20], 16)        // fmt chunk size
-	binary.LittleEndian.PutUint16(header[20:22], 1)         // PCM
-	binary.LittleEndian.PutUint16(header[22:24], 1)         // mono
+	binary.LittleEndian.PutUint32(header[16:20], 16) // fmt chunk size
+	binary.LittleEndian.PutUint16(header[20:22], 1)  // PCM
+	binary.LittleEndian.PutUint16(header[22:24], 1)  // mono
 	binary.LittleEndian.PutUint32(header[24:28], uint32(sampleRate))
 	binary.LittleEndian.PutUint32(header[28:32], uint32(sampleRate*2)) // byte rate
-	binary.LittleEndian.PutUint16(header[32:34], 2)         // block align
-	binary.LittleEndian.PutUint16(header[34:36], 16)        // bits per sample
+	binary.LittleEndian.PutUint16(header[32:34], 2)                    // block align
+	binary.LittleEndian.PutUint16(header[34:36], 16)                   // bits per sample
 	copy(header[36:40], []byte("data"))
 	binary.LittleEndian.PutUint32(header[40:44], dataLen)
 	if _, err := f.Write(header); err != nil {
diff --git a/backend/go/localvqe/localvqe_test.go b/backend/go/localvqe/localvqe_test.go
index 5053dfeb1..60541441e 100644
--- a/backend/go/localvqe/localvqe_test.go
+++ b/backend/go/localvqe/localvqe_test.go
@@ -1,7 +1,9 @@
 package main
 
 import (
+	"encoding/binary"
 	"os"
+	"path/filepath"
 	"testing"
 
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
@@ -92,6 +94,147 @@ var _ = Describe("LocalVQE-cpp", func() {
 		})
 	})
 
+	Context("readMonoWAVf32 chunk parsing", func() {
+		// chunk builds a word-aligned RIFF sub-chunk (id + size + body + pad).
+		chunk := func(id string, body []byte) []byte {
+			out := append([]byte(id), 0, 0, 0, 0)
+			binary.LittleEndian.PutUint32(out[4:8], uint32(len(body)))
+			out = append(out, body...)
+			if len(body)&1 == 1 {
+				out = append(out, 0) // pad byte for odd-sized chunks
+			}
+			return out
+		}
+		// fmtBody returns a PCM `fmt ` chunk body. extra bytes simulate the
+		// 18/40-byte extensible form (cbSize + extension).
+		fmtBody := func(channels, bits uint16, rate uint32, extra int) []byte {
+			b := make([]byte, 16+extra)
+			binary.LittleEndian.PutUint16(b[0:2], 1) // PCM
+			binary.LittleEndian.PutUint16(b[2:4], channels)
+			binary.LittleEndian.PutUint32(b[4:8], rate)
+			binary.LittleEndian.PutUint32(b[8:12], rate*uint32(channels)*uint32(bits)/8)
+			binary.LittleEndian.PutUint16(b[12:14], channels*bits/8)
+			binary.LittleEndian.PutUint16(b[14:16], bits)
+			if extra >= 2 {
+				binary.LittleEndian.PutUint16(b[16:18], uint16(extra-2)) // cbSize
+			}
+			return b
+		}
+		// pcm encodes int16 samples little-endian.
+		pcm := func(samples ...int16) []byte {
+			b := make([]byte, len(samples)*2)
+			for i, s := range samples {
+				binary.LittleEndian.PutUint16(b[i*2:i*2+2], uint16(s))
+			}
+			return b
+		}
+		riff := func(chunks ...[]byte) []byte {
+			body := []byte("WAVE")
+			for _, c := range chunks {
+				body = append(body, c...)
+			}
+			out := append([]byte("RIFF"), 0, 0, 0, 0)
+			binary.LittleEndian.PutUint32(out[4:8], uint32(len(body)))
+			return append(out, body...)
+		}
+		writeWAV := func(b []byte) string {
+			p := filepath.Join(GinkgoT().TempDir(), "in.wav")
+			Expect(os.WriteFile(p, b, 0o600)).To(Succeed())
+			return p
+		}
+		// A canonical sample run with distinct values so any off-by-one /
+		// misalignment shows up as wrong numbers, not just wrong length.
+		samples := []int16{1000, -2000, 3000, -4000, 5000, -6000}
+		expectSamples := func(got []float32) {
+			Expect(got).To(HaveLen(len(samples)))
+			for i, s := range samples {
+				Expect(got[i]).To(BeNumerically("~", float32(s)/32768.0, 1e-6))
+			}
+		}
+
+		It("reads a canonical 44-byte WAV", func() {
+			p := writeWAV(riff(chunk("fmt ", fmtBody(1, 16, 16000, 0)), chunk("data", pcm(samples...))))
+			out, sr, err := readMonoWAVf32(p)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(sr).To(Equal(16000))
+			expectSamples(out)
+		})
+
+		It("ignores a LIST/JUNK chunk placed before data (no leading-impulse splice)", func() {
+			p := writeWAV(riff(
+				chunk("fmt ", fmtBody(1, 16, 16000, 0)),
+				chunk("JUNK", []byte("padding-bytes-here!")), // odd length → exercises pad
+				chunk("LIST", []byte("INFOISFTLavf60.0")),
+				chunk("data", pcm(samples...)),
+			))
+			out, sr, err := readMonoWAVf32(p)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(sr).To(Equal(16000))
+			expectSamples(out) // not corrupted by the preceding chunks
+		})
+
+		It("honours the data chunk size and drops a trailing metadata chunk", func() {
+			p := writeWAV(riff(
+				chunk("fmt ", fmtBody(1, 16, 16000, 0)),
+				chunk("data", pcm(samples...)),
+				chunk("LIST", []byte("INFOISFTLavf60.16.100")), // ffmpeg trailer tag
+			))
+			out, _, err := readMonoWAVf32(p)
+			Expect(err).ToNot(HaveOccurred())
+			expectSamples(out) // trailing LIST bytes not decoded as PCM
+		})
+
+		It("handles the 18-byte extensible fmt chunk", func() {
+			p := writeWAV(riff(chunk("fmt ", fmtBody(1, 16, 16000, 2)), chunk("data", pcm(samples...))))
+			out, sr, err := readMonoWAVf32(p)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(sr).To(Equal(16000))
+			expectSamples(out)
+		})
+
+		It("rejects non-mono input", func() {
+			p := writeWAV(riff(chunk("fmt ", fmtBody(2, 16, 16000, 0)), chunk("data", pcm(samples...))))
+			_, _, err := readMonoWAVf32(p)
+			Expect(err).To(HaveOccurred())
+			Expect(err.Error()).To(ContainSubstring("mono"))
+		})
+
+		It("rejects non-16-bit input", func() {
+			p := writeWAV(riff(chunk("fmt ", fmtBody(1, 8, 16000, 0)), chunk("data", pcm(samples...))))
+			_, _, err := readMonoWAVf32(p)
+			Expect(err).To(HaveOccurred())
+			Expect(err.Error()).To(ContainSubstring("16-bit"))
+		})
+
+		It("rejects a non-WAV file", func() {
+			p := writeWAV([]byte("not a riff file at all"))
+			_, _, err := readMonoWAVf32(p)
+			Expect(err).To(HaveOccurred())
+		})
+
+		It("errors when the data chunk is missing", func() {
+			// fmt but no data: the decoder must fail rather than return an
+			// empty (or garbage) sample slice. The exact message is the
+			// decoder's, so just assert it errors.
+			p := writeWAV(riff(chunk("fmt ", fmtBody(1, 16, 16000, 0))))
+			_, _, err := readMonoWAVf32(p)
+			Expect(err).To(HaveOccurred())
+		})
+
+		It("round-trips through writeMonoWAVf32", func() {
+			p := filepath.Join(GinkgoT().TempDir(), "rt.wav")
+			in := []float32{0.1, -0.2, 0.3, -0.4}
+			Expect(writeMonoWAVf32(p, in, 16000)).To(Succeed())
+			out, sr, err := readMonoWAVf32(p)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(sr).To(Equal(16000))
+			Expect(out).To(HaveLen(len(in)))
+			for i := range in {
+				Expect(out[i]).To(BeNumerically("~", in[i], 1e-4))
+			}
+		})
+	})
+
 	Context("model-gated integration (LOCALVQE_MODEL_PATH)", func() {
 		It("load + sample rate + hop + fft", func() {
 			path := modelPathOrSkip()
diff --git a/core/http/middleware/security_headers.go b/core/http/middleware/security_headers.go
index 9a3ae8d48..969a335a0 100644
--- a/core/http/middleware/security_headers.go
+++ b/core/http/middleware/security_headers.go
@@ -17,7 +17,10 @@ func SecurityHeaders() echo.MiddlewareFunc {
 		"img-src 'self' data: blob: https:; " +
 		"media-src 'self' data: blob:; " +
 		"font-src 'self' data:; " +
-		"connect-src 'self' ws: wss: https:; " +
+		// blob: lets the waveform renderer XHR/fetch a freshly-created object
+		// URL (e.g. an uploaded clip before it has a server URL). XHR/fetch of
+		// blob: falls under connect-src, not media-src.
+		"connect-src 'self' ws: wss: https: blob:; " +
 		"frame-src 'self' blob:; " +
 		"worker-src 'self' blob:; " +
 		"object-src 'none'; " +
diff --git a/core/http/middleware/security_headers_test.go b/core/http/middleware/security_headers_test.go
index af43822ea..76430b093 100644
--- a/core/http/middleware/security_headers_test.go
+++ b/core/http/middleware/security_headers_test.go
@@ -32,6 +32,9 @@ var _ = Describe("SecurityHeaders", func() {
 		Expect(csp).To(ContainSubstring("frame-ancestors 'self'"))
 		Expect(csp).To(ContainSubstring("object-src 'none'"))
 		Expect(csp).To(ContainSubstring("base-uri 'self'"))
+		// blob: must be in connect-src so the waveform renderer can XHR/fetch
+		// a freshly-created object URL (uploaded/enhanced clip).
+		Expect(csp).To(ContainSubstring("connect-src 'self' ws: wss: https: blob:"))
 	})
 
 	It("sets X-Content-Type-Options: nosniff", func() {
diff --git a/core/http/react-ui/coverage-baseline.txt b/core/http/react-ui/coverage-baseline.txt
index 4f8b89534..b4be1a3b7 100644
--- a/core/http/react-ui/coverage-baseline.txt
+++ b/core/http/react-ui/coverage-baseline.txt
@@ -1 +1 @@
-38.29
\ No newline at end of file
+39.86
\ No newline at end of file
diff --git a/core/http/react-ui/e2e/agents.spec.js b/core/http/react-ui/e2e/agents.spec.js
index 40fe2d99c..ebfebd153 100644
--- a/core/http/react-ui/e2e/agents.spec.js
+++ b/core/http/react-ui/e2e/agents.spec.js
@@ -20,5 +20,10 @@ test.describe('Agents page', () => {
       page.waitForURL(/\/app\/agents\/new$/),
       create.click(),
     ])
+    // Wait for AgentCreate.jsx to actually render, not just for the URL to
+    // change. Ending the test the instant the route matched let the component
+    // mount race the coverage teardown — its ~400 lines were collected only
+    // when the render won, swinging total UI coverage ~1pp run-to-run.
+    await expect(page.getByRole('heading', { name: 'Create Agent' })).toBeVisible()
   })
 })
diff --git a/core/http/react-ui/e2e/audio-transform.spec.js b/core/http/react-ui/e2e/audio-transform.spec.js
index 53fbeabe7..c428e95f8 100644
--- a/core/http/react-ui/e2e/audio-transform.spec.js
+++ b/core/http/react-ui/e2e/audio-transform.spec.js
@@ -66,6 +66,33 @@ function makeFakeWav(name) {
   return { name, mimeType: 'audio/wav', buffer: buf }
 }
 
+// Build a WAV carrying a real sine tone, long enough that the spectrogram
+// STFT produces several frames (a few thousand samples). Used to exercise the
+// FFT / heatmap path, which the 4-sample silent fixture can't.
+function makeToneWav(name, freq = 1000, seconds = 0.4, sampleRate = 16000) {
+  const samples = Math.floor(seconds * sampleRate)
+  const dataLen = samples * 2
+  const buf = Buffer.alloc(44 + dataLen)
+  buf.write('RIFF', 0)
+  buf.writeUInt32LE(36 + dataLen, 4)
+  buf.write('WAVE', 8)
+  buf.write('fmt ', 12)
+  buf.writeUInt32LE(16, 16)
+  buf.writeUInt16LE(1, 20)
+  buf.writeUInt16LE(1, 22)
+  buf.writeUInt32LE(sampleRate, 24)
+  buf.writeUInt32LE(sampleRate * 2, 28)
+  buf.writeUInt16LE(2, 32)
+  buf.writeUInt16LE(16, 34)
+  buf.write('data', 36)
+  buf.writeUInt32LE(dataLen, 40)
+  for (let i = 0; i < samples; i++) {
+    const v = Math.round(Math.sin((2 * Math.PI * freq * i) / sampleRate) * 16000)
+    buf.writeInt16LE(v, 44 + i * 2)
+  }
+  return { name, mimeType: 'audio/wav', buffer: buf }
+}
+
 test.describe('Audio Transform', () => {
   test.beforeEach(async ({ page }) => {
     await mockCapabilities(page, [
@@ -169,6 +196,26 @@ test.describe('Audio Transform', () => {
     await expect(page.getByTestId('media-history-item')).toHaveCount(1)
   })
 
+  test('renders an input spectrogram on upload and an output one after transform', async ({ page }) => {
+    mockAudioTransform(page, 'enhanced.wav')
+
+    await page.goto('/app/transform')
+    await expect(page.getByRole('button', { name: 'localvqe' })).toBeVisible({ timeout: 10_000 })
+
+    // Choosing a clip should render its input spectrogram immediately — no
+    // backend round-trip needed (it's computed client-side from the bytes).
+    await page.locator('input[type="file"]').first().setInputFiles(makeToneWav('tone.wav'))
+    await expect(page.getByTestId('spectrogram-input')).toBeVisible({ timeout: 10_000 })
+
+    // Until a transform runs the output side shows a "compare" placeholder.
+    await expect(page.getByText(/Transform to compare/)).toBeVisible()
+
+    await page.getByRole('button', { name: /Transform/ }).last().click()
+
+    // After processing, the output spectrum panel appears alongside the input.
+    await expect(page.getByText('Output spectrum')).toBeVisible({ timeout: 10_000 })
+  })
+
   test('shows an error banner when the backend returns 4xx', async ({ page }) => {
     await page.route('**/audio/transformations', (route) => {
       if (route.request().method() !== 'POST') return route.continue()
diff --git a/core/http/react-ui/src/App.css b/core/http/react-ui/src/App.css
index a71eda5f5..01528ed7b 100644
--- a/core/http/react-ui/src/App.css
+++ b/core/http/react-ui/src/App.css
@@ -6984,6 +6984,88 @@ select.input {
   color: var(--color-primary);
 }
 
+/* Spectrogram (AudioTransform spectral view) */
+.audio-spectrogram-pair {
+  display: grid;
+  grid-template-columns: 1fr 1fr;
+  gap: var(--spacing-md);
+}
+@media (max-width: 720px) {
+  .audio-spectrogram-pair {
+    grid-template-columns: 1fr;
+  }
+}
+.audio-spectrogram {
+  display: flex;
+  flex-direction: column;
+  gap: var(--spacing-xs);
+  width: 100%;
+  min-width: 0;
+}
+.audio-spectrogram__label {
+  font-size: var(--text-sm);
+  color: var(--color-text-secondary);
+}
+.audio-spectrogram__canvas-wrap {
+  position: relative;
+  width: 100%;
+  background: var(--color-surface-sunken);
+  border: 1px solid var(--color-border-subtle);
+  border-radius: var(--radius-md);
+  overflow: hidden;
+}
+.audio-spectrogram__canvas-wrap--empty {
+  display: flex;
+  align-items: center;
+  justify-content: center;
+}
+.audio-spectrogram__hint {
+  color: var(--color-text-muted);
+  font-size: var(--text-sm);
+}
+.audio-spectrogram__loading {
+  position: absolute;
+  inset: 0;
+  display: flex;
+  align-items: center;
+  justify-content: center;
+  color: var(--color-text-muted);
+  font-size: var(--text-sm);
+}
+.audio-spectrogram__error {
+  padding: var(--spacing-md);
+  color: var(--color-error);
+  font-size: var(--text-sm);
+}
+.audio-spectrogram__axis {
+  position: absolute;
+  left: 6px;
+  font-size: 10px;
+  color: var(--color-text-muted);
+  background: var(--color-bg-overlay);
+  padding: 0 4px;
+  border-radius: var(--radius-sm);
+  pointer-events: none;
+  font-variant-numeric: tabular-nums;
+}
+.audio-spectrogram__axis--top {
+  top: 4px;
+}
+.audio-spectrogram__axis--bottom {
+  bottom: 4px;
+}
+.audio-spectrogram__duration {
+  position: absolute;
+  right: 8px;
+  bottom: 6px;
+  font-size: 11px;
+  color: var(--color-text-muted);
+  font-variant-numeric: tabular-nums;
+  background: var(--color-bg-overlay);
+  padding: 1px 6px;
+  border-radius: var(--radius-sm);
+}
+
 /* Audio Transform Studio tab */
 .audio-transform-stack {
   display: flex;
diff --git a/core/http/react-ui/src/components/audio/Spectrogram.jsx b/core/http/react-ui/src/components/audio/Spectrogram.jsx
new file mode 100644
index 000000000..98907611c
--- /dev/null
+++ b/core/http/react-ui/src/components/audio/Spectrogram.jsx
@@ -0,0 +1,105 @@
+import { useEffect, useRef } from 'react'
+import useSpectrogram from '../../hooks/useSpectrogram'
+
+// Spectrogram — canvas heatmap of a clip's magnitude STFT (time × frequency).
+// Time runs left→right, frequency low→high bottom→top, brighter = more energy.
+// Used on the AudioTransform page to show input next to output so the user can
+// see which bands the model attenuates (dark gaps that were bright in the
+// input). Mirrors WaveformPlayer's canvas/label/overlay structure.
+export default function Spectrogram({ src, label, height = 140, testId }) {
+  const canvasRef = useRef(null)
+  const { spectrogram, frames, bins, maxFreq, duration, error, loading } = useSpectrogram(src)
+
+  useEffect(() => {
+    const canvas = canvasRef.current
+    if (!canvas) return
+    const dpr = window.devicePixelRatio || 1
+    const cssW = canvas.clientWidth
+    const cssH = height
+    canvas.width = Math.floor(cssW * dpr)
+    canvas.height = Math.floor(cssH * dpr)
+    const ctx = canvas.getContext('2d')
+    ctx.setTransform(dpr, 0, 0, dpr, 0, 0)
+    ctx.clearRect(0, 0, cssW, cssH)
+    if (!spectrogram || !frames || !bins) return
+
+    // Paint at native (frames × bins) resolution into an offscreen canvas,
+    // then let drawImage smooth-scale it up — far cheaper than filling
+    // cssW×cssH rects, and the GPU handles the interpolation.
+    const img = ctx.createImageData(frames, bins)
+    for (let f = 0; f < frames; f++) {
+      for (let b = 0; b < bins; b++) {
+        const [r, g, bl] = magma(spectrogram[f * bins + b])
+        // Flip the frequency axis: image row 0 is the top = highest freq.
+        const o = ((bins - 1 - b) * frames + f) * 4
+        img.data[o] = r
+        img.data[o + 1] = g
+        img.data[o + 2] = bl
+        img.data[o + 3] = 255
+      }
+    }
+    const off = document.createElement('canvas')
+    off.width = frames
+    off.height = bins
+    off.getContext('2d').putImageData(img, 0, 0)
+    ctx.imageSmoothingEnabled = true
+    ctx.drawImage(off, 0, 0, cssW, cssH)
+  }, [spectrogram, frames, bins, height])
+
+  if (!src) return null
+
+  return (
+    <div className="audio-spectrogram">
+      {label && <div className="audio-spectrogram__label">{label}</div>}
+      <div className="audio-spectrogram__canvas-wrap" style={{ height }}>
+        {error ? (
+          <div className="audio-spectrogram__error">{error}</div>
+        ) : (
+          <canvas ref={canvasRef} data-testid={testId} style={{ width: '100%', height: '100%' }} />
+        )}
+        {maxFreq > 0 && !error && (
+          <>
+            <span className="audio-spectrogram__axis audio-spectrogram__axis--top">{fmtHz(maxFreq)}</span>
+            <span className="audio-spectrogram__axis audio-spectrogram__axis--bottom">0 Hz</span>
+          </>
+        )}
+        {duration > 0 && !error && (
+          <span className="audio-spectrogram__duration">{duration.toFixed(1)}s</span>
+        )}
+        {loading && !error && <div className="audio-spectrogram__loading">Analysing…</div>}
+      </div>
+    </div>
+  )
+}
+
+function fmtHz(hz) {
+  if (hz >= 1000) return `${(hz / 1000).toFixed(hz % 1000 === 0 ? 0 : 1)} kHz`
+  return `${Math.round(hz)} Hz`
+}
+
+// magma — compact perceptual colormap (black→purple→orange→white) sampled at 8
+// control points and linearly interpolated. Perceptually uniform maps read
+// far better for magnitude than a raw hue ramp. v clamps to [0,1]; NaN → 0.
+const MAGMA = [
+  [0, 0, 4],
+  [40, 11, 84],
+  [101, 21, 110],
+  [159, 42, 99],
+  [212, 72, 66],
+  [245, 125, 21],
+  [250, 193, 39],
+  [252, 253, 191],
+]
+function magma(v) {
+  const t = v > 0 ? Math.min(v, 1) : 0 // NaN/undefined fail `v > 0`, so MAGMA[i] always exists
+  const x = t * (MAGMA.length - 1)
+  const i = Math.floor(x)
+  const frac = x - i
+  const a = MAGMA[i]
+  const b = MAGMA[Math.min(i + 1, MAGMA.length - 1)]
+  return [
+    Math.round(a[0] + (b[0] - a[0]) * frac),
+    Math.round(a[1] + (b[1] - a[1]) * frac),
+    Math.round(a[2] + (b[2] - a[2]) * frac),
+  ]
+}
diff --git a/core/http/react-ui/src/hooks/useAudioPeaks.js b/core/http/react-ui/src/hooks/useAudioPeaks.js
index 31dfa05eb..88ebea2dd 100644
--- a/core/http/react-ui/src/hooks/useAudioPeaks.js
+++ b/core/http/react-ui/src/hooks/useAudioPeaks.js
@@ -5,7 +5,7 @@ import { useEffect, useState } from 'react'
 // and most browsers cap concurrent AudioContexts at ~6. Keep one alive for
 // the lifetime of the tab and reuse it across decodes.
 let sharedCtx = null
-function getSharedAudioContext() {
+export function getSharedAudioContext() {
   if (sharedCtx) return sharedCtx
   const Ctx = window.AudioContext || window.webkitAudioContext
   if (!Ctx) return null
diff --git a/core/http/react-ui/src/hooks/useSpectrogram.js b/core/http/react-ui/src/hooks/useSpectrogram.js
new file mode 100644
index 000000000..c6f2c6f9c
--- /dev/null
+++ b/core/http/react-ui/src/hooks/useSpectrogram.js
@@ -0,0 +1,107 @@
+import { useEffect, useState } from 'react'
+import { getSharedAudioContext } from './useAudioPeaks'
+import { fftRadix2 } from '../utils/fft'
+
+// Hann windows are reused across frames and across clips, so cache one per
+// size. The window tapers each frame to suppress spectral leakage (the
+// vertical smearing you'd otherwise get from hard frame edges).
+const windowCache = new Map()
+function hann(n) {
+  let w = windowCache.get(n)
+  if (w) return w
+  w = new Float32Array(n)
+  for (let i = 0; i < n; i++) w[i] = 0.5 - 0.5 * Math.cos((2 * Math.PI * i) / (n - 1))
+  windowCache.set(n, w)
+  return w
+}
+
+const EMPTY = { spectrogram: null, frames: 0, bins: 0, maxFreq: 0, duration: 0, error: null, loading: false }
+
+// useSpectrogram — decode an audio source (blob/data/http URL) and compute a
+// magnitude STFT suitable for a spectrogram heatmap. Returns
+// `{ spectrogram, frames, bins, maxFreq, duration, error, loading }` where
+// `spectrogram` is a Float32Array of `frames * bins` values, row-major by
+// frame, normalised so the dB floor maps to 0 and the loudest bin to 1.
+// `bins` spans 0..Nyquist (`maxFreq`). A failed fetch (non-2xx), a missing
+// Web Audio API, or undecodable bytes surface through `error`.
+// fftSize/hop default to the LocalVQE frame geometry (512/256) so the picture
+// lines up with how the model itself frames the audio. Long clips are
+// strided down to at most `maxFrames` columns — the heatmap is only a few
+// hundred px wide, so computing an FFT per native hop would be wasted work.
+export default function useSpectrogram(
+  src,
+  { fftSize = 512, hop = 256, maxFrames = 900, dbFloor = -90 } = {},
+) {
+  const [state, setState] = useState(EMPTY)
+
+  useEffect(() => {
+    setState(EMPTY)
+    if (!src) return
+    let cancelled = false
+    setState((s) => ({ ...s, loading: true }))
+
+    async function run() {
+      try {
+        const resp = await fetch(src)
+        if (!resp.ok) throw new Error(`Could not load audio (HTTP ${resp.status})`)
+        const ctx = getSharedAudioContext()
+        if (!ctx) throw new Error('Web Audio API not available')
+        const audio = await ctx.decodeAudioData(await resp.arrayBuffer())
+        if (cancelled) return
+
+        const data = audio.getChannelData(0)
+        const bins = fftSize >> 1
+        const win = hann(fftSize)
+
+        // Frame count, then a stride so we never run more than maxFrames FFTs.
+        const rawFrames = data.length >= fftSize ? 1 + Math.floor((data.length - fftSize) / hop) : 1
+        const stride = rawFrames > maxFrames ? Math.ceil(rawFrames / maxFrames) : 1
+        const frames = Math.ceil(rawFrames / stride)
+
+        const spec = new Float32Array(frames * bins)
+        const re = new Float64Array(fftSize)
+        const im = new Float64Array(fftSize)
+        let peakDb = dbFloor
+
+        for (let f = 0; f < frames; f++) {
+          const start = f * stride * hop
+          for (let i = 0; i < fftSize; i++) {
+            const s = start + i
+            re[i] = s < data.length ? data[s] * win[i] : 0
+            im[i] = 0
+          }
+          fftRadix2(re, im)
+          for (let b = 0; b < bins; b++) {
+            const mag = Math.hypot(re[b], im[b]) / fftSize
+            let db = mag > 0 ? 20 * Math.log10(mag) : dbFloor
+            if (db < dbFloor) db = dbFloor
+            spec[f * bins + b] = db
+            if (db > peakDb) peakDb = db
+          }
+        }
+
+        // Normalise dB into [0,1] against [dbFloor, peakDb].
+        const range = peakDb - dbFloor || 1
+        for (let i = 0; i < spec.length; i++) spec[i] = (spec[i] - dbFloor) / range
+
+        if (cancelled) return
+        setState({
+          spectrogram: spec,
+          frames,
+          bins,
+          maxFreq: audio.sampleRate / 2,
+          duration: audio.duration,
+          error: null,
+          loading: false,
+        })
+      } catch (e) {
+        if (!cancelled) setState((s) => ({ ...s, error: e?.message || 'Could not analyse audio', loading: false }))
+      }
+    }
+
+    run()
+    return () => { cancelled = true }
+  }, [src, fftSize, hop, maxFrames, dbFloor])
+
+  return state
+}
diff --git a/core/http/react-ui/src/pages/AudioTransform.jsx b/core/http/react-ui/src/pages/AudioTransform.jsx
index 99e3911c8..98526751d 100644
--- a/core/http/react-ui/src/pages/AudioTransform.jsx
+++ b/core/http/react-ui/src/pages/AudioTransform.jsx
@@ -5,6 +5,7 @@ import { CAP_AUDIO_TRANSFORM } from '../utils/capabilities'
 import LoadingSpinner from '../components/LoadingSpinner'
 import ErrorWithTraceLink from '../components/ErrorWithTraceLink'
 import WaveformPlayer from '../components/audio/WaveformPlayer'
+import Spectrogram from '../components/audio/Spectrogram'
 import { audioTransformApi } from '../utils/api'
 import { useMediaCapture } from '../hooks/useMediaCapture'
 import useObjectUrl from '../hooks/useObjectUrl'
@@ -261,6 +262,24 @@ export default function AudioTransform() {
             </div>
           ) : (
             <div className="audio-transform-stack">
+              {audioUrl && (
+                <div className="audio-spectrogram-pair">
+                  <Spectrogram src={audioUrl} label="Input spectrum" testId="spectrogram-input" />
+                  {outputUrl ? (
+                    <Spectrogram src={outputUrl} label="Output spectrum" testId="spectrogram-output" />
+                  ) : (
+                    <div className="audio-spectrogram">
+                      <div className="audio-spectrogram__label">Output spectrum</div>
+                      <div
+                        className="audio-spectrogram__canvas-wrap audio-spectrogram__canvas-wrap--empty"
+                        style={{ height: 140 }}
+                      >
+                        <span className="audio-spectrogram__hint">Transform to compare attenuation</span>
+                      </div>
+                    </div>
+                  )}
+                </div>
+              )}
               <WaveformPlayer src={audioUrl} label="Audio" height={96} />
               <WaveformPlayer src={referenceUrl} label="Reference" height={96} dimmed={!referenceFile} />
               {outputUrl && (
diff --git a/core/http/react-ui/src/utils/fft.js b/core/http/react-ui/src/utils/fft.js
new file mode 100644
index 000000000..35ee4cff8
--- /dev/null
+++ b/core/http/react-ui/src/utils/fft.js
@@ -0,0 +1,47 @@
+// Minimal in-place iterative radix-2 Cooley–Tukey FFT (forward, unscaled).
+//
+// The AudioTransform spectrogram only needs forward transforms of short real
+// frames (≤2048 samples), so a compact ~30-line implementation beats pulling
+// in a dependency and shipping it in the bundle. `re` and `im` are mutated in
+// place; `n = re.length` must be a power of two (unchecked; caller picks fftSize).
+export function fftRadix2(re, im) {
+  const n = re.length
+  if (n <= 1) return // nothing to transform
+
+  // Bit-reversal permutation: reorder samples so the butterfly stage below can
+  // run in place. `j` tracks the bit-reversed counterpart of `i`.
+  for (let i = 1, j = 0; i < n; i++) {
+    let bit = n >> 1
+    for (; j & bit; bit >>= 1) j ^= bit
+    j ^= bit
+    if (i < j) { // swap each (i, j) pair exactly once
+      const tr = re[i]; re[i] = re[j]; re[j] = tr
+      const ti = im[i]; im[i] = im[j]; im[j] = ti
+    }
+  }
+
+  // Butterflies, doubling the transform length each pass (len = 2, 4, …, n).
+  for (let len = 2; len <= n; len <<= 1) {
+    const half = len >> 1
+    const ang = (-2 * Math.PI) / len // negative angle → forward transform
+    const wpr = Math.cos(ang) // per-step twiddle increment (real part)
+    const wpi = Math.sin(ang) // per-step twiddle increment (imaginary part)
+    for (let i = 0; i < n; i += len) {
+      let wr = 1 // running twiddle, restarted at 1 + 0i for each sub-transform
+      let wi = 0
+      for (let k = 0; k < half; k++) {
+        const a = i + k
+        const b = a + half
+        const tr = wr * re[b] - wi * im[b] // (tr, ti) = twiddle × upper element
+        const ti = wr * im[b] + wi * re[b]
+        re[b] = re[a] - tr
+        im[b] = im[a] - ti
+        re[a] += tr
+        im[a] += ti
+        const nwr = wr * wpr - wi * wpi // advance twiddle by one step
+        wi = wr * wpi + wi * wpr
+        wr = nwr
+      }
+    }
+  }
+}
diff --git a/docs/content/features/audio-transform.md b/docs/content/features/audio-transform.md
index 61269b409..511b2e3d7 100644
--- a/docs/content/features/audio-transform.md
+++ b/docs/content/features/audio-transform.md
@@ -103,9 +103,11 @@ ends the session cleanly.
 
 ### Latency
 
-LocalVQE has 16 ms algorithmic latency (one hop). At runtime, ~1.66 ms of CPU
-time per frame on a modern desktop, leaving the rest of the budget for
-network and downstream playback.
+LocalVQE has 16 ms algorithmic latency (one hop). At runtime the per-frame CPU
+cost depends on the model: ~1.6 ms for the compact 1.3 M models (v1.1/v1.2,
+~9.7× realtime) and ~3.3 ms for the wider v1.3 4.8 M model (~4.7× realtime) on
+a 4-thread modern desktop, leaving the rest of the budget for network and
+downstream playback.
 
 ## Backend-specific tuning (LocalVQE)
 
@@ -120,11 +122,16 @@ A reasonable starting point is `-50` dBFS.
 
 ## Configuring a model
 
+LocalVQE ships several weight releases in the gallery: `localvqe-v1.3-4.8m`
+(current default — best quality), `localvqe-v1.2-1.3m` and `localvqe-v1.1-1.3m`
+(compact, roughly half the per-hop cost — good for low-core or power-constrained hosts).
+All share the same backend and request API; only the `model` filename differs.
+
 ```yaml
 name: localvqe
 backend: localvqe
 parameters:
-  model: localvqe-v1.1-1.3M-f32.gguf
+  model: localvqe-v1.3-4.8M-f32.gguf
 
 # Backend-specific defaults can be set in Options[]; per-request
 # params[*] form fields override.
diff --git a/flake.nix b/flake.nix
index 9ffc94709..30f57a057 100644
--- a/flake.nix
+++ b/flake.nix
@@ -81,6 +81,11 @@
           gotools  # goimports
           go-tools # staticcheck
 
+          # Audio transforms: pkg/utils/ffmpeg_test.go shells out to the
+          # `ffmpeg` CLI, exercised by `make test-coverage` (the pre-commit
+          # gate). Headless build = the CLI without GUI/X deps.
+          ffmpeg-headless
+
           # Common dev conveniences
           git
           curl
diff --git a/gallery/index.yaml b/gallery/index.yaml
index 865eec9f0..97b0d472f 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -29903,6 +29903,68 @@
     - filename: localvqe-v1.1-1.3M-f32.gguf
       sha256: c118227c6b433d6aa36d9e4b993e0f31aa60787ea38d301d04db917a4a2b0a84
       uri: huggingface://LocalAI-io/LocalVQE/localvqe-v1.1-1.3M-f32.gguf
+- name: localvqe-v1.2-1.3m
+  url: github:mudler/LocalAI/gallery/virtual.yaml@master
+  urls:
+    - https://github.com/localai-org/LocalVQE
+    - https://huggingface.co/LocalAI-io/LocalVQE
+  description: |
+    LocalVQE v1.2 (1.3 M parameters, F32) — compact joint acoustic echo
+    cancellation, noise suppression, and dereverberation for 16 kHz mono
+    speech. Shares the same DeepVQE-style architecture (arch_version 3) as
+    v1.3 but with narrower encoder/decoder widths, so it runs at ~9.7×
+    realtime (~1.6 ms per 16 ms frame on a 4-thread Zen4 CPU) — about half the
+    per-hop cost of v1.3. Widens the echo-search window to 1024 ms (v1.1 used
+    512 ms). ~5 MB on disk. The budget-friendly choice for low-core or
+    power-constrained devices.
+  license: apache-2.0
+  icon: https://avatars.githubusercontent.com/u/260893928
+  tags:
+    - audio-transform
+    - aec
+    - acoustic-echo-cancellation
+    - noise-suppression
+    - dereverberation
+    - cpu
+  overrides:
+    backend: localvqe
+    parameters:
+      model: localvqe-v1.2-1.3M-f32.gguf
+  files:
+    - filename: localvqe-v1.2-1.3M-f32.gguf
+      sha256: 4856ecf5f522b23fb2bc5caeac81f323c0ef1c4c156a9c7d40a6adbe092ba9ce
+      uri: huggingface://LocalAI-io/LocalVQE/localvqe-v1.2-1.3M-f32.gguf
+- name: localvqe-v1.3-4.8m
+  url: github:mudler/LocalAI/gallery/virtual.yaml@master
+  urls:
+    - https://github.com/localai-org/LocalVQE
+    - https://huggingface.co/LocalAI-io/LocalVQE
+  description: |
+    LocalVQE v1.3 (4.8 M parameters, F32) — current default release. Joint
+    acoustic echo cancellation, noise suppression, and dereverberation for
+    16 kHz mono speech, with a wider encoder/decoder trained from scratch
+    under a noise-floor-aware loss recipe. ~4.7× realtime (~3.3 ms per 16 ms
+    frame on a 4-thread Zen4 CPU); ~19 MB on disk. Improves doubletalk speech
+    quality (+0.25 deg MOS) and far-end echo cancellation (ERLE +5.2–9.3 dB)
+    over v1.2; on far-end-only scenes some users may still prefer v1.2's
+    gentler trade-off. Same 16 ms algorithmic latency as the compact models.
+  license: apache-2.0
+  icon: https://avatars.githubusercontent.com/u/260893928
+  tags:
+    - audio-transform
+    - aec
+    - acoustic-echo-cancellation
+    - noise-suppression
+    - dereverberation
+    - cpu
+  overrides:
+    backend: localvqe
+    parameters:
+      model: localvqe-v1.3-4.8M-f32.gguf
+  files:
+    - filename: localvqe-v1.3-4.8M-f32.gguf
+      sha256: c4f7912485c32cfc206c536f2f050b52513f2f613fdbc616391f6b26ab1d51ec
+      uri: huggingface://LocalAI-io/LocalVQE/localvqe-v1.3-4.8M-f32.gguf
 - name: tlacuilo-12b
   url: github:mudler/LocalAI/gallery/mistral-0.3.yaml@master
   urls:
diff --git a/pkg/audio/audio.go b/pkg/audio/audio.go
index 8a0b33586..1d3d9c17f 100644
--- a/pkg/audio/audio.go
+++ b/pkg/audio/audio.go
@@ -77,22 +77,59 @@ func NewWAVHeaderWithRate(pcmLen, sampleRate uint32) WAVHeader {
 // WAVHeaderSize is the size of a standard PCM WAV header in bytes.
 const WAVHeaderSize = 44
 
-// StripWAVHeader removes a WAV header from audio data, returning raw PCM.
-// If the data is too short to contain a header, it is returned unchanged.
+// wavDataChunk walks the RIFF sub-chunks of an in-memory WAV and returns the
+// `data` chunk payload (a sub-slice of data, not a copy) plus the sample rate
+// from a preceding `fmt ` (0 if none was seen). ok is false when data isn't a
+// RIFF/WAVE stream or has no data chunk — callers then treat it as raw PCM.
+//
+// Walking the chunks rather than assuming the canonical 44-byte header is what
+// keeps an 18/40-byte extensible `fmt `, or JUNK/LIST/bext metadata before or
+// after `data` (e.g. ffmpeg's trailing "Lavf" tag), from being spliced into
+// the PCM as an audible click.
+func wavDataChunk(data []byte) (pcm []byte, sampleRate int, ok bool) {
+	if len(data) < 12 || string(data[0:4]) != "RIFF" || string(data[8:12]) != "WAVE" {
+		return nil, 0, false
+	}
+	for off := 12; off+8 <= len(data); {
+		id := string(data[off : off+4])
+		size := int(binary.LittleEndian.Uint32(data[off+4 : off+8]))
+		body := off + 8
+		if size < 0 || size > len(data)-body {
+			// Truncated/garbage size: clamp to the remainder so a short final data
+			// chunk survives. size > len-body (not body+size) avoids 32-bit overflow.
+			size = len(data) - body
+		}
+		switch id {
+		case "fmt ":
+			if size >= 16 {
+				sampleRate = int(binary.LittleEndian.Uint32(data[body+4 : body+8]))
+			}
+		case "data":
+			return data[body : body+size], sampleRate, true
+		}
+		// Chunks are word-aligned: an odd size is followed by a pad byte.
+		off = body + size + (size & 1)
+	}
+	return nil, 0, false
+}
+
+// StripWAVHeader removes a WAV header from audio data, returning raw PCM (a
+// sub-slice of data, not a copy). If the data isn't a recognisable WAV (e.g.
+// it's already raw PCM) it is returned unchanged. The `data` chunk is located
+// by walking the RIFF structure, not a fixed 44-byte header — see [wavDataChunk].
 func StripWAVHeader(data []byte) []byte {
-	if len(data) > WAVHeaderSize {
-		return data[WAVHeaderSize:]
+	if pcm, _, ok := wavDataChunk(data); ok {
+		return pcm
 	}
 	return data
 }
 
-// ParseWAV strips the WAV header and returns the raw PCM along with the
-// sample rate read from the header. If the data is too short to contain a
-// valid header the PCM is returned as-is with sampleRate=0.
+// ParseWAV returns the raw PCM of a WAV's `data` chunk (a sub-slice of data,
+// not a copy) and the sample rate from `fmt ` (0 if no usable `fmt ` precedes
+// `data`). Non-WAV input is returned as-is with sampleRate=0 — see [wavDataChunk].
 func ParseWAV(data []byte) (pcm []byte, sampleRate int) {
-	if len(data) <= WAVHeaderSize {
-		return data, 0
+	if pcm, sr, ok := wavDataChunk(data); ok {
+		return pcm, sr
 	}
-	sr := int(binary.LittleEndian.Uint32(data[24:28]))
-	return data[WAVHeaderSize:], sr
+	return data, 0
 }
diff --git a/pkg/audio/audio_test.go b/pkg/audio/audio_test.go
index 836aa27ae..f13e51201 100644
--- a/pkg/audio/audio_test.go
+++ b/pkg/audio/audio_test.go
@@ -96,4 +96,81 @@ var _ = Describe("WAV utilities", func() {
 			Expect(gotPCM).To(Equal(short))
 		})
 	})
+
+	Describe("non-canonical RIFF layouts", func() {
+		// chunk builds a word-aligned RIFF sub-chunk (id + size + body + pad).
+		chunk := func(id string, body []byte) []byte {
+			out := append([]byte(id), 0, 0, 0, 0)
+			binary.LittleEndian.PutUint32(out[4:8], uint32(len(body)))
+			out = append(out, body...)
+			if len(body)&1 == 1 {
+				out = append(out, 0) // pad byte for odd-sized chunks
+			}
+			return out
+		}
+		// fmtBody is a mono 16-bit PCM `fmt ` body; extra simulates the
+		// 18/40-byte extensible form (cbSize + extension).
+		fmtBody := func(rate uint32, extra int) []byte {
+			b := make([]byte, 16+extra)
+			binary.LittleEndian.PutUint16(b[0:2], 1)       // PCM
+			binary.LittleEndian.PutUint16(b[2:4], 1)       // mono
+			binary.LittleEndian.PutUint32(b[4:8], rate)    // sample rate
+			binary.LittleEndian.PutUint32(b[8:12], rate*2) // byte rate
+			binary.LittleEndian.PutUint16(b[12:14], 2)     // block align
+			binary.LittleEndian.PutUint16(b[14:16], 16)    // bits per sample
+			if extra >= 2 {
+				binary.LittleEndian.PutUint16(b[16:18], uint16(extra-2)) // cbSize
+			}
+			return b
+		}
+		riff := func(chunks ...[]byte) []byte {
+			body := []byte("WAVE")
+			for _, c := range chunks {
+				body = append(body, c...)
+			}
+			out := append([]byte("RIFF"), 0, 0, 0, 0)
+			binary.LittleEndian.PutUint32(out[4:8], uint32(len(body)))
+			return append(out, body...)
+		}
+		pcm := []byte{1, 2, 3, 4, 5, 6, 7, 8}
+
+		It("ignores JUNK/LIST chunks before data (no leading splice)", func() {
+			w := riff(
+				chunk("fmt ", fmtBody(16000, 0)),
+				chunk("JUNK", []byte("padding-bytes-x")), // odd length → exercises pad
+				chunk("LIST", []byte("INFOISFTLavf")),
+				chunk("data", pcm),
+			)
+			gotPCM, rate := ParseWAV(w)
+			Expect(rate).To(Equal(16000))
+			Expect(gotPCM).To(Equal(pcm))
+			Expect(StripWAVHeader(w)).To(Equal(pcm))
+		})
+
+		It("honours the data chunk size and drops a trailing chunk", func() {
+			w := riff(
+				chunk("fmt ", fmtBody(24000, 0)),
+				chunk("data", pcm),
+				chunk("LIST", []byte("INFOISFTLavf60.16")), // ffmpeg trailer tag
+			)
+			gotPCM, rate := ParseWAV(w)
+			Expect(rate).To(Equal(24000))
+			Expect(gotPCM).To(Equal(pcm)) // trailing LIST not spliced in
+		})
+
+		It("handles an 18-byte extensible fmt chunk", func() {
+			w := riff(chunk("fmt ", fmtBody(16000, 2)), chunk("data", pcm))
+			gotPCM, rate := ParseWAV(w)
+			Expect(rate).To(Equal(16000))
+			Expect(gotPCM).To(Equal(pcm))
+		})
+
+		It("returns non-WAV input unchanged", func() {
+			raw := []byte("this is definitely not a riff wave file")
+			gotPCM, rate := ParseWAV(raw)
+			Expect(rate).To(Equal(0))
+			Expect(gotPCM).To(Equal(raw))
+			Expect(StripWAVHeader(raw)).To(Equal(raw))
+		})
+	})
 })
diff --git a/scripts/ui-coverage-check.sh b/scripts/ui-coverage-check.sh
index 54532b48d..33a43748c 100755
--- a/scripts/ui-coverage-check.sh
+++ b/scripts/ui-coverage-check.sh
@@ -4,17 +4,20 @@
 #
 # Compares the total line coverage in an nyc coverage-summary.json against a
 # committed baseline and fails (exit 1) if it dropped by more than
-# UI_COVERAGE_TOLERANCE percentage points (default 0.8). The React UI e2e suite
+# UI_COVERAGE_TOLERANCE percentage points (default 0.1). The React UI e2e suite
 # drives the real app, so a removed feature or deleted spec shows up as a
 # coverage drop here.
 #
-# UI e2e line coverage is NOT deterministic: async/debounced paths (e.g. the
-# VRAM estimate's 500ms debounce) mean identical specs vary run-to-run. With the
-# V8 path's single-chunk coverage build (vite.config.js inlineDynamicImports)
-# the observed wobble is ~0.5pp, similar to the old istanbul path. The tolerance
-# absorbs that jitter — keep it just above the observed wobble so a real ~1pp
-# regression still trips the gate.
-# (The Go gate carries a smaller tolerance for the same reason — its e2e slice.)
+# The tolerance exists only to absorb the irreducible measurement noise floor,
+# NOT to permit regression. UI e2e coverage USED to swing ~1pp run-to-run, which
+# forced a loose 0.8pp band — but that swing was a bug, not inherent jitter: a
+# spec that navigated to a route and ended on the URL assertion let the target
+# component's render race the coverage teardown, so ~400 lines were collected
+# only when the render won (see e2e/agents.spec.js → AgentCreate). With that race
+# fixed, repeated runs land within ~0.013pp (a handful of lines) of each other,
+# so the band is tightened to 0.1pp — enough for the noise floor, tight enough
+# that a real ~40-line regression still trips the gate. If a future run wobbles
+# more, fix the racing spec (await a rendered element) rather than loosening this.
 #
 # When coverage rises meaningfully, regenerate and commit the baseline with:
 #   make test-ui-coverage-baseline
@@ -22,7 +25,7 @@ set -eu
 
 summary="${1:?usage: ui-coverage-check.sh SUMMARY_JSON BASELINE_FILE}"
 baseline_file="${2:?usage: ui-coverage-check.sh SUMMARY_JSON BASELINE_FILE}"
-tolerance="${UI_COVERAGE_TOLERANCE:-0.8}"
+tolerance="${UI_COVERAGE_TOLERANCE:-0.1}"
 
 if [ ! -f "$summary" ]; then
 	echo "ui-coverage-check: coverage summary not found: $summary" >&2