diff --git a/backend/go/localvqe/Makefile b/backend/go/localvqe/Makefile index b607288fc..7b66e9371 100644 --- a/backend/go/localvqe/Makefile +++ b/backend/go/localvqe/Makefile @@ -9,7 +9,7 @@ JOBS?=$(shell nproc --ignore=1) # LocalVQE upstream version pin. Bump to a specific commit when picking up # a new release; `main` works for development but is not reproducible. LOCALVQE_REPO?=https://github.com/localai-org/LocalVQE -LOCALVQE_VERSION?=72bfb4c6 +LOCALVQE_VERSION?=b0f0378a450e87c871b85689554801601ca56d98 # LocalVQE handles CPU feature selection internally (it ships the multiple # libggml-cpu-*.so variants and its loader picks the best one at runtime @@ -27,7 +27,8 @@ endif # LocalVQE upstream supports CPU + Vulkan only. Other BUILD_TYPE values # fall through to the default CPU build — Vulkan is already as fast as the -# specialised GPU paths would be on this 1.3 M-parameter model. +# specialised GPU paths would be on these small (1.3 M–4.8 M parameter) +# models. ifeq ($(BUILD_TYPE),vulkan) CMAKE_ARGS+=-DGGML_VULKAN=ON -DLOCALVQE_VULKAN=ON else ifeq ($(OS),Darwin) diff --git a/backend/go/localvqe/golocalvqe.go b/backend/go/localvqe/golocalvqe.go index b0575c3be..5d7c862d5 100644 --- a/backend/go/localvqe/golocalvqe.go +++ b/backend/go/localvqe/golocalvqe.go @@ -3,7 +3,6 @@ package main import ( "encoding/binary" "fmt" - "io" "os" "path/filepath" "runtime" @@ -11,6 +10,7 @@ import ( "strings" "unsafe" + "github.com/go-audio/wav" "github.com/mudler/LocalAI/pkg/grpc/base" pb "github.com/mudler/LocalAI/pkg/grpc/proto" "github.com/mudler/xlog" @@ -46,24 +46,24 @@ const ( // through the options builder (CppOptionsNew + setters + CppNewWithOptions) // — the bare localvqe_new path doesn't expose backend / device selection. var ( - CppOptionsNew func() uintptr - CppOptionsFree func(opts uintptr) - CppOptionsSetModelPath func(opts uintptr, modelPath string) int32 - CppOptionsSetBackend func(opts uintptr, backend string) int32 - CppOptionsSetDevice func(opts uintptr, device int32) int32 - CppNewWithOptions func(opts uintptr) uintptr - CppFree func(ctx uintptr) - CppProcessF32 func(ctx uintptr, mic, ref uintptr, nSamples int32, out uintptr) int32 - CppProcessS16 func(ctx uintptr, mic, ref uintptr, nSamples int32, out uintptr) int32 - CppProcessFrameF32 func(ctx uintptr, mic, ref uintptr, hopSamples int32, out uintptr) int32 - CppProcessFrameS16 func(ctx uintptr, mic, ref uintptr, hopSamples int32, out uintptr) int32 - CppReset func(ctx uintptr) - CppLastError func(ctx uintptr) string - CppSampleRate func(ctx uintptr) int32 - CppHopLength func(ctx uintptr) int32 - CppFFTSize func(ctx uintptr) int32 - CppSetNoiseGate func(ctx uintptr, enabled int32, thresholdDBFS float32) int32 - CppGetNoiseGate func(ctx uintptr, enabledOut, thresholdDBFSOut uintptr) int32 + CppOptionsNew func() uintptr + CppOptionsFree func(opts uintptr) + CppOptionsSetModelPath func(opts uintptr, modelPath string) int32 + CppOptionsSetBackend func(opts uintptr, backend string) int32 + CppOptionsSetDevice func(opts uintptr, device int32) int32 + CppNewWithOptions func(opts uintptr) uintptr + CppFree func(ctx uintptr) + CppProcessF32 func(ctx uintptr, mic, ref uintptr, nSamples int32, out uintptr) int32 + CppProcessS16 func(ctx uintptr, mic, ref uintptr, nSamples int32, out uintptr) int32 + CppProcessFrameF32 func(ctx uintptr, mic, ref uintptr, hopSamples int32, out uintptr) int32 + CppProcessFrameS16 func(ctx uintptr, mic, ref uintptr, hopSamples int32, out uintptr) int32 + CppReset func(ctx uintptr) + CppLastError func(ctx uintptr) string + CppSampleRate func(ctx uintptr) int32 + CppHopLength func(ctx uintptr) int32 + CppFFTSize func(ctx uintptr) int32 + CppSetNoiseGate func(ctx uintptr, enabled int32, thresholdDBFS float32) int32 + CppGetNoiseGate func(ctx uintptr, enabledOut, thresholdDBFSOut uintptr) int32 ) // LocalVQE speaks gRPC against LocalVQE's flat C ABI. The streaming @@ -490,11 +490,14 @@ func (v *LocalVQE) applyStreamConfig(cfg *pb.AudioTransformStreamConfig) error { // ---- WAV I/O ---------------------------------------------------------- // -// Minimal mono PCM WAV reader/writer. Only handles the subset LocalVQE -// cares about (mono, 16-bit signed, no extensible chunks). For broader -// audio support the HTTP layer's `audio.NormalizeAudioFile` already -// converts arbitrary input to a canonical WAV before we see it; this -// reader just decodes the canonical shape. +// Reader/writer for the mono 16-bit PCM shape LocalVQE works with. Decoding +// goes through the shared go-audio/wav decoder (as the whisper and parakeet +// backends do) so RIFF chunk walking is handled robustly — an 18/40-byte +// extensible `fmt ` chunk, or JUNK/bext/LIST metadata before or after `data` +// (e.g. ffmpeg's trailing "Lavf" tag), is skipped rather than spliced into +// the PCM stream as an audible click. The HTTP layer normalises arbitrary +// input to WAV before we see it, but that WAV is ffmpeg output and is not +// guaranteed to be the canonical 44-byte layout. func readMonoWAVf32(path string) ([]float32, int, error) { f, err := os.Open(path) @@ -502,35 +505,26 @@ func readMonoWAVf32(path string) ([]float32, int, error) { return nil, 0, err } defer func() { _ = f.Close() }() - header := make([]byte, 44) - if _, err := io.ReadFull(f, header); err != nil { - return nil, 0, err + + buf, err := wav.NewDecoder(f).FullPCMBuffer() + if err != nil { + return nil, 0, fmt.Errorf("decode WAV: %w", err) } - if string(header[0:4]) != "RIFF" || string(header[8:12]) != "WAVE" { + if buf == nil || buf.Format == nil { return nil, 0, fmt.Errorf("not a WAV file") } - channels := binary.LittleEndian.Uint16(header[22:24]) - sampleRate := binary.LittleEndian.Uint32(header[24:28]) - bitsPerSample := binary.LittleEndian.Uint16(header[34:36]) - - if channels != 1 { - return nil, 0, fmt.Errorf("only mono WAV supported (got %d channels)", channels) + if buf.Format.NumChannels != 1 { + return nil, 0, fmt.Errorf("only mono WAV supported (got %d channels)", buf.Format.NumChannels) } - if bitsPerSample != 16 { - return nil, 0, fmt.Errorf("only 16-bit PCM supported (got %d bits)", bitsPerSample) + if buf.SourceBitDepth != 16 { + return nil, 0, fmt.Errorf("only 16-bit PCM supported (got %d bits)", buf.SourceBitDepth) } - - rest, err := io.ReadAll(f) - if err != nil { - return nil, 0, err + if len(buf.Data) == 0 { + return nil, 0, fmt.Errorf("WAV has no audio data") } - n := len(rest) / 2 - out := make([]float32, n) - for i := 0; i < n; i++ { - s := int16(binary.LittleEndian.Uint16(rest[i*2 : i*2+2])) - out[i] = float32(s) / 32768.0 - } - return out, int(sampleRate), nil + // AsFloat32Buffer normalises by 2^(bitDepth-1) == /32768 for 16-bit, + // matching the model's expected [-1, 1) input range. + return buf.AsFloat32Buffer().Data, buf.Format.SampleRate, nil } func writeMonoWAVf32(path string, samples []float32, sampleRate int) error { @@ -546,13 +540,13 @@ func writeMonoWAVf32(path string, samples []float32, sampleRate int) error { binary.LittleEndian.PutUint32(header[4:8], 36+dataLen) copy(header[8:12], []byte("WAVE")) copy(header[12:16], []byte("fmt ")) - binary.LittleEndian.PutUint32(header[16:20], 16) // fmt chunk size - binary.LittleEndian.PutUint16(header[20:22], 1) // PCM - binary.LittleEndian.PutUint16(header[22:24], 1) // mono + binary.LittleEndian.PutUint32(header[16:20], 16) // fmt chunk size + binary.LittleEndian.PutUint16(header[20:22], 1) // PCM + binary.LittleEndian.PutUint16(header[22:24], 1) // mono binary.LittleEndian.PutUint32(header[24:28], uint32(sampleRate)) binary.LittleEndian.PutUint32(header[28:32], uint32(sampleRate*2)) // byte rate - binary.LittleEndian.PutUint16(header[32:34], 2) // block align - binary.LittleEndian.PutUint16(header[34:36], 16) // bits per sample + binary.LittleEndian.PutUint16(header[32:34], 2) // block align + binary.LittleEndian.PutUint16(header[34:36], 16) // bits per sample copy(header[36:40], []byte("data")) binary.LittleEndian.PutUint32(header[40:44], dataLen) if _, err := f.Write(header); err != nil { diff --git a/backend/go/localvqe/localvqe_test.go b/backend/go/localvqe/localvqe_test.go index 5053dfeb1..60541441e 100644 --- a/backend/go/localvqe/localvqe_test.go +++ b/backend/go/localvqe/localvqe_test.go @@ -1,7 +1,9 @@ package main import ( + "encoding/binary" "os" + "path/filepath" "testing" pb "github.com/mudler/LocalAI/pkg/grpc/proto" @@ -92,6 +94,147 @@ var _ = Describe("LocalVQE-cpp", func() { }) }) + Context("readMonoWAVf32 chunk parsing", func() { + // chunk builds a word-aligned RIFF sub-chunk (id + size + body + pad). + chunk := func(id string, body []byte) []byte { + out := append([]byte(id), 0, 0, 0, 0) + binary.LittleEndian.PutUint32(out[4:8], uint32(len(body))) + out = append(out, body...) + if len(body)&1 == 1 { + out = append(out, 0) // pad byte for odd-sized chunks + } + return out + } + // fmtBody returns a PCM `fmt ` chunk body. extra bytes simulate the + // 18/40-byte extensible form (cbSize + extension). + fmtBody := func(channels, bits uint16, rate uint32, extra int) []byte { + b := make([]byte, 16+extra) + binary.LittleEndian.PutUint16(b[0:2], 1) // PCM + binary.LittleEndian.PutUint16(b[2:4], channels) + binary.LittleEndian.PutUint32(b[4:8], rate) + binary.LittleEndian.PutUint32(b[8:12], rate*uint32(channels)*uint32(bits)/8) + binary.LittleEndian.PutUint16(b[12:14], channels*bits/8) + binary.LittleEndian.PutUint16(b[14:16], bits) + if extra >= 2 { + binary.LittleEndian.PutUint16(b[16:18], uint16(extra-2)) // cbSize + } + return b + } + // pcm encodes int16 samples little-endian. + pcm := func(samples ...int16) []byte { + b := make([]byte, len(samples)*2) + for i, s := range samples { + binary.LittleEndian.PutUint16(b[i*2:i*2+2], uint16(s)) + } + return b + } + riff := func(chunks ...[]byte) []byte { + body := []byte("WAVE") + for _, c := range chunks { + body = append(body, c...) + } + out := append([]byte("RIFF"), 0, 0, 0, 0) + binary.LittleEndian.PutUint32(out[4:8], uint32(len(body))) + return append(out, body...) + } + writeWAV := func(b []byte) string { + p := filepath.Join(GinkgoT().TempDir(), "in.wav") + Expect(os.WriteFile(p, b, 0o600)).To(Succeed()) + return p + } + // A canonical sample run with distinct values so any off-by-one / + // misalignment shows up as wrong numbers, not just wrong length. + samples := []int16{1000, -2000, 3000, -4000, 5000, -6000} + expectSamples := func(got []float32) { + Expect(got).To(HaveLen(len(samples))) + for i, s := range samples { + Expect(got[i]).To(BeNumerically("~", float32(s)/32768.0, 1e-6)) + } + } + + It("reads a canonical 44-byte WAV", func() { + p := writeWAV(riff(chunk("fmt ", fmtBody(1, 16, 16000, 0)), chunk("data", pcm(samples...)))) + out, sr, err := readMonoWAVf32(p) + Expect(err).ToNot(HaveOccurred()) + Expect(sr).To(Equal(16000)) + expectSamples(out) + }) + + It("ignores a LIST/JUNK chunk placed before data (no leading-impulse splice)", func() { + p := writeWAV(riff( + chunk("fmt ", fmtBody(1, 16, 16000, 0)), + chunk("JUNK", []byte("padding-bytes-here!")), // odd length → exercises pad + chunk("LIST", []byte("INFOISFTLavf60.0")), + chunk("data", pcm(samples...)), + )) + out, sr, err := readMonoWAVf32(p) + Expect(err).ToNot(HaveOccurred()) + Expect(sr).To(Equal(16000)) + expectSamples(out) // not corrupted by the preceding chunks + }) + + It("honours the data chunk size and drops a trailing metadata chunk", func() { + p := writeWAV(riff( + chunk("fmt ", fmtBody(1, 16, 16000, 0)), + chunk("data", pcm(samples...)), + chunk("LIST", []byte("INFOISFTLavf60.16.100")), // ffmpeg trailer tag + )) + out, _, err := readMonoWAVf32(p) + Expect(err).ToNot(HaveOccurred()) + expectSamples(out) // trailing LIST bytes not decoded as PCM + }) + + It("handles the 18-byte extensible fmt chunk", func() { + p := writeWAV(riff(chunk("fmt ", fmtBody(1, 16, 16000, 2)), chunk("data", pcm(samples...)))) + out, sr, err := readMonoWAVf32(p) + Expect(err).ToNot(HaveOccurred()) + Expect(sr).To(Equal(16000)) + expectSamples(out) + }) + + It("rejects non-mono input", func() { + p := writeWAV(riff(chunk("fmt ", fmtBody(2, 16, 16000, 0)), chunk("data", pcm(samples...)))) + _, _, err := readMonoWAVf32(p) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("mono")) + }) + + It("rejects non-16-bit input", func() { + p := writeWAV(riff(chunk("fmt ", fmtBody(1, 8, 16000, 0)), chunk("data", pcm(samples...)))) + _, _, err := readMonoWAVf32(p) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("16-bit")) + }) + + It("rejects a non-WAV file", func() { + p := writeWAV([]byte("not a riff file at all")) + _, _, err := readMonoWAVf32(p) + Expect(err).To(HaveOccurred()) + }) + + It("errors when the data chunk is missing", func() { + // fmt but no data: the decoder must fail rather than return an + // empty (or garbage) sample slice. The exact message is the + // decoder's, so just assert it errors. + p := writeWAV(riff(chunk("fmt ", fmtBody(1, 16, 16000, 0)))) + _, _, err := readMonoWAVf32(p) + Expect(err).To(HaveOccurred()) + }) + + It("round-trips through writeMonoWAVf32", func() { + p := filepath.Join(GinkgoT().TempDir(), "rt.wav") + in := []float32{0.1, -0.2, 0.3, -0.4} + Expect(writeMonoWAVf32(p, in, 16000)).To(Succeed()) + out, sr, err := readMonoWAVf32(p) + Expect(err).ToNot(HaveOccurred()) + Expect(sr).To(Equal(16000)) + Expect(out).To(HaveLen(len(in))) + for i := range in { + Expect(out[i]).To(BeNumerically("~", in[i], 1e-4)) + } + }) + }) + Context("model-gated integration (LOCALVQE_MODEL_PATH)", func() { It("load + sample rate + hop + fft", func() { path := modelPathOrSkip() diff --git a/core/http/middleware/security_headers.go b/core/http/middleware/security_headers.go index 9a3ae8d48..969a335a0 100644 --- a/core/http/middleware/security_headers.go +++ b/core/http/middleware/security_headers.go @@ -17,7 +17,10 @@ func SecurityHeaders() echo.MiddlewareFunc { "img-src 'self' data: blob: https:; " + "media-src 'self' data: blob:; " + "font-src 'self' data:; " + - "connect-src 'self' ws: wss: https:; " + + // blob: lets the waveform renderer XHR/fetch a freshly-created object + // URL (e.g. an uploaded clip before it has a server URL). XHR/fetch of + // blob: falls under connect-src, not media-src. + "connect-src 'self' ws: wss: https: blob:; " + "frame-src 'self' blob:; " + "worker-src 'self' blob:; " + "object-src 'none'; " + diff --git a/core/http/middleware/security_headers_test.go b/core/http/middleware/security_headers_test.go index af43822ea..76430b093 100644 --- a/core/http/middleware/security_headers_test.go +++ b/core/http/middleware/security_headers_test.go @@ -32,6 +32,9 @@ var _ = Describe("SecurityHeaders", func() { Expect(csp).To(ContainSubstring("frame-ancestors 'self'")) Expect(csp).To(ContainSubstring("object-src 'none'")) Expect(csp).To(ContainSubstring("base-uri 'self'")) + // blob: must be in connect-src so the waveform renderer can XHR/fetch + // a freshly-created object URL (uploaded/enhanced clip). + Expect(csp).To(ContainSubstring("connect-src 'self' ws: wss: https: blob:")) }) It("sets X-Content-Type-Options: nosniff", func() { diff --git a/core/http/react-ui/coverage-baseline.txt b/core/http/react-ui/coverage-baseline.txt index 4f8b89534..b4be1a3b7 100644 --- a/core/http/react-ui/coverage-baseline.txt +++ b/core/http/react-ui/coverage-baseline.txt @@ -1 +1 @@ -38.29 \ No newline at end of file +39.86 \ No newline at end of file diff --git a/core/http/react-ui/e2e/agents.spec.js b/core/http/react-ui/e2e/agents.spec.js index 40fe2d99c..ebfebd153 100644 --- a/core/http/react-ui/e2e/agents.spec.js +++ b/core/http/react-ui/e2e/agents.spec.js @@ -20,5 +20,10 @@ test.describe('Agents page', () => { page.waitForURL(/\/app\/agents\/new$/), create.click(), ]) + // Wait for AgentCreate.jsx to actually render, not just for the URL to + // change. Ending the test the instant the route matched let the component + // mount race the coverage teardown — its ~400 lines were collected only + // when the render won, swinging total UI coverage ~1pp run-to-run. + await expect(page.getByRole('heading', { name: 'Create Agent' })).toBeVisible() }) }) diff --git a/core/http/react-ui/e2e/audio-transform.spec.js b/core/http/react-ui/e2e/audio-transform.spec.js index 53fbeabe7..c428e95f8 100644 --- a/core/http/react-ui/e2e/audio-transform.spec.js +++ b/core/http/react-ui/e2e/audio-transform.spec.js @@ -66,6 +66,33 @@ function makeFakeWav(name) { return { name, mimeType: 'audio/wav', buffer: buf } } +// Build a WAV carrying a real sine tone, long enough that the spectrogram +// STFT produces several frames (a few thousand samples). Used to exercise the +// FFT / heatmap path, which the 4-sample silent fixture can't. +function makeToneWav(name, freq = 1000, seconds = 0.4, sampleRate = 16000) { + const samples = Math.floor(seconds * sampleRate) + const dataLen = samples * 2 + const buf = Buffer.alloc(44 + dataLen) + buf.write('RIFF', 0) + buf.writeUInt32LE(36 + dataLen, 4) + buf.write('WAVE', 8) + buf.write('fmt ', 12) + buf.writeUInt32LE(16, 16) + buf.writeUInt16LE(1, 20) + buf.writeUInt16LE(1, 22) + buf.writeUInt32LE(sampleRate, 24) + buf.writeUInt32LE(sampleRate * 2, 28) + buf.writeUInt16LE(2, 32) + buf.writeUInt16LE(16, 34) + buf.write('data', 36) + buf.writeUInt32LE(dataLen, 40) + for (let i = 0; i < samples; i++) { + const v = Math.round(Math.sin((2 * Math.PI * freq * i) / sampleRate) * 16000) + buf.writeInt16LE(v, 44 + i * 2) + } + return { name, mimeType: 'audio/wav', buffer: buf } +} + test.describe('Audio Transform', () => { test.beforeEach(async ({ page }) => { await mockCapabilities(page, [ @@ -169,6 +196,26 @@ test.describe('Audio Transform', () => { await expect(page.getByTestId('media-history-item')).toHaveCount(1) }) + test('renders an input spectrogram on upload and an output one after transform', async ({ page }) => { + mockAudioTransform(page, 'enhanced.wav') + + await page.goto('/app/transform') + await expect(page.getByRole('button', { name: 'localvqe' })).toBeVisible({ timeout: 10_000 }) + + // Choosing a clip should render its input spectrogram immediately — no + // backend round-trip needed (it's computed client-side from the bytes). + await page.locator('input[type="file"]').first().setInputFiles(makeToneWav('tone.wav')) + await expect(page.getByTestId('spectrogram-input')).toBeVisible({ timeout: 10_000 }) + + // Until a transform runs the output side shows a "compare" placeholder. + await expect(page.getByText(/Transform to compare/)).toBeVisible() + + await page.getByRole('button', { name: /Transform/ }).last().click() + + // After processing, the output spectrum panel appears alongside the input. + await expect(page.getByText('Output spectrum')).toBeVisible({ timeout: 10_000 }) + }) + test('shows an error banner when the backend returns 4xx', async ({ page }) => { await page.route('**/audio/transformations', (route) => { if (route.request().method() !== 'POST') return route.continue() diff --git a/core/http/react-ui/src/App.css b/core/http/react-ui/src/App.css index a71eda5f5..01528ed7b 100644 --- a/core/http/react-ui/src/App.css +++ b/core/http/react-ui/src/App.css @@ -6984,6 +6984,88 @@ select.input { color: var(--color-primary); } +/* Spectrogram (AudioTransform spectral view) */ +.audio-spectrogram-pair { + display: grid; + grid-template-columns: 1fr 1fr; + gap: var(--spacing-md); +} +@media (max-width: 720px) { + .audio-spectrogram-pair { + grid-template-columns: 1fr; + } +} +.audio-spectrogram { + display: flex; + flex-direction: column; + gap: var(--spacing-xs); + width: 100%; + min-width: 0; +} +.audio-spectrogram__label { + font-size: var(--text-sm); + color: var(--color-text-secondary); +} +.audio-spectrogram__canvas-wrap { + position: relative; + width: 100%; + background: var(--color-surface-sunken); + border: 1px solid var(--color-border-subtle); + border-radius: var(--radius-md); + overflow: hidden; +} +.audio-spectrogram__canvas-wrap--empty { + display: flex; + align-items: center; + justify-content: center; +} +.audio-spectrogram__hint { + color: var(--color-text-muted); + font-size: var(--text-sm); +} +.audio-spectrogram__loading { + position: absolute; + inset: 0; + display: flex; + align-items: center; + justify-content: center; + color: var(--color-text-muted); + font-size: var(--text-sm); +} +.audio-spectrogram__error { + padding: var(--spacing-md); + color: var(--color-error); + font-size: var(--text-sm); +} +.audio-spectrogram__axis { + position: absolute; + left: 6px; + font-size: 10px; + color: var(--color-text-muted); + background: var(--color-bg-overlay); + padding: 0 4px; + border-radius: var(--radius-sm); + pointer-events: none; + font-variant-numeric: tabular-nums; +} +.audio-spectrogram__axis--top { + top: 4px; +} +.audio-spectrogram__axis--bottom { + bottom: 4px; +} +.audio-spectrogram__duration { + position: absolute; + right: 8px; + bottom: 6px; + font-size: 11px; + color: var(--color-text-muted); + font-variant-numeric: tabular-nums; + background: var(--color-bg-overlay); + padding: 1px 6px; + border-radius: var(--radius-sm); +} + /* Audio Transform Studio tab */ .audio-transform-stack { display: flex; diff --git a/core/http/react-ui/src/components/audio/Spectrogram.jsx b/core/http/react-ui/src/components/audio/Spectrogram.jsx new file mode 100644 index 000000000..98907611c --- /dev/null +++ b/core/http/react-ui/src/components/audio/Spectrogram.jsx @@ -0,0 +1,105 @@ +import { useEffect, useRef } from 'react' +import useSpectrogram from '../../hooks/useSpectrogram' + +// Spectrogram — canvas heatmap of a clip's magnitude STFT (time × frequency). +// Time runs left→right, frequency low→high bottom→top, brighter = more energy. +// Used on the AudioTransform page to show input next to output so the user can +// see which bands the model attenuates (dark gaps that were bright in the +// input). Mirrors WaveformPlayer's canvas/label/overlay structure. +export default function Spectrogram({ src, label, height = 140, testId }) { + const canvasRef = useRef(null) + const { spectrogram, frames, bins, maxFreq, duration, error, loading } = useSpectrogram(src) + + useEffect(() => { + const canvas = canvasRef.current + if (!canvas) return + const dpr = window.devicePixelRatio || 1 + const cssW = canvas.clientWidth + const cssH = height + canvas.width = Math.floor(cssW * dpr) + canvas.height = Math.floor(cssH * dpr) + const ctx = canvas.getContext('2d') + ctx.setTransform(dpr, 0, 0, dpr, 0, 0) + ctx.clearRect(0, 0, cssW, cssH) + if (!spectrogram || !frames || !bins) return + + // Paint at native (frames × bins) resolution into an offscreen canvas, + // then let drawImage smooth-scale it up — far cheaper than filling + // cssW×cssH rects, and the GPU handles the interpolation. + const img = ctx.createImageData(frames, bins) + for (let f = 0; f < frames; f++) { + for (let b = 0; b < bins; b++) { + const [r, g, bl] = magma(spectrogram[f * bins + b]) + // Flip the frequency axis: image row 0 is the top = highest freq. + const o = ((bins - 1 - b) * frames + f) * 4 + img.data[o] = r + img.data[o + 1] = g + img.data[o + 2] = bl + img.data[o + 3] = 255 + } + } + const off = document.createElement('canvas') + off.width = frames + off.height = bins + off.getContext('2d').putImageData(img, 0, 0) + ctx.imageSmoothingEnabled = true + ctx.drawImage(off, 0, 0, cssW, cssH) + }, [spectrogram, frames, bins, height]) + + if (!src) return null + + return ( +