Files
LocalAI/core/http/static/talk.js
Richard Palethorpe f9a850c02a feat(realtime): WebRTC support (#8790)
* feat(realtime): WebRTC support

Signed-off-by: Richard Palethorpe <io@richiejp.com>

* fix(tracing): Show full LLM opts and deltas

Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: Richard Palethorpe <io@richiejp.com>
2026-03-13 21:37:15 +01:00

607 lines
19 KiB
JavaScript

// ── DOM references ──
// Looked up once at script load; the page must define these elements first.
const connectButton = document.getElementById('connectButton');
const disconnectButton = document.getElementById('disconnectButton');
const testToneButton = document.getElementById('testToneButton');
const diagnosticsButton = document.getElementById('diagnosticsButton');
const audioPlayback = document.getElementById('audioPlayback');
const transcript = document.getElementById('transcript');
const statusIcon = document.getElementById('statusIcon');
const statusLabel = document.getElementById('statusLabel');
const connectionStatus = document.getElementById('connectionStatus');
const modelSelect = document.getElementById('modelSelect');
// ── Call state ──
let pc = null; // RTCPeerConnection for the active call (null when idle)
let dc = null; // 'oai-events' data channel carrying JSON events
let localStream = null; // microphone MediaStream captured via getUserMedia
let hasError = false; // when true, disconnect() keeps the error status visible
// Audio diagnostics state
let audioCtx = null; // AudioContext analyzing the remote playback stream
let analyser = null; // AnalyserNode feeding the waveform/spectrum canvases
let diagAnimFrame = null; // requestAnimationFrame id of the render loop
let statsInterval = null; // setInterval id of the WebRTC stats poller
let diagVisible = false; // diagnostics panel visibility flag
// ── Button wiring (handlers are function declarations, hoisted below) ──
connectButton.addEventListener('click', connect);
disconnectButton.addEventListener('click', disconnect);
testToneButton.addEventListener('click', sendTestTone);
diagnosticsButton.addEventListener('click', toggleDiagnostics);
// Reveal the pipeline breakdown whenever the selected model changes.
modelSelect.addEventListener('change', () => {
  const selected = modelSelect.options[modelSelect.selectedIndex];
  const details = document.getElementById('pipelineDetails');
  if (!selected || !selected.value) {
    details.classList.add('hidden');
    return;
  }
  // Each pipeline stage label mirrors a data-* attribute on the <option>.
  const stages = [
    ['pipelineVAD', 'vad'],
    ['pipelineSTT', 'stt'],
    ['pipelineLLM', 'llm'],
    ['pipelineTTS', 'tts'],
  ];
  for (const [elementId, key] of stages) {
    document.getElementById(elementId).textContent = selected.dataset[key] || '--';
  }
  details.classList.remove('hidden');
  // Seed the voice field with the model's default unless the user typed one.
  const voiceInput = document.getElementById('voiceInput');
  if (!voiceInput.dataset.userEdited) {
    voiceInput.value = selected.dataset.voice || '';
  }
});
// Once the user edits the voice manually, stop auto-filling it.
document.getElementById('voiceInput').addEventListener('input', function() {
  this.dataset.userEdited = 'true';
});
// Pick the first real model automatically when the page loads
// (index 0 is assumed to be a placeholder option).
if (modelSelect.options.length > 1) {
  modelSelect.selectedIndex = 1;
  modelSelect.dispatchEvent(new Event('change'));
}
// Return the currently selected pipeline model id ('' when none is chosen).
function getModel() {
  const { value } = modelSelect;
  return value;
}
// Update the connection status banner (icon, label text, container colors)
// to reflect `state`. `text`, when given, overrides the label; otherwise the
// raw state name is shown.
function setStatus(state, text) {
statusLabel.textContent = text || state;
// Reset icon and container to a neutral base, then layer state styling.
statusIcon.className = 'fa-solid fa-circle';
connectionStatus.className = 'rounded-lg p-4 mb-4 flex items-center space-x-3';
switch (state) {
case 'disconnected':
statusIcon.classList.add('text-[var(--color-text-secondary)]');
connectionStatus.classList.add('bg-[var(--color-bg-primary)]/50', 'border', 'border-[var(--color-border-subtle)]');
// NOTE(review): unlike every other state (which assigns statusLabel.className
// outright), this case only *adds* a class, so label classes left over from a
// previous state (e.g. 'text-[var(--color-success)]') can accumulate —
// confirm whether the label should be reset here as well.
statusLabel.classList.add('text-[var(--color-text-secondary)]');
break;
case 'connecting':
// Spinner icon replaces the plain circle entirely.
statusIcon.className = 'fa-solid fa-spinner fa-spin text-[var(--color-primary)]';
connectionStatus.classList.add('bg-[var(--color-primary-light)]', 'border', 'border-[var(--color-primary)]/30');
statusLabel.className = 'font-medium text-[var(--color-primary)]';
break;
case 'connected':
statusIcon.classList.add('text-[var(--color-success)]');
connectionStatus.classList.add('bg-[var(--color-success)]/10', 'border', 'border-[var(--color-success)]/30');
statusLabel.className = 'font-medium text-[var(--color-success)]';
break;
case 'listening':
statusIcon.className = 'fa-solid fa-microphone text-[var(--color-success)]';
connectionStatus.classList.add('bg-[var(--color-success)]/10', 'border', 'border-[var(--color-success)]/30');
statusLabel.className = 'font-medium text-[var(--color-success)]';
break;
case 'thinking':
statusIcon.className = 'fa-solid fa-brain fa-beat text-[var(--color-primary)]';
connectionStatus.classList.add('bg-[var(--color-primary-light)]', 'border', 'border-[var(--color-primary)]/30');
statusLabel.className = 'font-medium text-[var(--color-primary)]';
break;
case 'speaking':
statusIcon.className = 'fa-solid fa-volume-high fa-beat-fade text-[var(--color-accent)]';
connectionStatus.classList.add('bg-[var(--color-accent)]/10', 'border', 'border-[var(--color-accent)]/30');
statusLabel.className = 'font-medium text-[var(--color-accent)]';
break;
case 'error':
statusIcon.classList.add('text-[var(--color-error)]');
connectionStatus.classList.add('bg-[var(--color-error-light)]', 'border', 'border-[var(--color-error)]/30');
statusLabel.className = 'font-medium text-[var(--color-error)]';
break;
}
}
// The assistant message element currently being streamed (null when none).
let streamingEntry = null;
// Append one chat line to the transcript pane and return the created element.
// role 'user' gets a person icon; anything else gets the robot icon.
function addTranscript(role, text) {
  // The pane starts with an italic placeholder; drop it on the first message.
  transcript.querySelector('.italic')?.remove();
  const icon = document.createElement('i');
  icon.className = role === 'user'
    ? 'fa-solid fa-user text-[var(--color-primary)] mt-1 flex-shrink-0'
    : 'fa-solid fa-robot text-[var(--color-accent)] mt-1 flex-shrink-0';
  const msg = document.createElement('p');
  msg.className = 'text-[var(--color-text-primary)]';
  msg.textContent = text;
  const entry = document.createElement('div');
  entry.className = 'flex items-start space-x-2';
  entry.append(icon, msg);
  transcript.appendChild(entry);
  // Keep the newest message in view.
  transcript.scrollTop = transcript.scrollHeight;
  return entry;
}
// Append a streamed text delta to the in-progress assistant message,
// creating the transcript entry on the first delta.
function updateStreamingTranscript(role, delta) {
  if (streamingEntry) {
    const msg = streamingEntry.querySelector('p');
    if (msg) msg.textContent += delta;
    transcript.scrollTop = transcript.scrollHeight;
    return;
  }
  streamingEntry = addTranscript(role, delta);
}
// Replace the streamed message with its final text (the server's canonical
// transcript), or add a fresh entry when nothing was streaming. Always
// clears the streaming state and keeps the pane scrolled to the bottom.
function finalizeStreamingTranscript(role, fullText) {
  if (!streamingEntry) {
    addTranscript(role, fullText);
  } else {
    const msg = streamingEntry.querySelector('p');
    if (msg) msg.textContent = fullText;
    streamingEntry = null;
  }
  transcript.scrollTop = transcript.scrollHeight;
}
// Push the user's instructions / voice / language settings to the server as
// a session.update event. No-op when the data channel is not open or the
// user configured nothing.
function sendSessionUpdate() {
  if (!dc || dc.readyState !== 'open') return;
  const instructions = document.getElementById('instructionsInput').value.trim();
  const voice = document.getElementById('voiceInput').value.trim();
  const language = document.getElementById('languageInput').value.trim();
  if (!instructions && !voice && !language) return;
  const session = {};
  if (instructions) session.instructions = instructions;
  if (voice || language) {
    const audio = {};
    if (voice) audio.output = { voice };
    if (language) audio.input = { transcription: { language } };
    session.audio = audio;
  }
  const event = { type: 'session.update', session };
  console.log('[session.update]', event);
  dc.send(JSON.stringify(event));
}
// Dispatch a decoded server event to the matching UI reaction.
// Unknown event types are logged (first line) and otherwise ignored.
function handleServerEvent(event) {
  console.log('[event]', event.type, event);
  const handlers = {
    'session.created': () => {
      // Session is ready — send any user settings before listening starts.
      sendSessionUpdate();
      setStatus('listening', 'Listening...');
    },
    'session.updated': () => {
      console.log('[session.updated] Session settings applied', event.session);
    },
    'input_audio_buffer.speech_started': () => {
      setStatus('listening', 'Hearing you speak...');
    },
    'input_audio_buffer.speech_stopped': () => {
      setStatus('thinking', 'Processing...');
    },
    'conversation.item.input_audio_transcription.completed': () => {
      if (event.transcript) {
        addTranscript('user', event.transcript);
      }
      setStatus('thinking', 'Generating response...');
    },
    'response.output_audio_transcript.delta': () => {
      // Incremental transcript — update the in-progress assistant message.
      if (event.delta) {
        updateStreamingTranscript('assistant', event.delta);
      }
    },
    'response.output_audio_transcript.done': () => {
      if (event.transcript) {
        finalizeStreamingTranscript('assistant', event.transcript);
      }
    },
    'response.output_audio.delta': () => {
      setStatus('speaking', 'Speaking...');
    },
    'response.done': () => {
      setStatus('listening', 'Listening...');
    },
    'error': () => {
      console.error('Server error:', event.error);
      hasError = true;
      setStatus('error', 'Error: ' + (event.error?.message || 'Unknown error'));
    },
  };
  handlers[event.type]?.();
}
// Establish a WebRTC call with the selected pipeline model.
// Flow: mic capture → RTCPeerConnection with audio track + data channel →
// SDP offer POSTed to v1/realtime/calls → server's SDP answer applied.
// Server JSON events then arrive on the 'oai-events' data channel and are
// routed through handleServerEvent(). Any failure sets the error status and
// tears everything down via disconnect().
async function connect() {
const model = getModel();
if (!model) {
alert('Please select a pipeline model first.');
return;
}
// getUserMedia is only exposed in secure contexts (HTTPS or localhost).
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
setStatus('error', 'Microphone access requires HTTPS or localhost.');
return;
}
setStatus('connecting', 'Connecting...');
// Switch the button row to the "in call" layout up front; disconnect()
// restores it if anything below fails.
connectButton.style.display = 'none';
disconnectButton.style.display = '';
testToneButton.style.display = '';
diagnosticsButton.style.display = '';
try {
// Get microphone access (prompts the user on first use).
localStream = await navigator.mediaDevices.getUserMedia({ audio: true });
// Create peer connection (no ICE servers configured — assumes the server
// is directly reachable, e.g. same host).
pc = new RTCPeerConnection({});
// Add local audio track(s) so the mic is offered to the server.
for (const track of localStream.getAudioTracks()) {
pc.addTrack(track, localStream);
}
// Handle remote audio track (server's TTS output): play it directly.
pc.ontrack = (event) => {
audioPlayback.srcObject = event.streams[0];
// If diagnostics panel is open, start analyzing the new stream
if (diagVisible) startDiagnostics();
};
// Create the events data channel (client must create it so m=application
// is included in the SDP offer — the answerer cannot add new m-lines)
dc = pc.createDataChannel('oai-events');
dc.onmessage = (msg) => {
try {
// Messages may arrive as strings or binary; decode before parsing.
const text = typeof msg.data === 'string'
? msg.data
: new TextDecoder().decode(msg.data);
const event = JSON.parse(text);
handleServerEvent(event);
} catch (e) {
console.error('Failed to parse server event:', e);
}
};
dc.onclose = () => {
console.log('Data channel closed');
};
pc.onconnectionstatechange = () => {
console.log('Connection state:', pc.connectionState);
if (pc.connectionState === 'connected') {
// Transport is up; 'session.created' will move us to 'listening'.
setStatus('connected', 'Connected, waiting for session...');
} else if (pc.connectionState === 'failed' || pc.connectionState === 'closed') {
disconnect();
}
};
// Create offer and apply it locally (starts ICE gathering).
const offer = await pc.createOffer();
await pc.setLocalDescription(offer);
// Wait for ICE gathering so the offer we POST contains all candidates
// (non-trickle ICE — the HTTP exchange happens exactly once).
await new Promise((resolve) => {
if (pc.iceGatheringState === 'complete') {
resolve();
} else {
pc.onicegatheringstatechange = () => {
if (pc.iceGatheringState === 'complete') resolve();
};
// Timeout after 5s so a stalled gather can't hang the connect flow
// (resolving twice is harmless — extra resolves are ignored).
setTimeout(resolve, 5000);
}
});
// Send offer to server (relative URL — same origin as the page).
const response = await fetch('v1/realtime/calls', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
sdp: pc.localDescription.sdp,
model: model,
}),
});
if (!response.ok) {
const err = await response.json().catch(() => ({ error: 'Unknown error' }));
throw new Error(err.error || `HTTP ${response.status}`);
}
const data = await response.json();
// Set remote description (server's answer) — completes the handshake.
await pc.setRemoteDescription({
type: 'answer',
sdp: data.sdp,
});
console.log('WebRTC connection established, session:', data.session_id);
} catch (err) {
console.error('Connection failed:', err);
// Mark the error so disconnect() leaves the error status on screen.
hasError = true;
setStatus('error', 'Connection failed: ' + err.message);
disconnect();
}
}
// Ask the server to play a test tone over the audio track (useful for
// verifying the return audio path end-to-end).
function sendTestTone() {
  const channelReady = dc && dc.readyState === 'open';
  if (!channelReady) {
    console.warn('Data channel not open');
    return;
  }
  console.log('[test-tone] Requesting server test tone...');
  dc.send(JSON.stringify({ type: 'test_tone' }));
  addTranscript('assistant', '(Test tone requested — you should hear a 440 Hz beep)');
}
// Tear down the call: diagnostics, data channel, peer connection, mic
// tracks, and playback element, then restore the button row and status UI.
// Safe to call when already disconnected (all steps are guarded).
function disconnect() {
  stopDiagnostics();
  if (dc) {
    dc.close();
    dc = null;
  }
  if (pc) {
    pc.close();
    pc = null;
  }
  if (localStream) {
    // Stop mic tracks so the browser releases the capture indicator.
    localStream.getTracks().forEach(t => t.stop());
    localStream = null;
  }
  audioPlayback.srcObject = null;
  // Fix: drop any half-streamed assistant message so a later session does
  // not append its transcript deltas to a stale DOM entry.
  streamingEntry = null;
  // Keep an error status visible rather than overwriting it with
  // "Disconnected"; then clear the flag so the next session starts clean.
  if (!hasError) {
    setStatus('disconnected', 'Disconnected');
  }
  hasError = false;
  connectButton.style.display = '';
  disconnectButton.style.display = 'none';
  testToneButton.style.display = 'none';
  diagnosticsButton.style.display = 'none';
}
// ── Audio Diagnostics ──
// Flip the diagnostics panel's visibility and start or stop its
// analysis/render/polling machinery accordingly.
function toggleDiagnostics() {
  diagVisible = !diagVisible;
  const panel = document.getElementById('diagnosticsPanel');
  if (diagVisible) {
    panel.style.display = '';
    startDiagnostics();
  } else {
    panel.style.display = 'none';
    stopDiagnostics();
  }
}
// Begin analyzing the remote audio: attach an AnalyserNode to the playback
// stream, start the canvas render loop, and poll WebRTC stats every second.
// Safe to call repeatedly — each sub-system starts only once.
function startDiagnostics() {
  // Nothing to analyze until the server's audio track has arrived.
  if (!audioPlayback.srcObject) return;
  // Create AudioContext and connect the remote stream to an AnalyserNode.
  if (!audioCtx) {
    audioCtx = new AudioContext();
    const source = audioCtx.createMediaStreamSource(audioPlayback.srcObject);
    analyser = audioCtx.createAnalyser();
    // Large FFT for fine frequency resolution (test-tone inspection).
    analyser.fftSize = 8192;
    analyser.smoothingTimeConstant = 0.3;
    source.connect(analyser);
    document.getElementById('statSampleRate').textContent = audioCtx.sampleRate + ' Hz';
  }
  // Fix: a context created outside a user gesture (this function is also
  // reached from pc.ontrack) can start 'suspended' under browser autoplay
  // policy, leaving the analyser silent — resume it so samples flow.
  if (audioCtx.state === 'suspended') {
    audioCtx.resume();
  }
  // Start the rendering loop (drawDiagnostics reschedules itself).
  if (!diagAnimFrame) {
    drawDiagnostics();
  }
  // Start WebRTC stats polling (immediate sample, then once per second).
  if (!statsInterval) {
    pollWebRTCStats();
    statsInterval = setInterval(pollWebRTCStats, 1000);
  }
}
// Halt the render loop and stats polling, and release the AudioContext.
// Idempotent: every teardown step is guarded by its own null check.
function stopDiagnostics() {
  if (diagAnimFrame !== null) {
    cancelAnimationFrame(diagAnimFrame);
    diagAnimFrame = null;
  }
  if (statsInterval !== null) {
    clearInterval(statsInterval);
    statsInterval = null;
  }
  if (audioCtx !== null) {
    audioCtx.close();
    audioCtx = null;
    analyser = null;
  }
}
// Render one frame of audio diagnostics from the AnalyserNode:
//   1. time-domain waveform + RMS level (dBFS)
//   2. FFT spectrum (0-4 kHz) with a 440 Hz marker, peak frequency, and a
//      THD estimate relative to a 440 Hz fundamental (the test tone).
// Reschedules itself via requestAnimationFrame while diagnostics are open.
function drawDiagnostics() {
if (!analyser || !diagVisible) {
// Stop the loop and mark it stopped so startDiagnostics can restart it.
diagAnimFrame = null;
return;
}
diagAnimFrame = requestAnimationFrame(drawDiagnostics);
// ── Waveform ──
const waveCanvas = document.getElementById('waveformCanvas');
const wCtx = waveCanvas.getContext('2d');
// Time-domain samples, nominally in [-1, 1].
const timeData = new Float32Array(analyser.fftSize);
analyser.getFloatTimeDomainData(timeData);
const w = waveCanvas.width;
const h = waveCanvas.height;
wCtx.fillStyle = '#000';
wCtx.fillRect(0, 0, w, h);
wCtx.strokeStyle = '#0f0';
wCtx.lineWidth = 1;
wCtx.beginPath();
const sliceWidth = w / timeData.length;
let x = 0;
for (let i = 0; i < timeData.length; i++) {
// Map sample +1 → top (y=0) and -1 → bottom (y=h); 0 is mid-height.
const y = (1 - timeData[i]) * h / 2;
if (i === 0) wCtx.moveTo(x, y);
else wCtx.lineTo(x, y);
x += sliceWidth;
}
wCtx.stroke();
// Compute RMS of the frame and report it in dBFS (20*log10, full scale = 1).
let sumSq = 0;
for (let i = 0; i < timeData.length; i++) sumSq += timeData[i] * timeData[i];
const rms = Math.sqrt(sumSq / timeData.length);
const rmsDb = rms > 0 ? (20 * Math.log10(rms)).toFixed(1) : '-Inf';
document.getElementById('statRMS').textContent = rmsDb + ' dBFS';
// ── FFT Spectrum ──
const specCanvas = document.getElementById('spectrumCanvas');
const sCtx = specCanvas.getContext('2d');
// getFloatFrequencyData yields per-bin magnitudes in dB.
const freqData = new Float32Array(analyser.frequencyBinCount);
analyser.getFloatFrequencyData(freqData);
const sw = specCanvas.width;
const sh = specCanvas.height;
sCtx.fillStyle = '#000';
sCtx.fillRect(0, 0, sw, sh);
// Draw spectrum (0 to 4kHz range for speech/tone analysis)
const sampleRate = audioCtx.sampleRate;
// Frequency width of one FFT bin.
const binHz = sampleRate / analyser.fftSize;
const maxFreqDisplay = 4000;
const maxBin = Math.min(Math.ceil(maxFreqDisplay / binHz), freqData.length);
const barWidth = sw / maxBin;
sCtx.fillStyle = '#0cf';
// Track the loudest bin while drawing, for the peak-frequency readout.
let peakBin = 0;
let peakVal = -Infinity;
for (let i = 0; i < maxBin; i++) {
const db = freqData[i];
if (db > peakVal) {
peakVal = db;
peakBin = i;
}
// Map dB (-100 to 0) to pixel height
const barH = Math.max(0, ((db + 100) / 100) * sh);
sCtx.fillRect(i * barWidth, sh - barH, Math.max(1, barWidth - 0.5), barH);
}
// Draw frequency labels every 500 Hz along the bottom edge.
sCtx.fillStyle = '#888';
sCtx.font = '10px monospace';
for (let f = 500; f <= maxFreqDisplay; f += 500) {
const xPos = (f / binHz) * barWidth;
sCtx.fillText(f + '', xPos - 10, sh - 2);
}
// Mark 440 Hz (the expected test-tone fundamental) with a red line.
const bin440 = Math.round(440 / binHz);
const x440 = bin440 * barWidth;
sCtx.strokeStyle = '#f00';
sCtx.lineWidth = 1;
sCtx.beginPath();
sCtx.moveTo(x440, 0);
sCtx.lineTo(x440, sh);
sCtx.stroke();
sCtx.fillStyle = '#f00';
sCtx.fillText('440', x440 + 2, 10);
const peakFreq = peakBin * binHz;
document.getElementById('statPeakFreq').textContent =
peakFreq.toFixed(0) + ' Hz (' + peakVal.toFixed(1) + ' dB)';
// Compute THD (Total Harmonic Distortion) relative to 440 Hz
// THD = sqrt(sum of harmonic powers / fundamental power)
// Bins hold dB values, so power = 10^(dB/10); harmonics 2..10 are summed.
const fundamentalBin = Math.round(440 / binHz);
const fundamentalPower = Math.pow(10, freqData[fundamentalBin] / 10);
let harmonicPower = 0;
for (let h = 2; h <= 10; h++) {
const hBin = Math.round(440 * h / binHz);
if (hBin < freqData.length) {
harmonicPower += Math.pow(10, freqData[hBin] / 10);
}
}
// Guard: fundamentalPower is 0 only when the bin reads -Infinity dB.
const thd = fundamentalPower > 0
? (Math.sqrt(harmonicPower / fundamentalPower) * 100).toFixed(1)
: '--';
document.getElementById('statTHD').textContent = thd + '%';
}
// Fetch the peer connection's stats once and render the inbound audio RTP
// report into the diagnostics panel: summary fields (packets, loss, jitter,
// concealment) plus a raw multi-line dump.
async function pollWebRTCStats() {
  if (!pc) return;
  try {
    const stats = await pc.getStats();
    const lines = [];
    for (const report of stats.values()) {
      // Only the inbound audio stream is of interest here.
      if (report.type !== 'inbound-rtp' || report.kind !== 'audio') continue;
      document.getElementById('statPacketsRecv').textContent =
        report.packetsReceived ?? '--';
      document.getElementById('statPacketsLost').textContent =
        report.packetsLost ?? '--';
      document.getElementById('statJitter').textContent =
        report.jitter !== undefined ? (report.jitter * 1000).toFixed(1) + ' ms' : '--';
      document.getElementById('statConcealed').textContent =
        report.concealedSamples ?? '--';
      lines.push('── inbound-rtp (audio) ──');
      lines.push(' packetsReceived: ' + report.packetsReceived);
      lines.push(' packetsLost: ' + report.packetsLost);
      lines.push(' jitter: ' + (report.jitter !== undefined ? (report.jitter * 1000).toFixed(2) + ' ms' : 'N/A'));
      lines.push(' bytesReceived: ' + report.bytesReceived);
      lines.push(' concealedSamples: ' + report.concealedSamples);
      lines.push(' silentConcealedSamples: ' + report.silentConcealedSamples);
      lines.push(' totalSamplesReceived: ' + report.totalSamplesReceived);
      lines.push(' insertedSamplesForDecel: ' + report.insertedSamplesForDeceleration);
      lines.push(' removedSamplesForAccel: ' + report.removedSamplesForAcceleration);
      lines.push(' jitterBufferDelay: ' + (report.jitterBufferDelay !== undefined ? report.jitterBufferDelay.toFixed(3) + ' s' : 'N/A'));
      lines.push(' jitterBufferTargetDelay: ' + (report.jitterBufferTargetDelay !== undefined ? report.jitterBufferTargetDelay.toFixed(3) + ' s' : 'N/A'));
      lines.push(' jitterBufferEmittedCount: ' + report.jitterBufferEmittedCount);
    }
    document.getElementById('statsRaw').textContent = lines.join('\n');
  } catch (e) {
    // Stats can transiently fail while the connection is tearing down.
    console.warn('Stats polling error:', e);
  }
}