Compare commits

...

10 Commits

Author SHA1 Message Date
Alex Cheema
6eb8f9d9f5 feat: add prefill progress bar for long prompts
Shows real-time progress during prompt processing (prefill phase).
Progress is sent via SSE named events that maintain OpenAI API compatibility.

- Add PrefillProgress event type
- Wire prompt_progress_callback through MLX stream_generate
- Send progress events directly from callback for real-time updates
- Add PrefillProgressBar.svelte component
- Parse event: prefill_progress SSE events in dashboard

Note: prefill_step_size temporarily set to 256 for testing (normally 2048)
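Below is a minimal, hypothetical client-side sketch (not code from this change) of consuming such named SSE events; only the `prefill_progress` event name and its `processed`/`total` fields come from this commit, everything else is illustrative.

```python
# Hypothetical sketch of parsing named SSE events; only the "prefill_progress"
# event name and its processed/total fields come from this change.
import json

def handle_sse_lines(lines):
    event_type = ""
    for raw in lines:
        line = raw.strip()
        if not line:                        # blank line terminates the current event
            event_type = ""
            continue
        if line.startswith("event: "):      # named event, e.g. prefill_progress
            event_type = line[len("event: "):]
            continue
        if line.startswith("data: "):
            data = line[len("data: "):]
            if data == "[DONE]":
                break
            payload = json.loads(data)
            if event_type == "prefill_progress":
                print(f"prefill: {payload['processed']}/{payload['total']} tokens")
            # otherwise: a regular OpenAI-style chat.completion.chunk payload
```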

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-17 17:12:59 +00:00
Alex Cheema
663a0faaeb feat: add uncertainty visualization with token-level logprobs
Wire log probabilities from MLX through the API to enable uncertainty
visualization in the dashboard:

Backend:
- Extract top-k logprobs from MLX stream_generate output
- Add logprob and top_logprobs fields to GenerationResponse and TokenChunk
- Populate Logprobs in streaming API response when requested

Dashboard:
- Add TokenHeatmap component with color-coded token confidence
- Parse logprobs from SSE responses and store on messages
- Add toggle button to switch between normal and uncertainty view
- Hover tooltip shows exact probability and top-5 alternatives

Color scheme:
- Green (>80%): High confidence
- Yellow (50-80%): Medium confidence
- Orange (20-50%): Low confidence
- Red (<20%): Very low confidence
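These probability buckets correspond to raw logprobs via probability = exp(logprob) (ln 0.8 ≈ -0.22, ln 0.5 ≈ -0.69, ln 0.2 ≈ -1.61), matching the thresholds documented in the TokenHeatmap component. An illustrative sketch, not the component code:

```python
# Illustrative only: maps a token logprob to the confidence buckets listed above.
import math

def confidence_bucket(logprob: float) -> str:
    p = math.exp(logprob)   # probability = exp(logprob)
    if p > 0.8:
        return "high"       # green
    if p > 0.5:
        return "medium"     # yellow
    if p > 0.2:
        return "low"        # orange
    return "very low"       # red
```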

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-17 16:45:23 +00:00
Alex Cheema
0a58aa73ec feat: add Claude Messages API and OpenAI Responses API support
Adds two new API endpoints that wrap the existing chat completions:

- /v1/messages - Claude Messages API compatible endpoint
- /v1/responses - OpenAI Responses API compatible endpoint

Both support streaming (SSE) and non-streaming modes with proper
token usage reporting from actual inference stats.

Also adds top_k sampling parameter and stop sequence support to the
MLX inference engine.
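A hypothetical usage sketch for the two new endpoints in non-streaming mode; the base URL and model name are placeholders - substitute your deployment's address and a model with a running instance.

```python
# Hypothetical usage sketch; BASE_URL and the model name are placeholders.
import requests

BASE_URL = "http://localhost:8000"  # example only - use your exo API address

# Claude Messages API compatible endpoint
r = requests.post(f"{BASE_URL}/v1/messages", json={
    "model": "llama-3.2-1b",
    "max_tokens": 100,
    "messages": [{"role": "user", "content": "Hello"}],
})
print(r.json()["content"][0]["text"])

# OpenAI Responses API compatible endpoint
r = requests.post(f"{BASE_URL}/v1/responses", json={
    "model": "llama-3.2-1b",
    "input": "Hello",
})
print(r.json()["output_text"])
```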

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-16 15:14:21 +00:00
Sami Khan
6e6567a802 resolve issue #1070 (#1076)
## Motivation

https://github.com/exo-explore/exo/issues/1070

## Changes

Added a check in ChatForm.svelte to reset selectedChatModel when it no
longer matches any running instance.

## Why It Works

The $effect now detects when the selected model is stale (not in
availableModels()) and resets to the first available model.

## Test Plan

### Manual Testing

1. Create instance of Model A → Delete it → Create instance of Model B →
Chat
2. Verify request goes to Model B (not Model A)

---------

Co-authored-by: Alex Cheema <41707476+AlexCheema@users.noreply.github.com>
2026-01-15 20:00:41 +00:00
rltakashige
a735dad667 Parse GPT OSS in runner (#1160)
## Motivation

Simplification of the API + moving model-specific code to the runner


## Test Plan

### Manual Testing
Tested that GPT OSS outputs are parsed correctly on the dashboard.

### Automated Testing
2026-01-15 19:53:55 +00:00
rltakashige
aaf4e36bc3 FIX GPT OSS (#1165)
## Motivation

Adds several unmerged fixes for GPT OSS.
Also adds GPT OSS 20B MXFP4 Q8 instead of Q4 for numerical stability
(Q4 is unstable for MLX LM too)


## Test Plan

### Manual Testing
Manually tested. No further gibberish responses.

### Automated Testing
Ran EXO Bench - pipeline, tensor and single node work on both 20B and
120B models
2026-01-15 19:20:17 +00:00
Evan Quiney
3e623ccf0d up http timeout to 3 seconds and retry on BadStatusLine (#1164)
We're seeing a lot of network churn - perhaps this is a connection
timing-out issue? Let's also retry after a second.

## testing
none yet
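For illustration, the general shape of the pattern (3-second timeout, retry after a second on a bad status line) in a stdlib-only sketch - this is not the project's actual HTTP client code:

```python
# Illustrative sketch only - not exo's HTTP client. Shows a 3 s timeout and a
# single retry after 1 s when the server returns a bad/empty status line.
import http.client
import time

def get_with_retry(host: str, path: str, retries: int = 1) -> bytes:
    for attempt in range(retries + 1):
        conn = http.client.HTTPConnection(host, timeout=3)
        try:
            conn.request("GET", path)
            return conn.getresponse().read()
        except http.client.BadStatusLine:
            if attempt == retries:
                raise
            time.sleep(1)   # retry after a second
        finally:
            conn.close()
    raise RuntimeError("unreachable")
```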

---------

Co-authored-by: Alex Cheema <alexcheema123@gmail.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-15 18:15:12 +00:00
Evan Quiney
c22dad8a7d dashboard: add peer: true to package lock (#1162)
This happens every time I run npm install - let's upstream it.

## testing
dashboard builds and renders
2026-01-15 17:01:43 +00:00
Evan
4bc4d50685 rust: remove dead code
The system custodian has been made unnecessary by the Swift app - we
can remove it.

## testing
everything still builds
2026-01-15 16:51:46 +00:00
Jake Hillion
e0aab46fd8 model_cards.py: clean up commented out code
Clean up the commented-out code and make sure the comments are unified.
Carrying around the commented-out code means people making changes to
model_cards are supposed to update it, but that's not clear and won't be
picked up by type checking etc. Drop it for now - it's in the git
history.

Also make the rest of the comments a bit more uniform, and place
comments about a specific model card inside the model card (instead of
above) so they don't get lost when code is added/moved around.

Test plan:
- my eyes
2026-01-15 13:21:58 +00:00
32 changed files with 2332 additions and 369 deletions

19
Cargo.lock generated
View File

@@ -4340,25 +4340,6 @@ dependencies = [
"libc",
]
[[package]]
name = "system_custodian"
version = "0.0.1"
dependencies = [
"delegate",
"derive_more",
"either",
"extend",
"futures",
"futures-timer",
"impl-trait-for-tuples",
"keccak-const",
"log",
"thiserror 2.0.17",
"tokio",
"tracing-subscriber",
"util",
]
[[package]]
name = "tagptr"
version = "0.2.0"

View File

@@ -3,7 +3,6 @@ resolver = "3"
members = [
"rust/networking",
"rust/exo_pyo3_bindings",
"rust/system_custodian",
"rust/util",
]
@@ -25,7 +24,6 @@ opt-level = 3
[workspace.dependencies]
## Crate members as common dependencies
networking = { path = "rust/networking" }
system_custodian = { path = "rust/system_custodian" }
util = { path = "rust/util" }
# Proc-macro authoring tools

View File

@@ -863,6 +863,7 @@
"integrity": "sha512-oH8tXw7EZnie8FdOWYrF7Yn4IKrqTFHhXvl8YxXxbKwTMcD/5NNCryUSEXRk2ZR4ojnub0P8rNrsVGHXWqIDtA==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@standard-schema/spec": "^1.0.0",
"@sveltejs/acorn-typescript": "^1.0.5",
@@ -902,6 +903,7 @@
"integrity": "sha512-Y1Cs7hhTc+a5E9Va/xwKlAJoariQyHY+5zBgCZg4PFWNYQ1nMN9sjK1zhw1gK69DuqVP++sht/1GZg1aRwmAXQ==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@sveltejs/vite-plugin-svelte-inspector": "^4.0.1",
"debug": "^4.4.1",
@@ -1518,6 +1520,7 @@
"integrity": "sha512-LCCV0HdSZZZb34qifBsyWlUmok6W7ouER+oQIGBScS8EsZsQbrtFTUrDX4hOl+CS6p7cnNC4td+qrSVGSCTUfQ==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"undici-types": "~6.21.0"
}
@@ -1527,6 +1530,7 @@
"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz",
"integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==",
"license": "MIT",
"peer": true,
"bin": {
"acorn": "bin/acorn"
},
@@ -1939,6 +1943,7 @@
"integrity": "sha512-fmTRWbNMmsmWq6xJV8D19U/gw/bwrHfNXxrIN+HfZgnzqTHp9jOmKMhsTUjXOJnZOdZY9Q28y4yebKzqDKlxlQ==",
"dev": true,
"license": "ISC",
"peer": true,
"engines": {
"node": ">=12"
}
@@ -2646,6 +2651,7 @@
"integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==",
"dev": true,
"license": "MIT",
"peer": true,
"engines": {
"node": ">=12"
},
@@ -2833,6 +2839,7 @@
"resolved": "https://registry.npmjs.org/svelte/-/svelte-5.45.3.tgz",
"integrity": "sha512-ngKXNhNvwPzF43QqEhDOue7TQTrG09em1sd4HBxVF0Wr2gopAmdEWan+rgbdgK4fhBtSOTJO8bYU4chUG7VXZQ==",
"license": "MIT",
"peer": true,
"dependencies": {
"@jridgewell/remapping": "^2.3.4",
"@jridgewell/sourcemap-codec": "^1.5.0",
@@ -2977,6 +2984,7 @@
"integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==",
"dev": true,
"license": "Apache-2.0",
"peer": true,
"bin": {
"tsc": "bin/tsc",
"tsserver": "bin/tsserver"
@@ -2998,6 +3006,7 @@
"integrity": "sha512-+Oxm7q9hDoLMyJOYfUYBuHQo+dkAloi33apOPP56pzj+vsdJDzr+j1NISE5pyaAuKL4A3UD34qd0lx5+kfKp2g==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"esbuild": "^0.25.0",
"fdir": "^6.4.4",

View File

@@ -60,12 +60,39 @@
return models;
});
// Auto-select the first available model if none is selected
// Track previous model IDs to detect newly added models (plain variable to avoid reactive loop)
let previousModelIds: Set<string> = new Set();
// Auto-select the first available model if none is selected, if current selection is stale, or if a new model is added
$effect(() => {
const models = availableModels();
if (models.length > 0 && !currentModel) {
setSelectedChatModel(models[0].id);
const currentModelIds = new Set(models.map(m => m.id));
if (models.length > 0) {
// Find newly added models (in current but not in previous)
const newModels = models.filter(m => !previousModelIds.has(m.id));
// If no model selected, select the first available
if (!currentModel) {
setSelectedChatModel(models[0].id);
}
// If current model is stale (no longer has a running instance), reset to first available
else if (!models.some(m => m.id === currentModel)) {
setSelectedChatModel(models[0].id);
}
// If a new model was just added, select it
else if (newModels.length > 0 && previousModelIds.size > 0) {
setSelectedChatModel(newModels[0].id);
}
} else {
// No instances running - clear the selected model
if (currentModel) {
setSelectedChatModel('');
}
}
// Update previous model IDs for next comparison
previousModelIds = currentModelIds;
});
function getInstanceModelId(instanceWrapped: unknown): string {

View File

@@ -1,7 +1,7 @@
<script lang="ts">
import {
messages,
currentResponse,
import {
messages,
currentResponse,
isLoading,
deleteMessage,
editAndRegenerate,
@@ -9,6 +9,8 @@
} from '$lib/stores/app.svelte';
import type { MessageAttachment } from '$lib/stores/app.svelte';
import MarkdownContent from './MarkdownContent.svelte';
import TokenHeatmap from './TokenHeatmap.svelte';
import PrefillProgressBar from './PrefillProgressBar.svelte';
interface Props {
class?: string;
@@ -95,6 +97,23 @@
let copiedMessageId = $state<string | null>(null);
let expandedThinkingMessageIds = $state<Set<string>>(new Set());
// Uncertainty view state - tracks which messages show token heatmap
let uncertaintyViewMessageIds = $state<Set<string>>(new Set());
function toggleUncertaintyView(messageId: string) {
const newSet = new Set(uncertaintyViewMessageIds);
if (newSet.has(messageId)) {
newSet.delete(messageId);
} else {
newSet.add(messageId);
}
uncertaintyViewMessageIds = newSet;
}
function isUncertaintyViewEnabled(messageId: string): boolean {
return uncertaintyViewMessageIds.has(messageId);
}
function formatTimestamp(timestamp: number): string {
return new Date(timestamp).toLocaleTimeString('en-US', {
hour12: false,
@@ -330,6 +349,10 @@ function isThinkingExpanded(messageId: string): boolean {
{:else}
<!-- Assistant message styling -->
<div class="p-3 sm:p-4">
{#if message.prefillProgress}
<!-- Prefill progress bar -->
<PrefillProgressBar progress={message.prefillProgress} class="mb-3" />
{/if}
{#if message.thinking && message.thinking.trim().length > 0}
<div class="mb-3 rounded border border-exo-yellow/20 bg-exo-black/40">
<button
@@ -366,7 +389,13 @@ function isThinkingExpanded(messageId: string): boolean {
</div>
{/if}
<div class="text-xs text-foreground">
<MarkdownContent content={message.content || (loading ? response : '')} />
{#if message.role === 'assistant' && isUncertaintyViewEnabled(message.id) && message.tokens && message.tokens.length > 0}
<!-- Uncertainty heatmap view -->
<TokenHeatmap tokens={message.tokens} />
{:else}
<!-- Normal markdown view -->
<MarkdownContent content={message.content || (loading ? response : '')} />
{/if}
{#if loading && !message.content}
<span class="inline-block w-2 h-4 bg-exo-yellow/70 ml-1 cursor-blink"></span>
{/if}
@@ -419,6 +448,19 @@ function isThinkingExpanded(messageId: string): boolean {
</svg>
</button>
{/if}
<!-- Uncertainty view toggle (assistant messages with tokens only) -->
{#if message.role === 'assistant' && message.tokens && message.tokens.length > 0}
<button
onclick={() => toggleUncertaintyView(message.id)}
class="p-1.5 transition-colors rounded cursor-pointer {isUncertaintyViewEnabled(message.id) ? 'text-exo-yellow' : 'text-exo-light-gray hover:text-exo-yellow'}"
title={isUncertaintyViewEnabled(message.id) ? 'Hide uncertainty' : 'Show uncertainty'}
>
<svg class="w-3.5 h-3.5" fill="none" viewBox="0 0 24 24" stroke="currentColor">
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 19v-6a2 2 0 00-2-2H5a2 2 0 00-2 2v6a2 2 0 002 2h2a2 2 0 002-2zm0 0V9a2 2 0 012-2h2a2 2 0 012 2v10m-6 0a2 2 0 002 2h2a2 2 0 002-2m0 0V5a2 2 0 012-2h2a2 2 0 012 2v14a2 2 0 01-2 2h-2a2 2 0 01-2-2z" />
</svg>
</button>
{/if}
<!-- Delete button -->
<button

View File

@@ -0,0 +1,67 @@
<script lang="ts">
import type { PrefillProgress } from '$lib/stores/app.svelte';
interface Props {
progress: PrefillProgress;
class?: string;
}
let { progress, class: className = '' }: Props = $props();
const percentage = $derived(
progress.total > 0 ? Math.round((progress.processed / progress.total) * 100) : 0
);
function formatTokenCount(count: number): string {
if (count >= 1000) {
return `${(count / 1000).toFixed(1)}k`;
}
return count.toString();
}
</script>
<div class="prefill-progress {className}">
<div class="flex items-center justify-between text-xs text-gray-400 mb-1">
<span class="flex items-center gap-1.5">
<svg
class="w-3.5 h-3.5 animate-spin"
fill="none"
viewBox="0 0 24 24"
xmlns="http://www.w3.org/2000/svg"
>
<circle
class="opacity-25"
cx="12"
cy="12"
r="10"
stroke="currentColor"
stroke-width="4"
></circle>
<path
class="opacity-75"
fill="currentColor"
d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"
></path>
</svg>
<span>Processing prompt</span>
</span>
<span class="font-mono">
{formatTokenCount(progress.processed)} / {formatTokenCount(progress.total)} tokens
</span>
</div>
<div class="h-1.5 bg-gray-700 rounded-full overflow-hidden">
<div
class="h-full bg-blue-500 rounded-full transition-all duration-150 ease-out"
style="width: {percentage}%"
></div>
</div>
<div class="text-right text-xs text-gray-500 mt-0.5">
{percentage}%
</div>
</div>
<style>
.prefill-progress {
width: 100%;
}
</style>

View File

@@ -0,0 +1,121 @@
<script lang="ts">
import type { TokenData } from '$lib/stores/app.svelte';
interface Props {
tokens: TokenData[];
class?: string;
}
let { tokens, class: className = '' }: Props = $props();
// Tooltip state
let hoveredToken = $state<{ token: TokenData; x: number; y: number } | null>(null);
/**
* Get confidence level based on probability
* High: >0.8 (logprob > -0.22)
* Medium: 0.5-0.8 (logprob -0.69 to -0.22)
* Low: 0.2-0.5 (logprob -1.61 to -0.69)
* Very Low: <0.2 (logprob < -1.61)
*/
function getConfidenceClass(probability: number): string {
if (probability > 0.8) return 'bg-green-500/30 text-green-100';
if (probability > 0.5) return 'bg-yellow-500/30 text-yellow-100';
if (probability > 0.2) return 'bg-orange-500/30 text-orange-100';
return 'bg-red-500/40 text-red-100';
}
/**
* Get border color for token based on probability
*/
function getBorderClass(probability: number): string {
if (probability > 0.8) return 'border-green-500/50';
if (probability > 0.5) return 'border-yellow-500/50';
if (probability > 0.2) return 'border-orange-500/50';
return 'border-red-500/50';
}
function handleMouseEnter(event: MouseEvent, token: TokenData) {
const rect = (event.target as HTMLElement).getBoundingClientRect();
hoveredToken = {
token,
x: rect.left + rect.width / 2,
y: rect.top - 10
};
}
function handleMouseLeave() {
hoveredToken = null;
}
function formatProbability(prob: number): string {
return (prob * 100).toFixed(1) + '%';
}
function formatLogprob(logprob: number): string {
return logprob.toFixed(3);
}
</script>
<div class="token-heatmap leading-relaxed {className}">
{#each tokens as tokenData, i (i)}
<span
role="button"
tabindex="0"
class="token-span inline rounded px-0.5 py-0.5 cursor-pointer transition-all duration-150 border {getConfidenceClass(tokenData.probability)} {getBorderClass(tokenData.probability)} hover:opacity-80"
onmouseenter={(e) => handleMouseEnter(e, tokenData)}
onmouseleave={handleMouseLeave}
>{tokenData.token}</span>
{/each}
</div>
<!-- Tooltip -->
{#if hoveredToken}
<div
class="fixed z-50 pointer-events-none"
style="left: {hoveredToken.x}px; top: {hoveredToken.y}px; transform: translate(-50%, -100%);"
>
<div class="bg-gray-900 border border-gray-700 rounded-lg shadow-xl p-3 text-sm min-w-48">
<!-- Token info -->
<div class="mb-2">
<span class="text-gray-400 text-xs">Token:</span>
<span class="text-white font-mono ml-1">"{hoveredToken.token.token}"</span>
<span class="text-green-400 ml-2">{formatProbability(hoveredToken.token.probability)}</span>
</div>
<div class="text-gray-400 text-xs mb-1">
logprob: <span class="text-gray-300 font-mono">{formatLogprob(hoveredToken.token.logprob)}</span>
</div>
<!-- Top alternatives -->
{#if hoveredToken.token.topLogprobs.length > 0}
<div class="border-t border-gray-700 mt-2 pt-2">
<div class="text-gray-400 text-xs mb-1">Alternatives:</div>
{#each hoveredToken.token.topLogprobs.slice(0, 5) as alt, idx (idx)}
{@const altProb = Math.exp(alt.logprob)}
<div class="flex justify-between items-center text-xs py-0.5">
<span class="text-gray-300 font-mono truncate max-w-24">"{alt.token}"</span>
<span class="text-gray-400 ml-2">{formatProbability(altProb)}</span>
</div>
{/each}
</div>
{/if}
</div>
<!-- Arrow -->
<div class="absolute left-1/2 -translate-x-1/2 top-full">
<div class="border-8 border-transparent border-t-gray-900"></div>
</div>
</div>
{/if}
<style>
.token-heatmap {
word-wrap: break-word;
white-space: pre-wrap;
}
.token-span {
margin: 0;
border-width: 1px;
}
</style>

View File

@@ -182,6 +182,26 @@ export interface MessageAttachment {
mimeType?: string;
}
// Token-level data for uncertainty visualization
export interface TopLogprob {
token: string;
logprob: number;
bytes?: number[];
}
export interface TokenData {
token: string;
logprob: number;
probability: number; // exp(logprob)
topLogprobs: TopLogprob[];
}
// Prefill progress data for long prompts
export interface PrefillProgress {
processed: number;
total: number;
}
export interface Message {
id: string;
role: "user" | "assistant" | "system";
@@ -191,6 +211,8 @@ export interface Message {
attachments?: MessageAttachment[];
ttftMs?: number; // Time to first token in ms (for assistant messages)
tps?: number; // Tokens per second (for assistant messages)
tokens?: TokenData[]; // Token-level data for uncertainty visualization
prefillProgress?: PrefillProgress | null; // Prefill progress for long prompts
}
export interface Conversation {
@@ -1107,6 +1129,8 @@ class AppStore {
model: modelToUse,
messages: apiMessages,
stream: true,
logprobs: true,
top_logprobs: 5,
}),
});
@@ -1408,6 +1432,8 @@ class AppStore {
messages: apiMessages,
temperature: 0.7,
stream: true,
logprobs: true,
top_logprobs: 5,
}),
});
@@ -1424,6 +1450,8 @@ class AppStore {
const decoder = new TextDecoder();
let fullContent = "";
let buffer = "";
const collectedTokens: TokenData[] = [];
let currentEventType = ""; // Track SSE event type
while (true) {
const { done, value } = await reader.read();
@@ -1437,14 +1465,43 @@ class AppStore {
for (const line of lines) {
const trimmed = line.trim();
if (!trimmed) continue;
if (!trimmed) {
// Empty line resets event type
currentEventType = "";
continue;
}
// Handle event type declaration
if (trimmed.startsWith("event: ")) {
currentEventType = trimmed.slice(7);
continue;
}
if (trimmed.startsWith("data: ")) {
const data = trimmed.slice(6);
if (data === "[DONE]") continue;
if (data === "[DONE]") {
currentEventType = "";
continue;
}
try {
const parsed = JSON.parse(data);
// Handle prefill progress events
if (currentEventType === "prefill_progress") {
const idx = this.messages.findIndex(
(m) => m.id === assistantMessage.id,
);
if (idx !== -1) {
this.messages[idx].prefillProgress = {
processed: parsed.processed,
total: parsed.total,
};
}
continue;
}
// Handle regular token data
const tokenContent = parsed.choices?.[0]?.delta?.content;
if (tokenContent) {
// Track first token for TTFT
@@ -1453,6 +1510,14 @@ class AppStore {
this.ttftMs = firstTokenTime - requestStartTime;
}
// Clear prefill progress when first token arrives
const msgIdx = this.messages.findIndex(
(m) => m.id === assistantMessage.id,
);
if (msgIdx !== -1 && this.messages[msgIdx].prefillProgress) {
this.messages[msgIdx].prefillProgress = null;
}
// Count tokens (each SSE chunk is typically one token)
tokenCount += 1;
this.totalTokens = tokenCount;
@@ -1463,6 +1528,25 @@ class AppStore {
this.tps = (tokenCount / elapsed) * 1000;
}
// Extract logprobs for uncertainty visualization
const logprobsData = parsed.choices?.[0]?.logprobs;
if (logprobsData?.content?.[0]) {
const logprobItem = logprobsData.content[0];
const tokenData: TokenData = {
token: logprobItem.token || tokenContent,
logprob: logprobItem.logprob ?? 0,
probability: Math.exp(logprobItem.logprob ?? 0),
topLogprobs: (logprobItem.top_logprobs || []).map(
(item: { token: string; logprob: number; bytes?: number[] }) => ({
token: item.token,
logprob: item.logprob,
bytes: item.bytes,
}),
),
};
collectedTokens.push(tokenData);
}
fullContent += tokenContent;
// Strip thinking tags for display and extract thinking content
@@ -1477,6 +1561,8 @@ class AppStore {
if (idx !== -1) {
this.messages[idx].content = displayContent;
this.messages[idx].thinking = thinkingContent || undefined;
// Update tokens during streaming for real-time visualization
this.messages[idx].tokens = [...collectedTokens];
}
this.persistActiveConversation();
}
@@ -1524,6 +1610,10 @@ class AppStore {
if (this.tps !== null) {
this.messages[idx].tps = this.tps;
}
// Store token data for uncertainty visualization
if (collectedTokens.length > 0) {
this.messages[idx].tokens = collectedTokens;
}
}
this.persistActiveConversation();
} catch (error) {

View File

@@ -400,10 +400,8 @@ function toggleInstanceDownloadDetails(nodeId: string): void {
const errorText = await response.text();
console.error('Failed to launch instance:', errorText);
} else {
// Auto-select the launched model only if no model is currently selected
if (!selectedChatModel()) {
setSelectedChatModel(modelId);
}
// Always auto-select the newly launched model so the user chats to what they just launched
setSelectedChatModel(modelId);
// Scroll to the bottom of instances container to show the new instance
// Use multiple attempts to ensure DOM has updated with the new instance
@@ -763,6 +761,10 @@ function toggleInstanceDownloadDetails(nodeId: string): void {
async function deleteInstance(instanceId: string) {
if (!confirm(`Delete instance ${instanceId.slice(0, 8)}...?`)) return;
// Get the model ID of the instance being deleted before we delete it
const deletedInstanceModelId = getInstanceModelId(instanceData[instanceId]);
const wasSelected = selectedChatModel() === deletedInstanceModelId;
try {
const response = await fetch(`/instance/${instanceId}`, {
method: 'DELETE',
@@ -771,6 +773,24 @@ function toggleInstanceDownloadDetails(nodeId: string): void {
if (!response.ok) {
console.error('Failed to delete instance:', response.status);
} else if (wasSelected) {
// If we deleted the currently selected model, switch to another available model
// Find another instance that isn't the one we just deleted
const remainingInstances = Object.entries(instanceData).filter(([id]) => id !== instanceId);
if (remainingInstances.length > 0) {
// Select the last instance (most recently added, since objects preserve insertion order)
const [, lastInstance] = remainingInstances[remainingInstances.length - 1];
const newModelId = getInstanceModelId(lastInstance);
if (newModelId && newModelId !== 'Unknown' && newModelId !== 'Unknown Model') {
setSelectedChatModel(newModelId);
} else {
// Clear selection if no valid model found
setSelectedChatModel('');
}
} else {
// No more instances, clear the selection
setSelectedChatModel('');
}
}
} catch (error) {
console.error('Error deleting instance:', error);

View File

@@ -1,3 +1,5 @@
export NIX_CONFIG := "extra-experimental-features = nix-command flakes"
fmt:
nix fmt

View File

@@ -81,20 +81,6 @@
config = {
packages = {
# The system_custodian binary
system_custodian = craneLib.buildPackage (
commonArgs
// {
inherit cargoArtifacts;
cargoExtraArgs = "-p system_custodian";
meta = {
description = "System custodian daemon for exo";
mainProgram = "system_custodian";
};
}
);
# Python bindings wheel via maturin
exo_pyo3_bindings = craneLib.buildPackage (
commonArgs

View File

@@ -1,47 +0,0 @@
[package]
name = "system_custodian"
version = { workspace = true }
edition = { workspace = true }
publish = false
[lib]
doctest = false
name = "system_custodian"
path = "src/lib.rs"
[[bin]]
path = "src/bin/main.rs"
name = "system_custodian"
doc = false
[lints]
workspace = true
[dependencies]
# datastructures
either = { workspace = true }
# macro dependencies
extend = { workspace = true }
delegate = { workspace = true }
impl-trait-for-tuples = { workspace = true }
derive_more = { workspace = true }
# async
tokio = { workspace = true, features = ["full"] }
futures = { workspace = true }
futures-timer = { workspace = true }
# utility dependencies
util = { workspace = true }
thiserror = { workspace = true }
#internment = { workspace = true }
#recursion = { workspace = true }
#generativity = { workspace = true }
#itertools = { workspace = true }
tracing-subscriber = { version = "0.3.19", features = ["default", "env-filter"] }
keccak-const = { workspace = true }
# tracing/logging
log = { workspace = true }

View File

@@ -1,4 +0,0 @@
//! TODO: documentation
//!
fn main() {}

View File

@@ -1,69 +0,0 @@
//! This crate defines the logic of, and ways to interact with, Exo's **_System Custodian_** daemon.
//!
//! The **_System Custodian_** daemon is supposed to be a long-living process that precedes the
//! launch of the Exo application, and responsible for ensuring the system (configuration, settings,
//! etc.) is in an appropriate state to facilitate the running of Exo application.
//! The **_System Custodian_** daemon shall expose a [D-Bus](https://www.freedesktop.org/wiki/Software/dbus/)
//! service which Exo application use to _control & query_ it.
//!
//! # Lifecycle
//! When the Exo application starts, it will _wake_ the **_System Custodian_** daemon for the
//! duration of its lifetime, and after it has terminated the daemon will go back to sleep. When
//! the daemon wakes up, it will configure the system into a state suitable for the Exo Application;
//! When the daemon goes to sleep, it will revert those changes as much as it can in case they were
//! destructive to the user's pre-existing configurations.
//!
//! # Responsibilities
//! TODO: these are purely on MacOS, but change to be more broad
//! The **_System Custodian_** daemon is responsible for using System Configuration framework to
//! 1. duplicate the current network set
//! 2. modify existing services to turn on IPv6 if not there
//! 3. remove any bridge services & add any missing services that AREN'T bridge
//! TODO: In the future:
//! 1. run a dummy AWDL service to [allow for macOS peer-to-peer wireless networking](https://yggdrasil-network.github.io/2019/08/19/awdl.html)
//! 2. toggle some GPU/memory configurations to speed up GPU (ask Alex what those configurations are)
//! 3. if we ever decide to provide our **own network interfaces** that abstract over some userland
//! logic, this would be the place to spin that up.
//!
//! Then it will watch the SCDynamicStore for:
//! 1. all __actual__ network interfaces -> collect information on them e.g. their BSD name, MAC
//! address, MTU, IPv6 addresses, etc. -> and set up watchers/notifiers to inform the DBus
//! interface of any changes
//! 2. watch for any __undesirable__ changes to configuration and revert it
//!
//! It should somehow (probably through system sockets and/or BSD interface) trigger IPv6 NDP on
//! each of the interfaces & also listen to/query for any changes on the OS routing cache??
//! Basically emulate the `ping6 ff02::1%enX` and `ndp -an` commands BUT BETTER!!!
//! 1. all that info should coalesce back to the overall state colleted -> should be queryable
//! over D-Bus
//! TODO:
//! 1. we might potentially add to this step a handshake of some kind...? To ensure that we can
//! ACTUALLY communicate with that machine over that link over e.g. TCP, UDP, etc. Will the
//! handshake require to know Node ID? Will the handshake require heartbeats? Who knows...
//! 2. if we ever decide to write proprietary L2/L3 protocols for quicker communication,
//! e.g. [AF_NDRV](https://www.zerotier.com/blog/how-zerotier-eliminated-kernel-extensions-on-macos/)
//! for raw ethernet frame communication, or even a [custom thunderbolt PCIe driver](https://developer.apple.com/documentation/pcidriverkit/creating-custom-pcie-drivers-for-thunderbolt-devices),
//! then this would be the place to carry out discovery and propper handshakes with devices
//! on the other end of the link.
//!
// enable Rust-unstable features for convenience
#![feature(trait_alias)]
#![feature(stmt_expr_attributes)]
#![feature(type_alias_impl_trait)]
#![feature(specialization)]
#![feature(unboxed_closures)]
#![feature(const_trait_impl)]
#![feature(fn_traits)]
pub(crate) mod private {
// sealed traits support
pub trait Sealed {}
impl<T: ?Sized> Sealed for T {}
}
/// Namespace for all the type/trait aliases used by this crate.
pub(crate) mod alias {}
/// Namespace for crate-wide extension traits/methods
pub(crate) mod ext {}

View File

@@ -0,0 +1 @@
"""API adapters for different API formats (Claude, OpenAI Responses, etc.)."""

View File

@@ -0,0 +1,184 @@
"""Claude Messages API adapter for converting requests/responses."""
from collections.abc import AsyncGenerator
from exo.shared.types.api import (
ChatCompletionChoice,
ChatCompletionMessage,
ChatCompletionResponse,
FinishReason,
)
from exo.shared.types.chunks import TokenChunk
from exo.shared.types.claude_api import (
ClaudeContentBlockDeltaEvent,
ClaudeContentBlockStartEvent,
ClaudeContentBlockStopEvent,
ClaudeMessageDelta,
ClaudeMessageDeltaEvent,
ClaudeMessageDeltaUsage,
ClaudeMessagesRequest,
ClaudeMessagesResponse,
ClaudeMessageStart,
ClaudeMessageStartEvent,
ClaudeMessageStopEvent,
ClaudeStopReason,
ClaudeTextBlock,
ClaudeTextDelta,
ClaudeUsage,
)
from exo.shared.types.common import CommandId
from exo.shared.types.tasks import ChatCompletionTaskParams
def finish_reason_to_claude_stop_reason(
finish_reason: FinishReason | None,
) -> ClaudeStopReason | None:
"""Map OpenAI finish_reason to Claude stop_reason."""
if finish_reason is None:
return None
mapping: dict[FinishReason, ClaudeStopReason] = {
"stop": "end_turn",
"length": "max_tokens",
"tool_calls": "tool_use",
"content_filter": "end_turn",
"function_call": "tool_use",
}
return mapping.get(finish_reason, "end_turn")
def claude_request_to_chat_params(
request: ClaudeMessagesRequest,
) -> ChatCompletionTaskParams:
"""Convert Claude Messages API request to internal ChatCompletionTaskParams."""
messages: list[ChatCompletionMessage] = []
# Add system message if present
if request.system:
if isinstance(request.system, str):
messages.append(
ChatCompletionMessage(role="system", content=request.system)
)
else:
# List of text blocks
system_text = "".join(block.text for block in request.system)
messages.append(ChatCompletionMessage(role="system", content=system_text))
# Convert messages
for msg in request.messages:
content: str
if isinstance(msg.content, str):
content = msg.content
else:
# Concatenate text blocks (images not supported for MVP)
text_parts: list[str] = []
for block in msg.content:
if isinstance(block, ClaudeTextBlock):
text_parts.append(block.text)
content = "".join(text_parts)
messages.append(ChatCompletionMessage(role=msg.role, content=content))
return ChatCompletionTaskParams(
model=request.model,
messages=messages,
max_tokens=request.max_tokens,
temperature=request.temperature,
top_p=request.top_p,
top_k=request.top_k,
stop=request.stop_sequences,
stream=request.stream,
)
def chat_response_to_claude_response(
response: ChatCompletionResponse,
) -> ClaudeMessagesResponse:
"""Convert internal ChatCompletionResponse to Claude Messages API response."""
content_text = ""
stop_reason: ClaudeStopReason | None = None
if response.choices:
choice = response.choices[0]
if isinstance(choice, ChatCompletionChoice) and choice.message.content:
content_text = (
choice.message.content
if isinstance(choice.message.content, str)
else str(choice.message.content)
)
stop_reason = finish_reason_to_claude_stop_reason(choice.finish_reason)
# Use actual usage data from response if available
input_tokens = response.usage.prompt_tokens if response.usage else 0
output_tokens = response.usage.completion_tokens if response.usage else 0
return ClaudeMessagesResponse(
id=f"msg_{response.id}",
model=response.model,
content=[ClaudeTextBlock(text=content_text)],
stop_reason=stop_reason,
usage=ClaudeUsage(
input_tokens=input_tokens,
output_tokens=output_tokens,
),
)
async def generate_claude_stream(
command_id: CommandId,
model: str,
chunk_stream: AsyncGenerator[TokenChunk, None],
) -> AsyncGenerator[str, None]:
"""Generate Claude Messages API streaming events from TokenChunks."""
# Initial message_start event
initial_message = ClaudeMessageStart(
id=f"msg_{command_id}",
model=model,
content=[],
stop_reason=None,
usage=ClaudeUsage(input_tokens=0, output_tokens=0),
)
start_event = ClaudeMessageStartEvent(message=initial_message)
yield f"event: message_start\ndata: {start_event.model_dump_json()}\n\n"
# content_block_start
block_start = ClaudeContentBlockStartEvent(
index=0, content_block=ClaudeTextBlock(text="")
)
yield f"event: content_block_start\ndata: {block_start.model_dump_json()}\n\n"
output_tokens = 0
stop_reason: ClaudeStopReason | None = None
last_stats = None
async for chunk in chunk_stream:
output_tokens += 1 # Count each chunk as one token
last_stats = chunk.stats or last_stats
# content_block_delta
delta_event = ClaudeContentBlockDeltaEvent(
index=0,
delta=ClaudeTextDelta(text=chunk.text),
)
yield f"event: content_block_delta\ndata: {delta_event.model_dump_json()}\n\n"
if chunk.finish_reason is not None:
stop_reason = finish_reason_to_claude_stop_reason(chunk.finish_reason)
# Use actual token count from stats if available
if last_stats is not None:
output_tokens = last_stats.generation_tokens
# content_block_stop
block_stop = ClaudeContentBlockStopEvent(index=0)
yield f"event: content_block_stop\ndata: {block_stop.model_dump_json()}\n\n"
# message_delta
message_delta = ClaudeMessageDeltaEvent(
delta=ClaudeMessageDelta(stop_reason=stop_reason),
usage=ClaudeMessageDeltaUsage(output_tokens=output_tokens),
)
yield f"event: message_delta\ndata: {message_delta.model_dump_json()}\n\n"
# message_stop
message_stop = ClaudeMessageStopEvent()
yield f"event: message_stop\ndata: {message_stop.model_dump_json()}\n\n"

View File

@@ -0,0 +1,199 @@
"""OpenAI Responses API adapter for converting requests/responses."""
from collections.abc import AsyncGenerator
from exo.shared.types.api import (
ChatCompletionChoice,
ChatCompletionMessage,
ChatCompletionResponse,
)
from exo.shared.types.chunks import TokenChunk
from exo.shared.types.common import CommandId
from exo.shared.types.openai_responses import (
ResponseCompletedEvent,
ResponseContentPartAddedEvent,
ResponseContentPartDoneEvent,
ResponseCreatedEvent,
ResponseInProgressEvent,
ResponseMessageItem,
ResponseOutputItemAddedEvent,
ResponseOutputItemDoneEvent,
ResponseOutputText,
ResponsesRequest,
ResponsesResponse,
ResponseTextDeltaEvent,
ResponseTextDoneEvent,
ResponseUsage,
)
from exo.shared.types.tasks import ChatCompletionTaskParams
def responses_request_to_chat_params(
request: ResponsesRequest,
) -> ChatCompletionTaskParams:
"""Convert OpenAI Responses API request to internal ChatCompletionTaskParams."""
messages: list[ChatCompletionMessage] = []
# Add instructions as system message if present
if request.instructions:
messages.append(
ChatCompletionMessage(role="system", content=request.instructions)
)
# Convert input to messages
if isinstance(request.input, str):
messages.append(ChatCompletionMessage(role="user", content=request.input))
else:
for msg in request.input:
messages.append(
ChatCompletionMessage(
role=msg.role,
content=msg.content,
)
)
return ChatCompletionTaskParams(
model=request.model,
messages=messages,
max_tokens=request.max_output_tokens,
temperature=request.temperature,
top_p=request.top_p,
stream=request.stream,
)
def chat_response_to_responses_response(
response: ChatCompletionResponse,
) -> ResponsesResponse:
"""Convert internal ChatCompletionResponse to OpenAI Responses API response."""
output_text = ""
if response.choices:
choice = response.choices[0]
if isinstance(choice, ChatCompletionChoice) and choice.message.content:
output_text = (
choice.message.content
if isinstance(choice.message.content, str)
else str(choice.message.content)
)
item_id = f"item_{response.id}"
output_item = ResponseMessageItem(
id=item_id,
content=[ResponseOutputText(text=output_text)],
)
usage = None
if response.usage:
usage = ResponseUsage(
input_tokens=response.usage.prompt_tokens,
output_tokens=response.usage.completion_tokens,
total_tokens=response.usage.total_tokens,
)
return ResponsesResponse(
id=f"resp_{response.id}",
model=response.model,
output=[output_item],
output_text=output_text,
usage=usage,
)
async def generate_responses_stream(
command_id: CommandId,
model: str,
chunk_stream: AsyncGenerator[TokenChunk, None],
) -> AsyncGenerator[str, None]:
"""Generate OpenAI Responses API streaming events from TokenChunks."""
response_id = f"resp_{command_id}"
item_id = f"item_{command_id}"
# response.created
initial_response = ResponsesResponse(
id=response_id,
model=model,
status="in_progress",
output=[],
output_text="",
)
created_event = ResponseCreatedEvent(response=initial_response)
yield f"event: response.created\ndata: {created_event.model_dump_json()}\n\n"
# response.in_progress
in_progress_event = ResponseInProgressEvent(response=initial_response)
yield f"event: response.in_progress\ndata: {in_progress_event.model_dump_json()}\n\n"
# response.output_item.added
initial_item = ResponseMessageItem(
id=item_id,
content=[ResponseOutputText(text="")],
status="in_progress",
)
item_added = ResponseOutputItemAddedEvent(output_index=0, item=initial_item)
yield f"event: response.output_item.added\ndata: {item_added.model_dump_json()}\n\n"
# response.content_part.added
initial_part = ResponseOutputText(text="")
part_added = ResponseContentPartAddedEvent(
output_index=0, content_index=0, part=initial_part
)
yield f"event: response.content_part.added\ndata: {part_added.model_dump_json()}\n\n"
accumulated_text = ""
last_stats = None
async for chunk in chunk_stream:
accumulated_text += chunk.text
last_stats = chunk.stats or last_stats
# response.output_text.delta
delta_event = ResponseTextDeltaEvent(
output_index=0,
content_index=0,
delta=chunk.text,
)
yield f"event: response.output_text.delta\ndata: {delta_event.model_dump_json()}\n\n"
# response.output_text.done
text_done = ResponseTextDoneEvent(
output_index=0, content_index=0, text=accumulated_text
)
yield f"event: response.output_text.done\ndata: {text_done.model_dump_json()}\n\n"
# response.content_part.done
final_part = ResponseOutputText(text=accumulated_text)
part_done = ResponseContentPartDoneEvent(
output_index=0, content_index=0, part=final_part
)
yield f"event: response.content_part.done\ndata: {part_done.model_dump_json()}\n\n"
# response.output_item.done
final_item = ResponseMessageItem(
id=item_id,
content=[ResponseOutputText(text=accumulated_text)],
status="completed",
)
item_done = ResponseOutputItemDoneEvent(output_index=0, item=final_item)
yield f"event: response.output_item.done\ndata: {item_done.model_dump_json()}\n\n"
# Create usage from stats if available
usage = None
if last_stats is not None:
usage = ResponseUsage(
input_tokens=last_stats.prompt_tokens,
output_tokens=last_stats.generation_tokens,
total_tokens=last_stats.prompt_tokens + last_stats.generation_tokens,
)
# response.completed
final_response = ResponsesResponse(
id=response_id,
model=model,
status="completed",
output=[final_item],
output_text=accumulated_text,
usage=usage,
)
completed_event = ResponseCompletedEvent(response=final_response)
yield f"event: response.completed\ndata: {completed_event.model_dump_json()}\n\n"

View File

@@ -1,5 +1,6 @@
import time
from collections.abc import AsyncGenerator
from dataclasses import dataclass
from typing import cast
import anyio
@@ -13,13 +14,17 @@ from hypercorn.asyncio import serve # pyright: ignore[reportUnknownVariableType
from hypercorn.config import Config
from hypercorn.typing import ASGIFramework
from loguru import logger
from openai_harmony import ( # pyright: ignore[reportMissingTypeStubs]
HarmonyEncodingName,
Role,
StreamableParser,
load_harmony_encoding,
)
from exo.master.adapters.claude import (
chat_response_to_claude_response,
claude_request_to_chat_params,
generate_claude_stream,
)
from exo.master.adapters.responses import (
chat_response_to_responses_response,
generate_responses_stream,
responses_request_to_chat_params,
)
from exo.master.placement import place_instance as get_instance_placements
from exo.shared.apply import apply
from exo.shared.election import ElectionMessage
@@ -37,6 +42,8 @@ from exo.shared.types.api import (
DeleteInstanceResponse,
FinishReason,
GenerationStats,
Logprobs,
LogprobsContentItem,
ModelList,
ModelListModel,
PlaceInstanceParams,
@@ -45,6 +52,10 @@ from exo.shared.types.api import (
StreamingChoiceResponse,
)
from exo.shared.types.chunks import TokenChunk
from exo.shared.types.claude_api import (
ClaudeMessagesRequest,
ClaudeMessagesResponse,
)
from exo.shared.types.commands import (
ChatCompletion,
Command,
@@ -55,9 +66,19 @@ from exo.shared.types.commands import (
TaskFinished,
)
from exo.shared.types.common import CommandId, NodeId, SessionId
from exo.shared.types.events import ChunkGenerated, Event, ForwarderEvent, IndexedEvent
from exo.shared.types.events import (
ChunkGenerated,
Event,
ForwarderEvent,
IndexedEvent,
PrefillProgress,
)
from exo.shared.types.memory import Memory
from exo.shared.types.models import ModelId, ModelMetadata
from exo.shared.types.openai_responses import (
ResponsesRequest,
ResponsesResponse,
)
from exo.shared.types.state import State
from exo.shared.types.tasks import ChatCompletionTaskParams
from exo.shared.types.worker.instances import Instance, InstanceId, InstanceMeta
@@ -67,12 +88,36 @@ from exo.utils.channels import Receiver, Sender, channel
from exo.utils.dashboard_path import find_dashboard
from exo.utils.event_buffer import OrderedBuffer
encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
@dataclass
class PrefillProgressData:
"""Data class for prefill progress events."""
processed_tokens: int
total_tokens: int
# Union type for stream events
StreamEvent = TokenChunk | PrefillProgressData
def chunk_to_response(
chunk: TokenChunk, command_id: CommandId
) -> ChatCompletionResponse:
# Build logprobs if available
logprobs: Logprobs | None = None
if chunk.logprob is not None:
logprobs = Logprobs(
content=[
LogprobsContentItem(
token=chunk.text,
logprob=chunk.logprob,
bytes=list(chunk.text.encode("utf-8")),
top_logprobs=chunk.top_logprobs or [],
)
]
)
return ChatCompletionResponse(
id=command_id,
created=int(time.time()),
@@ -81,6 +126,7 @@ def chunk_to_response(
StreamingChoiceResponse(
index=0,
delta=ChatCompletionMessage(role="assistant", content=chunk.text),
logprobs=logprobs,
finish_reason=chunk.finish_reason,
)
],
@@ -135,7 +181,7 @@ class API:
name="dashboard",
)
self._chat_completion_queues: dict[CommandId, Sender[TokenChunk]] = {}
self._chat_completion_queues: dict[CommandId, Sender[StreamEvent]] = {}
self._tg: TaskGroup | None = None
def reset(self, new_session_id: SessionId, result_clock: int):
@@ -176,6 +222,8 @@ class API:
self.chat_completions
)
self.app.post("/bench/chat/completions")(self.bench_chat_completions)
self.app.post("/v1/messages", response_model=None)(self.claude_messages)
self.app.post("/v1/responses", response_model=None)(self.openai_responses)
self.app.get("/state")(lambda: self.state)
self.app.get("/events")(lambda: self._event_log)
@@ -381,52 +429,19 @@ class API:
instance_id=instance_id,
)
async def _process_gpt_oss(self, token_chunks: Receiver[TokenChunk]):
stream = StreamableParser(encoding, role=Role.ASSISTANT)
thinking = False
async for chunk in token_chunks:
stream.process(chunk.token_id)
delta = stream.last_content_delta
ch = stream.current_channel
if ch == "analysis" and not thinking:
thinking = True
yield chunk.model_copy(update={"text": "<think>"})
if ch != "analysis" and thinking:
thinking = False
yield chunk.model_copy(update={"text": "</think>"})
if delta:
yield chunk.model_copy(update={"text": delta})
if chunk.finish_reason is not None:
if thinking:
yield chunk.model_copy(update={"text": "</think>"})
yield chunk
break
async def _chat_chunk_stream(
self, command_id: CommandId, parse_gpt_oss: bool
) -> AsyncGenerator[TokenChunk, None]:
"""Yield `TokenChunk`s for a given command until completion."""
async def _stream_events(
self, command_id: CommandId
) -> AsyncGenerator[StreamEvent, None]:
"""Yield stream events (TokenChunks or PrefillProgressData) for a command."""
try:
self._chat_completion_queues[command_id], recv = channel[TokenChunk]()
self._chat_completion_queues[command_id], recv = channel[StreamEvent]()
with recv as token_chunks:
if parse_gpt_oss:
async for chunk in self._process_gpt_oss(token_chunks):
yield chunk
if chunk.finish_reason is not None:
break
else:
async for chunk in token_chunks:
yield chunk
if chunk.finish_reason is not None:
break
with recv as events:
async for event in events:
yield event
if isinstance(event, TokenChunk) and event.finish_reason is not None:
break
except anyio.get_cancelled_exc_class():
# TODO: TaskCancelled
@@ -441,24 +456,39 @@ class API:
await self._send(command)
del self._chat_completion_queues[command_id]
async def _chat_chunk_stream(
self, command_id: CommandId
) -> AsyncGenerator[TokenChunk, None]:
"""Yield only TokenChunks, filtering out progress events."""
async for event in self._stream_events(command_id):
if isinstance(event, TokenChunk):
yield event
async def _generate_chat_stream(
self, command_id: CommandId, parse_gpt_oss: bool
self, command_id: CommandId
) -> AsyncGenerator[str, None]:
"""Generate chat completion stream as JSON strings."""
async for chunk in self._chat_chunk_stream(command_id, parse_gpt_oss):
chunk_response: ChatCompletionResponse = chunk_to_response(
chunk, command_id
)
logger.debug(f"chunk_response: {chunk_response}")
async for event in self._stream_events(command_id):
if isinstance(event, PrefillProgressData):
# Send prefill progress as a named SSE event
progress_json = f'{{"processed":{event.processed_tokens},"total":{event.total_tokens}}}'
yield f"event: prefill_progress\ndata: {progress_json}\n\n"
else:
# TokenChunk - regular token generation
chunk_response: ChatCompletionResponse = chunk_to_response(
event, command_id
)
logger.debug(f"chunk_response: {chunk_response}")
yield f"data: {chunk_response.model_dump_json()}\n\n"
yield f"data: {chunk_response.model_dump_json()}\n\n"
if chunk.finish_reason is not None:
yield "data: [DONE]\n\n"
if event.finish_reason is not None:
yield "data: [DONE]\n\n"
async def _collect_chat_completion(
self, command_id: CommandId, parse_gpt_oss: bool
self, command_id: CommandId
) -> ChatCompletionResponse:
"""Collect all token chunks for a chat completion and return a single response."""
@@ -466,7 +496,7 @@ class API:
model: str | None = None
finish_reason: FinishReason | None = None
async for chunk in self._chat_chunk_stream(command_id, parse_gpt_oss):
async for chunk in self._chat_chunk_stream(command_id):
if model is None:
model = chunk.model
@@ -495,7 +525,7 @@ class API:
)
async def _collect_chat_completion_with_stats(
self, command_id: CommandId, parse_gpt_oss: bool
self, command_id: CommandId
) -> BenchChatCompletionResponse:
text_parts: list[str] = []
model: str | None = None
@@ -503,7 +533,7 @@ class API:
stats: GenerationStats | None = None
async for chunk in self._chat_chunk_stream(command_id, parse_gpt_oss):
async for chunk in self._chat_chunk_stream(command_id):
if model is None:
model = chunk.model
@@ -544,8 +574,6 @@ class API:
"""Handle chat completions, supporting both streaming and non-streaming responses."""
model_meta = await resolve_model_meta(payload.model)
payload.model = model_meta.model_id
parse_gpt_oss = "gpt-oss" in model_meta.model_id.lower()
logger.info(f"{parse_gpt_oss=}")
if not any(
instance.shard_assignments.model_id == payload.model
@@ -562,17 +590,16 @@ class API:
await self._send(command)
if payload.stream:
return StreamingResponse(
self._generate_chat_stream(command.command_id, parse_gpt_oss),
self._generate_chat_stream(command.command_id),
media_type="text/event-stream",
)
return await self._collect_chat_completion(command.command_id, parse_gpt_oss)
return await self._collect_chat_completion(command.command_id)
async def bench_chat_completions(
self, payload: BenchChatCompletionTaskParams
) -> BenchChatCompletionResponse:
model_meta = await resolve_model_meta(payload.model)
parse_gpt_oss = "gpt-oss" in model_meta.model_id.lower()
payload.model = model_meta.model_id
if not any(
@@ -589,12 +616,78 @@ class API:
command = ChatCompletion(request_params=payload)
await self._send(command)
response = await self._collect_chat_completion_with_stats(
command.command_id,
parse_gpt_oss,
)
response = await self._collect_chat_completion_with_stats(command.command_id)
return response
async def claude_messages(
self, payload: ClaudeMessagesRequest
) -> ClaudeMessagesResponse | StreamingResponse:
"""Handle Claude Messages API requests."""
chat_params = claude_request_to_chat_params(payload)
model_meta = await resolve_model_meta(chat_params.model)
chat_params.model = model_meta.model_id
if not any(
instance.shard_assignments.model_id == chat_params.model
for instance in self.state.instances.values()
):
await self._trigger_notify_user_to_download_model(chat_params.model)
raise HTTPException(
status_code=404,
detail=f"No instance found for model {chat_params.model}",
)
command = ChatCompletion(request_params=chat_params)
await self._send(command)
if payload.stream:
return StreamingResponse(
generate_claude_stream(
command.command_id,
payload.model,
self._chat_chunk_stream(command.command_id),
),
media_type="text/event-stream",
)
response = await self._collect_chat_completion(command.command_id)
return chat_response_to_claude_response(response)
async def openai_responses(
self, payload: ResponsesRequest
) -> ResponsesResponse | StreamingResponse:
"""Handle OpenAI Responses API requests."""
chat_params = responses_request_to_chat_params(payload)
model_meta = await resolve_model_meta(chat_params.model)
chat_params.model = model_meta.model_id
if not any(
instance.shard_assignments.model_id == chat_params.model
for instance in self.state.instances.values()
):
await self._trigger_notify_user_to_download_model(chat_params.model)
raise HTTPException(
status_code=404,
detail=f"No instance found for model {chat_params.model}",
)
command = ChatCompletion(request_params=chat_params)
await self._send(command)
if payload.stream:
return StreamingResponse(
generate_responses_stream(
command.command_id,
payload.model,
self._chat_chunk_stream(command.command_id),
),
media_type="text/event-stream",
)
response = await self._collect_chat_completion(command.command_id)
return chat_response_to_responses_response(response)
def _calculate_total_available_memory(self) -> Memory:
"""Calculate total available memory across all nodes in bytes."""
total_available = Memory()
@@ -662,6 +755,16 @@ class API:
await self._chat_completion_queues[event.command_id].send(
event.chunk
)
elif (
isinstance(event, PrefillProgress)
and event.command_id in self._chat_completion_queues
):
await self._chat_completion_queues[event.command_id].send(
PrefillProgressData(
processed_tokens=event.processed_tokens,
total_tokens=event.total_tokens,
)
)
async def _pause_on_new_election(self):
with self.election_receiver as ems:

View File

@@ -0,0 +1,392 @@
"""Tests for Claude Messages API conversion functions and types."""
import json
from typing import Any, cast
import pydantic
import pytest
from exo.master.adapters.claude import (
chat_response_to_claude_response,
claude_request_to_chat_params,
finish_reason_to_claude_stop_reason,
)
from exo.shared.types.api import (
ChatCompletionChoice,
ChatCompletionMessage,
ChatCompletionResponse,
Usage,
)
from exo.shared.types.claude_api import (
ClaudeContentBlockDeltaEvent,
ClaudeContentBlockStartEvent,
ClaudeContentBlockStopEvent,
ClaudeMessage,
ClaudeMessageDelta,
ClaudeMessageDeltaEvent,
ClaudeMessageDeltaUsage,
ClaudeMessagesRequest,
ClaudeMessageStart,
ClaudeMessageStartEvent,
ClaudeMessageStopEvent,
ClaudeTextBlock,
ClaudeTextDelta,
ClaudeUsage,
)
class TestFinishReasonToClaudeStopReason:
"""Tests for finish_reason to Claude stop_reason mapping."""
def test_stop_maps_to_end_turn(self):
assert finish_reason_to_claude_stop_reason("stop") == "end_turn"
def test_length_maps_to_max_tokens(self):
assert finish_reason_to_claude_stop_reason("length") == "max_tokens"
def test_tool_calls_maps_to_tool_use(self):
assert finish_reason_to_claude_stop_reason("tool_calls") == "tool_use"
def test_function_call_maps_to_tool_use(self):
assert finish_reason_to_claude_stop_reason("function_call") == "tool_use"
def test_content_filter_maps_to_end_turn(self):
assert finish_reason_to_claude_stop_reason("content_filter") == "end_turn"
def test_none_returns_none(self):
assert finish_reason_to_claude_stop_reason(None) is None
class TestClaudeRequestToChatParams:
"""Tests for converting Claude Messages API requests to ChatCompletionTaskParams."""
def test_basic_request_conversion(self):
request = ClaudeMessagesRequest(
model="claude-3-opus",
max_tokens=100,
messages=[
ClaudeMessage(role="user", content="Hello"),
],
)
params = claude_request_to_chat_params(request)
assert params.model == "claude-3-opus"
assert params.max_tokens == 100
assert len(params.messages) == 1
assert params.messages[0].role == "user"
assert params.messages[0].content == "Hello"
def test_request_with_system_string(self):
request = ClaudeMessagesRequest(
model="claude-3-opus",
max_tokens=100,
system="You are a helpful assistant.",
messages=[
ClaudeMessage(role="user", content="Hello"),
],
)
params = claude_request_to_chat_params(request)
assert len(params.messages) == 2
assert params.messages[0].role == "system"
assert params.messages[0].content == "You are a helpful assistant."
assert params.messages[1].role == "user"
assert params.messages[1].content == "Hello"
def test_request_with_system_text_blocks(self):
request = ClaudeMessagesRequest(
model="claude-3-opus",
max_tokens=100,
system=[
ClaudeTextBlock(text="You are helpful. "),
ClaudeTextBlock(text="Be concise."),
],
messages=[
ClaudeMessage(role="user", content="Hello"),
],
)
params = claude_request_to_chat_params(request)
assert len(params.messages) == 2
assert params.messages[0].role == "system"
assert params.messages[0].content == "You are helpful. Be concise."
def test_request_with_content_blocks(self):
request = ClaudeMessagesRequest(
model="claude-3-opus",
max_tokens=100,
messages=[
ClaudeMessage(
role="user",
content=[
ClaudeTextBlock(text="First part. "),
ClaudeTextBlock(text="Second part."),
],
),
],
)
params = claude_request_to_chat_params(request)
assert len(params.messages) == 1
assert params.messages[0].content == "First part. Second part."
def test_request_with_multi_turn_conversation(self):
request = ClaudeMessagesRequest(
model="claude-3-opus",
max_tokens=100,
messages=[
ClaudeMessage(role="user", content="Hello"),
ClaudeMessage(role="assistant", content="Hi there!"),
ClaudeMessage(role="user", content="How are you?"),
],
)
params = claude_request_to_chat_params(request)
assert len(params.messages) == 3
assert params.messages[0].role == "user"
assert params.messages[1].role == "assistant"
assert params.messages[2].role == "user"
def test_request_with_optional_parameters(self):
request = ClaudeMessagesRequest(
model="claude-3-opus",
max_tokens=100,
messages=[ClaudeMessage(role="user", content="Hello")],
temperature=0.7,
top_p=0.9,
top_k=40,
stop_sequences=["STOP", "END"],
stream=True,
)
params = claude_request_to_chat_params(request)
assert params.temperature == 0.7
assert params.top_p == 0.9
assert params.top_k == 40
assert params.stop == ["STOP", "END"]
assert params.stream is True
class TestChatResponseToClaudeResponse:
"""Tests for converting ChatCompletionResponse to Claude Messages API response."""
def test_basic_response_conversion(self):
response = ChatCompletionResponse(
id="chatcmpl-123",
created=1234567890,
model="llama-3.2-1b",
choices=[
ChatCompletionChoice(
index=0,
message=ChatCompletionMessage(
role="assistant",
content="Hello! How can I help you?",
),
finish_reason="stop",
)
],
usage=Usage(prompt_tokens=10, completion_tokens=7, total_tokens=17),
)
claude_response = chat_response_to_claude_response(response)
assert claude_response.id == "msg_chatcmpl-123"
assert claude_response.model == "llama-3.2-1b"
assert claude_response.role == "assistant"
assert claude_response.type == "message"
assert len(claude_response.content) == 1
assert claude_response.content[0].type == "text"
assert claude_response.content[0].text == "Hello! How can I help you?"
assert claude_response.stop_reason == "end_turn"
assert claude_response.usage.input_tokens == 10
assert claude_response.usage.output_tokens == 7
def test_response_with_length_finish_reason(self):
response = ChatCompletionResponse(
id="chatcmpl-123",
created=1234567890,
model="llama-3.2-1b",
choices=[
ChatCompletionChoice(
index=0,
message=ChatCompletionMessage(
role="assistant", content="Truncated..."
),
finish_reason="length",
)
],
)
claude_response = chat_response_to_claude_response(response)
assert claude_response.stop_reason == "max_tokens"
def test_response_with_empty_content(self):
response = ChatCompletionResponse(
id="chatcmpl-123",
created=1234567890,
model="llama-3.2-1b",
choices=[
ChatCompletionChoice(
index=0,
message=ChatCompletionMessage(role="assistant", content=""),
finish_reason="stop",
)
],
usage=Usage(prompt_tokens=10, completion_tokens=0, total_tokens=10),
)
claude_response = chat_response_to_claude_response(response)
assert claude_response.content[0].text == ""
assert claude_response.usage.output_tokens == 0
def test_response_with_no_choices(self):
response = ChatCompletionResponse(
id="chatcmpl-123",
created=1234567890,
model="llama-3.2-1b",
choices=[],
)
claude_response = chat_response_to_claude_response(response)
assert claude_response.content[0].text == ""
assert claude_response.stop_reason is None
assert claude_response.usage.input_tokens == 0
assert claude_response.usage.output_tokens == 0
def test_response_without_usage(self):
"""Test response conversion when usage data is not available."""
response = ChatCompletionResponse(
id="chatcmpl-123",
created=1234567890,
model="llama-3.2-1b",
choices=[
ChatCompletionChoice(
index=0,
message=ChatCompletionMessage(role="assistant", content="Hello!"),
finish_reason="stop",
)
],
)
claude_response = chat_response_to_claude_response(response)
assert claude_response.content[0].text == "Hello!"
assert claude_response.usage.input_tokens == 0
assert claude_response.usage.output_tokens == 0
class TestClaudeMessagesRequestValidation:
"""Tests for Claude Messages API request validation."""
def test_request_requires_model(self):
with pytest.raises(pydantic.ValidationError):
ClaudeMessagesRequest.model_validate(
{
"max_tokens": 100,
"messages": [{"role": "user", "content": "Hello"}],
}
)
def test_request_requires_max_tokens(self):
with pytest.raises(pydantic.ValidationError):
ClaudeMessagesRequest.model_validate(
{
"model": "claude-3-opus",
"messages": [{"role": "user", "content": "Hello"}],
}
)
def test_request_requires_messages(self):
with pytest.raises(pydantic.ValidationError):
ClaudeMessagesRequest.model_validate(
{
"model": "claude-3-opus",
"max_tokens": 100,
}
)
class TestClaudeStreamingEvents:
"""Tests for Claude Messages API streaming event serialization."""
def test_message_start_event_format(self):
message = ClaudeMessageStart(
id="msg_123",
model="claude-3-opus",
content=[],
stop_reason=None,
usage=ClaudeUsage(input_tokens=10, output_tokens=0),
)
event = ClaudeMessageStartEvent(message=message)
json_str = event.model_dump_json()
parsed = cast(dict[str, Any], json.loads(json_str))
assert parsed["type"] == "message_start"
assert parsed["message"]["id"] == "msg_123"
assert parsed["message"]["type"] == "message"
assert parsed["message"]["role"] == "assistant"
assert parsed["message"]["model"] == "claude-3-opus"
def test_content_block_start_event_format(self):
event = ClaudeContentBlockStartEvent(
index=0,
content_block=ClaudeTextBlock(text=""),
)
json_str = event.model_dump_json()
parsed = cast(dict[str, Any], json.loads(json_str))
assert parsed["type"] == "content_block_start"
assert parsed["index"] == 0
assert parsed["content_block"]["type"] == "text"
assert parsed["content_block"]["text"] == ""
def test_content_block_delta_event_format(self):
event = ClaudeContentBlockDeltaEvent(
index=0,
delta=ClaudeTextDelta(text="Hello"),
)
json_str = event.model_dump_json()
parsed = cast(dict[str, Any], json.loads(json_str))
assert parsed["type"] == "content_block_delta"
assert parsed["index"] == 0
assert parsed["delta"]["type"] == "text_delta"
assert parsed["delta"]["text"] == "Hello"
def test_content_block_stop_event_format(self):
event = ClaudeContentBlockStopEvent(index=0)
json_str = event.model_dump_json()
parsed = cast(dict[str, Any], json.loads(json_str))
assert parsed["type"] == "content_block_stop"
assert parsed["index"] == 0
def test_message_delta_event_format(self):
event = ClaudeMessageDeltaEvent(
delta=ClaudeMessageDelta(stop_reason="end_turn"),
usage=ClaudeMessageDeltaUsage(output_tokens=25),
)
json_str = event.model_dump_json()
parsed = cast(dict[str, Any], json.loads(json_str))
assert parsed["type"] == "message_delta"
assert parsed["delta"]["stop_reason"] == "end_turn"
assert parsed["usage"]["output_tokens"] == 25
def test_message_stop_event_format(self):
event = ClaudeMessageStopEvent()
json_str = event.model_dump_json()
parsed = cast(dict[str, Any], json.loads(json_str))
assert parsed["type"] == "message_stop"
def test_sse_format(self):
"""Test that SSE format is correctly generated."""
event = ClaudeContentBlockDeltaEvent(
index=0,
delta=ClaudeTextDelta(text="Hello"),
)
# Simulate the SSE format used in the streaming generator
sse_line = f"event: content_block_delta\ndata: {event.model_dump_json()}\n\n"
assert sse_line.startswith("event: content_block_delta\n")
assert "data: " in sse_line
assert sse_line.endswith("\n\n")

View File

@@ -0,0 +1,414 @@
"""Tests for OpenAI Responses API conversion functions and types."""
import json
from typing import Any, cast
import pydantic
import pytest
from exo.master.adapters.responses import (
chat_response_to_responses_response,
responses_request_to_chat_params,
)
from exo.shared.types.api import (
ChatCompletionChoice,
ChatCompletionMessage,
ChatCompletionResponse,
Usage,
)
from exo.shared.types.openai_responses import (
ResponseCompletedEvent,
ResponseContentPartAddedEvent,
ResponseCreatedEvent,
ResponseInputMessage,
ResponseMessageItem,
ResponseOutputItemAddedEvent,
ResponseOutputItemDoneEvent,
ResponseOutputText,
ResponsesRequest,
ResponsesResponse,
ResponseTextDeltaEvent,
ResponseTextDoneEvent,
ResponseUsage,
)
class TestResponsesRequestToChatParams:
"""Tests for converting OpenAI Responses API requests to ChatCompletionTaskParams."""
def test_string_input_conversion(self):
request = ResponsesRequest(
model="gpt-4o",
input="Hello, how are you?",
)
params = responses_request_to_chat_params(request)
assert params.model == "gpt-4o"
assert len(params.messages) == 1
assert params.messages[0].role == "user"
assert params.messages[0].content == "Hello, how are you?"
def test_message_array_input_conversion(self):
request = ResponsesRequest(
model="gpt-4o",
input=[
ResponseInputMessage(role="user", content="Hello"),
ResponseInputMessage(role="assistant", content="Hi there!"),
ResponseInputMessage(role="user", content="How are you?"),
],
)
params = responses_request_to_chat_params(request)
assert len(params.messages) == 3
assert params.messages[0].role == "user"
assert params.messages[0].content == "Hello"
assert params.messages[1].role == "assistant"
assert params.messages[1].content == "Hi there!"
assert params.messages[2].role == "user"
assert params.messages[2].content == "How are you?"
def test_request_with_instructions(self):
request = ResponsesRequest(
model="gpt-4o",
input="Hello",
instructions="You are a helpful assistant. Be concise.",
)
params = responses_request_to_chat_params(request)
assert len(params.messages) == 2
assert params.messages[0].role == "system"
assert params.messages[0].content == "You are a helpful assistant. Be concise."
assert params.messages[1].role == "user"
assert params.messages[1].content == "Hello"
def test_request_with_optional_parameters(self):
request = ResponsesRequest(
model="gpt-4o",
input="Hello",
max_output_tokens=500,
temperature=0.8,
top_p=0.95,
stream=True,
)
params = responses_request_to_chat_params(request)
assert params.max_tokens == 500
assert params.temperature == 0.8
assert params.top_p == 0.95
assert params.stream is True
def test_request_with_system_role_in_messages(self):
request = ResponsesRequest(
model="gpt-4o",
input=[
ResponseInputMessage(role="system", content="Be helpful"),
ResponseInputMessage(role="user", content="Hello"),
],
)
params = responses_request_to_chat_params(request)
assert len(params.messages) == 2
assert params.messages[0].role == "system"
assert params.messages[1].role == "user"
def test_request_with_developer_role(self):
request = ResponsesRequest(
model="gpt-4o",
input=[
ResponseInputMessage(role="developer", content="Internal note"),
ResponseInputMessage(role="user", content="Hello"),
],
)
params = responses_request_to_chat_params(request)
assert len(params.messages) == 2
assert params.messages[0].role == "developer"
class TestChatResponseToResponsesResponse:
"""Tests for converting ChatCompletionResponse to OpenAI Responses API response."""
def test_basic_response_conversion(self):
response = ChatCompletionResponse(
id="chatcmpl-123",
created=1234567890,
model="llama-3.2-1b",
choices=[
ChatCompletionChoice(
index=0,
message=ChatCompletionMessage(
role="assistant",
content="Hello! How can I help you?",
),
finish_reason="stop",
)
],
)
responses_response = chat_response_to_responses_response(response)
assert responses_response.id == "resp_chatcmpl-123"
assert responses_response.object == "response"
assert responses_response.model == "llama-3.2-1b"
assert responses_response.status == "completed"
assert responses_response.output_text == "Hello! How can I help you?"
assert len(responses_response.output) == 1
assert responses_response.output[0].type == "message"
assert responses_response.output[0].role == "assistant"
assert len(responses_response.output[0].content) == 1
assert responses_response.output[0].content[0].type == "output_text"
assert (
responses_response.output[0].content[0].text == "Hello! How can I help you?"
)
def test_response_with_usage(self):
response = ChatCompletionResponse(
id="chatcmpl-123",
created=1234567890,
model="llama-3.2-1b",
choices=[
ChatCompletionChoice(
index=0,
message=ChatCompletionMessage(role="assistant", content="Hello!"),
finish_reason="stop",
)
],
usage=Usage(
prompt_tokens=10,
completion_tokens=5,
total_tokens=15,
),
)
responses_response = chat_response_to_responses_response(response)
assert responses_response.usage is not None
assert responses_response.usage.input_tokens == 10
assert responses_response.usage.output_tokens == 5
assert responses_response.usage.total_tokens == 15
def test_response_with_empty_content(self):
response = ChatCompletionResponse(
id="chatcmpl-123",
created=1234567890,
model="llama-3.2-1b",
choices=[
ChatCompletionChoice(
index=0,
message=ChatCompletionMessage(role="assistant", content=""),
finish_reason="stop",
)
],
)
responses_response = chat_response_to_responses_response(response)
assert responses_response.output_text == ""
assert responses_response.output[0].content[0].text == ""
def test_response_with_no_choices(self):
response = ChatCompletionResponse(
id="chatcmpl-123",
created=1234567890,
model="llama-3.2-1b",
choices=[],
)
responses_response = chat_response_to_responses_response(response)
assert responses_response.output_text == ""
def test_response_without_usage(self):
response = ChatCompletionResponse(
id="chatcmpl-123",
created=1234567890,
model="llama-3.2-1b",
choices=[
ChatCompletionChoice(
index=0,
message=ChatCompletionMessage(role="assistant", content="Hello!"),
finish_reason="stop",
)
],
)
responses_response = chat_response_to_responses_response(response)
assert responses_response.usage is None
def test_response_item_id_format(self):
response = ChatCompletionResponse(
id="chatcmpl-abc123",
created=1234567890,
model="llama-3.2-1b",
choices=[
ChatCompletionChoice(
index=0,
message=ChatCompletionMessage(role="assistant", content="Hello!"),
finish_reason="stop",
)
],
)
responses_response = chat_response_to_responses_response(response)
assert responses_response.output[0].id == "item_chatcmpl-abc123"
class TestResponsesRequestValidation:
"""Tests for OpenAI Responses API request validation."""
def test_request_requires_model(self):
with pytest.raises(pydantic.ValidationError):
ResponsesRequest.model_validate(
{
"input": "Hello",
}
)
def test_request_requires_input(self):
with pytest.raises(pydantic.ValidationError):
ResponsesRequest.model_validate(
{
"model": "gpt-4o",
}
)
def test_request_accepts_string_input(self):
request = ResponsesRequest(
model="gpt-4o",
input="Hello",
)
assert request.input == "Hello"
def test_request_accepts_message_array_input(self):
request = ResponsesRequest(
model="gpt-4o",
input=[ResponseInputMessage(role="user", content="Hello")],
)
assert len(request.input) == 1
class TestResponsesStreamingEvents:
"""Tests for OpenAI Responses API streaming event serialization."""
def test_response_created_event_format(self):
response = ResponsesResponse(
id="resp_123",
model="gpt-4o",
status="in_progress",
output=[],
output_text="",
)
event = ResponseCreatedEvent(response=response)
json_str = event.model_dump_json()
parsed = cast(dict[str, Any], json.loads(json_str))
assert parsed["type"] == "response.created"
assert parsed["response"]["id"] == "resp_123"
assert parsed["response"]["object"] == "response"
assert parsed["response"]["status"] == "in_progress"
def test_output_item_added_event_format(self):
item = ResponseMessageItem(
id="item_123",
content=[ResponseOutputText(text="")],
status="in_progress",
)
event = ResponseOutputItemAddedEvent(output_index=0, item=item)
json_str = event.model_dump_json()
parsed = cast(dict[str, Any], json.loads(json_str))
assert parsed["type"] == "response.output_item.added"
assert parsed["output_index"] == 0
assert parsed["item"]["type"] == "message"
assert parsed["item"]["id"] == "item_123"
assert parsed["item"]["role"] == "assistant"
def test_content_part_added_event_format(self):
part = ResponseOutputText(text="")
event = ResponseContentPartAddedEvent(
output_index=0,
content_index=0,
part=part,
)
json_str = event.model_dump_json()
parsed = cast(dict[str, Any], json.loads(json_str))
assert parsed["type"] == "response.content_part.added"
assert parsed["output_index"] == 0
assert parsed["content_index"] == 0
assert parsed["part"]["type"] == "output_text"
def test_text_delta_event_format(self):
event = ResponseTextDeltaEvent(
output_index=0,
content_index=0,
delta="Hello",
)
json_str = event.model_dump_json()
parsed = cast(dict[str, Any], json.loads(json_str))
assert parsed["type"] == "response.output_text.delta"
assert parsed["output_index"] == 0
assert parsed["content_index"] == 0
assert parsed["delta"] == "Hello"
def test_text_done_event_format(self):
event = ResponseTextDoneEvent(
output_index=0,
content_index=0,
text="Hello, world!",
)
json_str = event.model_dump_json()
parsed = cast(dict[str, Any], json.loads(json_str))
assert parsed["type"] == "response.output_text.done"
assert parsed["text"] == "Hello, world!"
def test_output_item_done_event_format(self):
item = ResponseMessageItem(
id="item_123",
content=[ResponseOutputText(text="Hello, world!")],
status="completed",
)
event = ResponseOutputItemDoneEvent(output_index=0, item=item)
json_str = event.model_dump_json()
parsed = cast(dict[str, Any], json.loads(json_str))
assert parsed["type"] == "response.output_item.done"
assert parsed["item"]["status"] == "completed"
assert parsed["item"]["content"][0]["text"] == "Hello, world!"
def test_response_completed_event_format(self):
item = ResponseMessageItem(
id="item_123",
content=[ResponseOutputText(text="Hello!")],
status="completed",
)
response = ResponsesResponse(
id="resp_123",
model="gpt-4o",
status="completed",
output=[item],
output_text="Hello!",
usage=ResponseUsage(input_tokens=10, output_tokens=5, total_tokens=15),
)
event = ResponseCompletedEvent(response=response)
json_str = event.model_dump_json()
parsed = cast(dict[str, Any], json.loads(json_str))
assert parsed["type"] == "response.completed"
assert parsed["response"]["status"] == "completed"
assert parsed["response"]["output_text"] == "Hello!"
assert parsed["response"]["usage"]["total_tokens"] == 15
def test_sse_format(self):
"""Test that SSE format is correctly generated."""
event = ResponseTextDeltaEvent(
output_index=0,
content_index=0,
delta="Hello",
)
# Simulate the SSE format used in the streaming generator
sse_line = (
f"event: response.output_text.delta\ndata: {event.model_dump_json()}\n\n"
)
assert sse_line.startswith("event: response.output_text.delta\n")
assert "data: " in sse_line
assert sse_line.endswith("\n\n")

View File

@@ -16,6 +16,7 @@ from exo.shared.types.events import (
NodeMemoryMeasured,
NodePerformanceMeasured,
NodeTimedOut,
PrefillProgress,
RunnerDeleted,
RunnerStatusUpdated,
TaskAcknowledged,
@@ -40,7 +41,7 @@ def event_apply(event: Event, state: State) -> State:
"""Apply an event to state."""
match event:
case (
TestEvent() | ChunkGenerated() | TaskAcknowledged()
TestEvent() | ChunkGenerated() | TaskAcknowledged() | PrefillProgress()
): # TaskAcknowledged should never be sent by a worker, but I don't mind if it just gets ignored
return state
case InstanceCreated():

View File

@@ -14,32 +14,6 @@ class ModelCard(CamelCaseModel):
MODEL_CARDS: dict[str, ModelCard] = {
# deepseek v3
# "deepseek-v3-0324:4bit": ModelCard(
# short_id="deepseek-v3-0324:4bit",
# model_id="mlx-community/DeepSeek-V3-0324-4bit",
# name="DeepSeek V3 0324 (4-bit)",
# description="""DeepSeek V3 is a large language model trained on the DeepSeek V3 dataset.""",
# tags=[],
# metadata=ModelMetadata(
# model_id=ModelId("mlx-community/DeepSeek-V3-0324-4bit"),
# pretty_name="DeepSeek V3 0324 (4-bit)",
# storage_size=Memory.from_kb(409706307),
# n_layers=61,
# ),
# ),
# "deepseek-v3-0324": ModelCard(
# short_id="deepseek-v3-0324",
# model_id="mlx-community/DeepSeek-v3-0324-8bit",
# name="DeepSeek V3 0324 (8-bit)",
# description="""DeepSeek V3 is a large language model trained on the DeepSeek V3 dataset.""",
# tags=[],
# metadata=ModelMetadata(
# model_id=ModelId("mlx-community/DeepSeek-v3-0324-8bit"),
# pretty_name="DeepSeek V3 0324 (8-bit)",
# storage_size=Memory.from_kb(754706307),
# n_layers=61,
# ),
# ),
"deepseek-v3.1-4bit": ModelCard(
short_id="deepseek-v3.1-4bit",
model_id=ModelId("mlx-community/DeepSeek-V3.1-4bit"),
@@ -70,65 +44,6 @@ MODEL_CARDS: dict[str, ModelCard] = {
supports_tensor=True,
),
),
# "deepseek-v3.2": ModelCard(
# short_id="deepseek-v3.2",
# model_id=ModelId("mlx-community/DeepSeek-V3.2-8bit"),
# name="DeepSeek V3.2 (8-bit)",
# description="""DeepSeek V3.2 is a large language model trained on the DeepSeek V3.2 dataset.""",
# tags=[],
# metadata=ModelMetadata(
# model_id=ModelId("mlx-community/DeepSeek-V3.2-8bit"),
# pretty_name="DeepSeek V3.2 (8-bit)",
# storage_size=Memory.from_kb(754706307),
# n_layers=61,
# hidden_size=7168,
# supports_tensor=True,
# ),
# ),
# "deepseek-v3.2-4bit": ModelCard(
# short_id="deepseek-v3.2-4bit",
# model_id=ModelId("mlx-community/DeepSeek-V3.2-4bit"),
# name="DeepSeek V3.2 (4-bit)",
# description="""DeepSeek V3.2 is a large language model trained on the DeepSeek V3.2 dataset.""",
# tags=[],
# metadata=ModelMetadata(
# model_id=ModelId("mlx-community/DeepSeek-V3.2-4bit"),
# pretty_name="DeepSeek V3.2 (4-bit)",
# storage_size=Memory.from_kb(754706307 // 2), # TODO !!!!!
# n_layers=61,
# hidden_size=7168,
# supports_tensor=True,
# ),
# ),
# deepseek r1
# "deepseek-r1-0528-4bit": ModelCard(
# short_id="deepseek-r1-0528-4bit",
# model_id="mlx-community/DeepSeek-R1-0528-4bit",
# name="DeepSeek-R1-0528 (4-bit)",
# description="""DeepSeek R1 is a large language model trained on the DeepSeek R1 dataset.""",
# tags=[],
# metadata=ModelMetadata(
# model_id=ModelId("mlx-community/DeepSeek-R1-0528-4bit"),
# pretty_name="DeepSeek R1 671B (4-bit)",
# storage_size=Memory.from_kb(409706307),
# n_layers=61,
# hidden_size=7168,
# ),
# ),
# "deepseek-r1-0528": ModelCard(
# short_id="deepseek-r1-0528",
# model_id="mlx-community/DeepSeek-R1-0528-8bit",
# name="DeepSeek-R1-0528 (8-bit)",
# description="""DeepSeek R1 is a large language model trained on the DeepSeek R1 dataset.""",
# tags=[],
# metadata=ModelMetadata(
# model_id=ModelId("mlx-community/DeepSeek-R1-0528-8bit"),
# pretty_name="DeepSeek R1 671B (8-bit)",
# storage_size=Memory.from_bytes(754998771712),
# n_layers=61,
# hidden_size=7168,
# ),
# ),
# kimi k2
"kimi-k2-instruct-4bit": ModelCard(
short_id="kimi-k2-instruct-4bit",
@@ -510,23 +425,24 @@ MODEL_CARDS: dict[str, ModelCard] = {
supports_tensor=True,
),
),
"gpt-oss-20b-4bit": ModelCard(
short_id="gpt-oss-20b-4bit",
model_id=ModelId("mlx-community/gpt-oss-20b-MXFP4-Q4"),
name="GPT-OSS 20B (MXFP4-Q4, MLX)",
description="""OpenAI's GPT-OSS 20B is a medium-sized MoE model for lower-latency and local or specialized use cases; this MLX variant uses MXFP4 4-bit quantization.""",
"gpt-oss-20b-MXFP4-Q8": ModelCard(
short_id="gpt-oss-20b-MXFP4-Q8",
model_id=ModelId("mlx-community/gpt-oss-20b-MXFP4-Q8"),
name="GPT-OSS 20B (MXFP4-Q8, MLX)",
description="""OpenAI's GPT-OSS 20B is a medium-sized MoE model for lower-latency and local or specialized use cases; this variant is a 4-bit MLX conversion for Apple Silicon.""",
tags=[],
metadata=ModelMetadata(
model_id=ModelId("mlx-community/gpt-oss-20b-MXFP4-Q4"),
pretty_name="GPT-OSS 20B (MXFP4-Q4, MLX)",
model_id=ModelId("mlx-community/gpt-oss-20b-MXFP4-Q8"),
pretty_name="GPT-OSS 20B (MXFP4-Q8, MLX)",
storage_size=Memory.from_kb(11_744_051),
n_layers=24,
hidden_size=2880,
supports_tensor=True,
),
),
# Needs to be quantized g32 or g16.
# glm 4.5
"glm-4.5-air-8bit": ModelCard(
# Needs to be quantized g32 or g16 to work with tensor parallel
short_id="glm-4.5-air-8bit",
model_id=ModelId("mlx-community/GLM-4.5-Air-8bit"),
name="GLM 4.5 Air 8bit",
@@ -556,6 +472,7 @@ MODEL_CARDS: dict[str, ModelCard] = {
supports_tensor=True,
),
),
# glm 4.7
"glm-4.7-4bit": ModelCard(
short_id="glm-4.7-4bit",
model_id=ModelId("mlx-community/GLM-4.7-4bit"),
@@ -601,6 +518,7 @@ MODEL_CARDS: dict[str, ModelCard] = {
supports_tensor=True,
),
),
# minimax-m2
"minimax-m2.1-8bit": ModelCard(
short_id="minimax-m2.1-8bit",
model_id=ModelId("mlx-community/MiniMax-M2.1-8bit"),
@@ -631,19 +549,4 @@ MODEL_CARDS: dict[str, ModelCard] = {
supports_tensor=True,
),
),
# "devstral-2-123b-instruct-2512-8bit": ModelCard(
# short_id="devstral-2-123b-instruct-2512-8bit",
# model_id=ModelId("mlx-community/Devstral-2-123B-Instruct-2512-8bit"),
# name="Devstral 2 123B Instruct 2512 (8-bit, MLX)",
# description="""Mistral AI's Devstral 2 123B Instruct (2512) is an agentic coding model.""",
# tags=[],
# metadata=ModelMetadata(
# model_id=ModelId("mlx-community/Devstral-2-123B-Instruct-2512-8bit"),
# pretty_name="Devstral 2 123B Instruct 2512 (8-bit, MLX)",
# storage_size=Memory.from_kb(133_000_000),
# n_layers=88,
# hidden_size=12288,
# supports_tensor=True,
# ),
# ),
}

View File

@@ -146,6 +146,7 @@ class ChatCompletionTaskParams(BaseModel):
stream: bool = False
temperature: float | None = None
top_p: float | None = None
top_k: int | None = None
tools: list[dict[str, Any]] | None = None
tool_choice: str | dict[str, Any] | None = None
parallel_tool_calls: bool | None = None

View File

@@ -1,6 +1,6 @@
from enum import Enum
from exo.shared.types.api import GenerationStats
from exo.shared.types.api import GenerationStats, TopLogprobItem
from exo.utils.pydantic_ext import TaggedModel
from .api import FinishReason
@@ -20,6 +20,8 @@ class BaseChunk(TaggedModel):
class TokenChunk(BaseChunk):
text: str
token_id: int
logprob: float | None = None # Log probability of the selected token
top_logprobs: list[TopLogprobItem] | None = None # Top-k alternative tokens
finish_reason: FinishReason | None = None
stats: GenerationStats | None = None

View File

@@ -0,0 +1,168 @@
"""Claude Messages API types for request/response conversion."""
from typing import Literal
from pydantic import BaseModel, Field
# Type aliases
ClaudeRole = Literal["user", "assistant"]
ClaudeStopReason = Literal["end_turn", "max_tokens", "stop_sequence", "tool_use"]
# Content block types
class ClaudeTextBlock(BaseModel, frozen=True):
"""Text content block in Claude Messages API."""
type: Literal["text"] = "text"
text: str
class ClaudeImageSource(BaseModel, frozen=True):
"""Image source for Claude image blocks."""
type: Literal["base64", "url"]
media_type: str | None = None
data: str | None = None
url: str | None = None
class ClaudeImageBlock(BaseModel, frozen=True):
"""Image content block in Claude Messages API."""
type: Literal["image"] = "image"
source: ClaudeImageSource
ClaudeContentBlock = ClaudeTextBlock | ClaudeImageBlock
# Request types
class ClaudeMessage(BaseModel, frozen=True):
"""Message in Claude Messages API request."""
role: ClaudeRole
content: str | list[ClaudeContentBlock]
class ClaudeMessagesRequest(BaseModel):
"""Request body for Claude Messages API."""
model: str
max_tokens: int
messages: list[ClaudeMessage]
system: str | list[ClaudeTextBlock] | None = None
stop_sequences: list[str] | None = None
stream: bool = False
temperature: float | None = None
top_p: float | None = None
top_k: int | None = None
metadata: dict[str, str] | None = None
# Response types
class ClaudeUsage(BaseModel, frozen=True):
"""Token usage in Claude Messages API response."""
input_tokens: int
output_tokens: int
class ClaudeMessagesResponse(BaseModel, frozen=True):
"""Response body for Claude Messages API."""
id: str
type: Literal["message"] = "message"
role: Literal["assistant"] = "assistant"
content: list[ClaudeTextBlock]
model: str
stop_reason: ClaudeStopReason | None = None
stop_sequence: str | None = None
usage: ClaudeUsage
# Streaming event types
class ClaudeMessageStart(BaseModel, frozen=True):
"""Partial message in message_start event."""
id: str
type: Literal["message"] = "message"
role: Literal["assistant"] = "assistant"
content: list[ClaudeTextBlock] = Field(default_factory=list)
model: str
stop_reason: ClaudeStopReason | None = None
stop_sequence: str | None = None
usage: ClaudeUsage
class ClaudeMessageStartEvent(BaseModel, frozen=True):
"""Event sent at start of message stream."""
type: Literal["message_start"] = "message_start"
message: ClaudeMessageStart
class ClaudeContentBlockStartEvent(BaseModel, frozen=True):
"""Event sent at start of a content block."""
type: Literal["content_block_start"] = "content_block_start"
index: int
content_block: ClaudeTextBlock
class ClaudeTextDelta(BaseModel, frozen=True):
"""Delta for text content block."""
type: Literal["text_delta"] = "text_delta"
text: str
class ClaudeContentBlockDeltaEvent(BaseModel, frozen=True):
"""Event sent for content block delta."""
type: Literal["content_block_delta"] = "content_block_delta"
index: int
delta: ClaudeTextDelta
class ClaudeContentBlockStopEvent(BaseModel, frozen=True):
"""Event sent at end of a content block."""
type: Literal["content_block_stop"] = "content_block_stop"
index: int
class ClaudeMessageDeltaUsage(BaseModel, frozen=True):
"""Usage in message_delta event."""
output_tokens: int
class ClaudeMessageDelta(BaseModel, frozen=True):
"""Delta in message_delta event."""
stop_reason: ClaudeStopReason | None = None
stop_sequence: str | None = None
class ClaudeMessageDeltaEvent(BaseModel, frozen=True):
"""Event sent with final message delta."""
type: Literal["message_delta"] = "message_delta"
delta: ClaudeMessageDelta
usage: ClaudeMessageDeltaUsage
class ClaudeMessageStopEvent(BaseModel, frozen=True):
"""Event sent at end of message stream."""
type: Literal["message_stop"] = "message_stop"
ClaudeStreamEvent = (
ClaudeMessageStartEvent
| ClaudeContentBlockStartEvent
| ClaudeContentBlockDeltaEvent
| ClaudeContentBlockStopEvent
| ClaudeMessageDeltaEvent
| ClaudeMessageStopEvent
)
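Taken together, these models cover the full Claude streaming lifecycle: message_start, content_block_start, repeated content_block_delta, content_block_stop, message_delta, message_stop. A minimal sketch of how a server could serialize that sequence with the SSE framing exercised in test_sse_format; the helper names, ids, and sample text are illustrative and not part of this diff, and the sketch assumes the Claude* models above are importable (their module path is not shown here).

from collections.abc import Iterator

def sse(name: str, event) -> str:
    # Same "event: <name>\ndata: <json>\n\n" framing asserted in test_sse_format.
    return f"event: {name}\ndata: {event.model_dump_json()}\n\n"

def claude_sse_stream(chunks: list[str]) -> Iterator[str]:
    # Hypothetical helper: streams one text block as a Claude event sequence.
    yield sse("message_start", ClaudeMessageStartEvent(message=ClaudeMessageStart(
        id="msg_example", model="example-model",
        usage=ClaudeUsage(input_tokens=0, output_tokens=0))))
    yield sse("content_block_start", ClaudeContentBlockStartEvent(
        index=0, content_block=ClaudeTextBlock(text="")))
    for chunk in chunks:
        yield sse("content_block_delta", ClaudeContentBlockDeltaEvent(
            index=0, delta=ClaudeTextDelta(text=chunk)))
    yield sse("content_block_stop", ClaudeContentBlockStopEvent(index=0))
    yield sse("message_delta", ClaudeMessageDeltaEvent(
        delta=ClaudeMessageDelta(stop_reason="end_turn"),
        usage=ClaudeMessageDeltaUsage(output_tokens=len(chunks))))
    yield sse("message_stop", ClaudeMessageStopEvent())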

View File

@@ -106,6 +106,12 @@ class ChunkGenerated(BaseEvent):
chunk: GenerationChunk
class PrefillProgress(BaseEvent):
command_id: CommandId
processed_tokens: int
total_tokens: int
class TopologyEdgeCreated(BaseEvent):
edge: Connection
@@ -131,6 +137,7 @@ Event = (
| NodeMemoryMeasured
| NodeDownloadProgress
| ChunkGenerated
| PrefillProgress
| TopologyEdgeCreated
| TopologyEdgeDeleted
)

View File

@@ -0,0 +1,162 @@
"""OpenAI Responses API types for request/response conversion."""
import time
from typing import Literal
from pydantic import BaseModel, Field
# Type aliases
ResponseStatus = Literal["completed", "failed", "in_progress", "incomplete"]
ResponseRole = Literal["user", "assistant", "system", "developer"]
# Request types
class ResponseInputMessage(BaseModel, frozen=True):
"""Input message for Responses API."""
role: ResponseRole
content: str
class ResponsesRequest(BaseModel):
"""Request body for OpenAI Responses API."""
model: str
input: str | list[ResponseInputMessage]
instructions: str | None = None
max_output_tokens: int | None = None
temperature: float | None = None
top_p: float | None = None
stream: bool = False
# previous_response_id not supported in MVP
metadata: dict[str, str] | None = None
# Response types
class ResponseOutputText(BaseModel, frozen=True):
"""Text content in response output."""
type: Literal["output_text"] = "output_text"
text: str
annotations: list[dict[str, str]] = Field(default_factory=list)
class ResponseMessageItem(BaseModel, frozen=True):
"""Message item in response output array."""
type: Literal["message"] = "message"
id: str
role: Literal["assistant"] = "assistant"
content: list[ResponseOutputText]
status: ResponseStatus = "completed"
ResponseItem = ResponseMessageItem # Can expand for function_call, reasoning, etc.
class ResponseUsage(BaseModel, frozen=True):
"""Token usage in Responses API response."""
input_tokens: int
output_tokens: int
total_tokens: int
class ResponsesResponse(BaseModel, frozen=True):
"""Response body for OpenAI Responses API."""
id: str
object: Literal["response"] = "response"
created_at: int = Field(default_factory=lambda: int(time.time()))
status: ResponseStatus = "completed"
model: str
output: list[ResponseItem]
output_text: str
usage: ResponseUsage | None = None
# Streaming event types
class ResponseCreatedEvent(BaseModel, frozen=True):
"""Event sent when response is created."""
type: Literal["response.created"] = "response.created"
response: ResponsesResponse
class ResponseInProgressEvent(BaseModel, frozen=True):
"""Event sent when response starts processing."""
type: Literal["response.in_progress"] = "response.in_progress"
response: ResponsesResponse
class ResponseOutputItemAddedEvent(BaseModel, frozen=True):
"""Event sent when an output item is added."""
type: Literal["response.output_item.added"] = "response.output_item.added"
output_index: int
item: ResponseItem
class ResponseContentPartAddedEvent(BaseModel, frozen=True):
"""Event sent when a content part is added."""
type: Literal["response.content_part.added"] = "response.content_part.added"
output_index: int
content_index: int
part: ResponseOutputText
class ResponseTextDeltaEvent(BaseModel, frozen=True):
"""Event sent for text delta during streaming."""
type: Literal["response.output_text.delta"] = "response.output_text.delta"
output_index: int
content_index: int
delta: str
class ResponseTextDoneEvent(BaseModel, frozen=True):
"""Event sent when text content is done."""
type: Literal["response.output_text.done"] = "response.output_text.done"
output_index: int
content_index: int
text: str
class ResponseContentPartDoneEvent(BaseModel, frozen=True):
"""Event sent when a content part is done."""
type: Literal["response.content_part.done"] = "response.content_part.done"
output_index: int
content_index: int
part: ResponseOutputText
class ResponseOutputItemDoneEvent(BaseModel, frozen=True):
"""Event sent when an output item is done."""
type: Literal["response.output_item.done"] = "response.output_item.done"
output_index: int
item: ResponseItem
class ResponseCompletedEvent(BaseModel, frozen=True):
"""Event sent when response is completed."""
type: Literal["response.completed"] = "response.completed"
response: ResponsesResponse
ResponsesStreamEvent = (
ResponseCreatedEvent
| ResponseInProgressEvent
| ResponseOutputItemAddedEvent
| ResponseContentPartAddedEvent
| ResponseTextDeltaEvent
| ResponseTextDoneEvent
| ResponseContentPartDoneEvent
| ResponseOutputItemDoneEvent
| ResponseCompletedEvent
)
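For the non-streaming path, a response body is just a ResponsesResponse wrapping a single ResponseMessageItem. A small sketch of that assembly, mirroring the id prefixes the adapter tests expect ("resp_" and "item_"); the helper name is illustrative, and the real conversion is chat_response_to_responses_response in exo.master.adapters.responses.

def build_responses_payload(
    completion_id: str, model: str, text: str, usage: ResponseUsage | None = None
) -> ResponsesResponse:
    # Illustrative assembly only, using the models defined above.
    item = ResponseMessageItem(
        id=f"item_{completion_id}",
        content=[ResponseOutputText(text=text)],
        status="completed",
    )
    return ResponsesResponse(
        id=f"resp_{completion_id}",
        model=model,
        status="completed",
        output=[item],
        output_text=text,
        usage=usage,
    )

# e.g. build_responses_payload("chatcmpl-123", "llama-3.2-1b", "Hello!",
#          ResponseUsage(input_tokens=10, output_tokens=5, total_tokens=15))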

View File

@@ -1,4 +1,4 @@
from exo.shared.types.api import FinishReason, GenerationStats
from exo.shared.types.api import FinishReason, GenerationStats, TopLogprobItem
from exo.utils.pydantic_ext import TaggedModel
@@ -13,10 +13,16 @@ class TokenizedResponse(BaseRunnerResponse):
class GenerationResponse(BaseRunnerResponse):
text: str
token: int
# logprobs: list[float] | None = None # too big. we can change to be top-k
logprob: float | None = None # Log probability of the selected token
top_logprobs: list[TopLogprobItem] | None = None # Top-k alternative tokens
finish_reason: FinishReason | None = None
stats: GenerationStats | None = None
class FinishedResponse(BaseRunnerResponse):
pass
class PrefillProgressResponse(BaseRunnerResponse):
processed_tokens: int
total_tokens: int

View File

@@ -12,6 +12,7 @@ from exo.shared.types.api import (
ChatCompletionMessage,
FinishReason,
GenerationStats,
TopLogprobItem,
)
from exo.shared.types.memory import Memory
from exo.shared.types.tasks import ChatCompletionTaskParams
@@ -81,7 +82,7 @@ def warmup_inference(
max_tokens=50,
sampler=sampler,
prompt_cache=cache,
prefill_step_size=2048,
prefill_step_size=256, # Temporarily reduced from 2048 for testing progress bar
kv_group_size=KV_GROUP_SIZE,
kv_bits=KV_BITS,
):
@@ -115,10 +116,65 @@ def eos_ids_from_tokenizer(tokenizer: TokenizerWrapper) -> list[int]:
return eos
def extract_top_logprobs(
logprobs: mx.array,
tokenizer: TokenizerWrapper,
top_k: int,
selected_token: int,
) -> tuple[float, list[TopLogprobItem]]:
"""Extract the selected token's logprob and top-k alternative tokens.
Args:
logprobs: Full vocabulary logprobs array from MLX
tokenizer: Tokenizer for decoding token IDs to strings
top_k: Number of top alternatives to return
selected_token: The token ID that was actually sampled
Returns:
Tuple of (selected_token_logprob, list of TopLogprobItem for top-k tokens)
"""
# Get the logprob of the selected token
selected_logprob = float(logprobs[selected_token].item())
# Get top-k indices (most probable tokens)
# mx.argpartition gives indices that would partition the array
# We negate logprobs since argpartition finds smallest, and we want largest
top_k = min(top_k, logprobs.shape[0]) # Don't exceed vocab size
top_indices = mx.argpartition(-logprobs, top_k)[:top_k]
# Get the actual logprob values for these indices
top_values = logprobs[top_indices]
# Sort by logprob (descending) for consistent ordering
sort_order = mx.argsort(-top_values)
top_indices = top_indices[sort_order]
top_values = top_values[sort_order]
# Convert to list of TopLogprobItem
top_logprob_items: list[TopLogprobItem] = []
for i in range(top_k):
token_id = int(top_indices[i].item())
token_logprob = float(top_values[i].item())
# Decode token ID to string
token_str = tokenizer.decode([token_id])
# Get byte representation
token_bytes = list(token_str.encode("utf-8"))
top_logprob_items.append(
TopLogprobItem(
token=token_str,
logprob=token_logprob,
bytes=token_bytes,
)
)
return selected_logprob, top_logprob_items
def mlx_generate(
model: Model,
tokenizer: TokenizerWrapper,
task: ChatCompletionTaskParams,
on_prefill_progress: Callable[[int, int], None] | None = None,
) -> Generator[GenerationResponse]:
# Ensure that generation stats only contains peak memory for this generation
mx.reset_peak_memory()
@@ -146,9 +202,24 @@ def mlx_generate(
sampler = make_sampler(
temp=task.temperature if task.temperature is not None else 0.7,
top_p=task.top_p if task.top_p is not None else 1.0,
top_k=task.top_k if task.top_k is not None else 0,
)
# Normalize stop sequences to a list
stop_sequences: list[str] = (
([task.stop] if isinstance(task.stop, str) else task.stop)
if task.stop is not None
else []
)
max_stop_len = max((len(s) for s in stop_sequences), default=0)
max_tokens = task.max_tokens or MAX_TOKENS
accumulated_text = ""
# Determine if we need to extract logprobs
should_extract_logprobs = task.logprobs is True
num_top_logprobs = task.top_logprobs if task.top_logprobs is not None else 5
for out in stream_generate(
model=model,
tokenizer=tokenizer,
@@ -158,14 +229,47 @@ def mlx_generate(
logits_processors=logits_processors,
prompt_cache=caches,
# TODO: Dynamically change prefill step size to be the maximum possible without timing out.
prefill_step_size=2048,
prefill_step_size=256, # Temporarily reduced from 2048 for testing progress bar
kv_group_size=KV_GROUP_SIZE,
kv_bits=KV_BITS,
prompt_progress_callback=on_prefill_progress,
):
logger.info(out.text)
accumulated_text += out.text
# Check for stop sequences
text = out.text
finish_reason: FinishReason | None = cast(
FinishReason | None, out.finish_reason
)
stop_matched = False
if stop_sequences:
for stop_seq in stop_sequences:
if stop_seq in accumulated_text:
# Trim text to just before the stop sequence
stop_index = accumulated_text.find(stop_seq)
text_before_stop = accumulated_text[:stop_index]
chunk_start = len(accumulated_text) - len(out.text)
text = text_before_stop[chunk_start:]
finish_reason = "stop"
stop_matched = True
break
# Extract logprobs if requested
token_logprob: float | None = None
top_logprobs: list[TopLogprobItem] | None = None
if should_extract_logprobs:
token_logprob, top_logprobs = extract_top_logprobs(
logprobs=out.logprobs,
tokenizer=tokenizer,
top_k=num_top_logprobs,
selected_token=out.token,
)
is_done = finish_reason is not None
stats: GenerationStats | None = None
if out.finish_reason is not None:
if is_done:
stats = GenerationStats(
prompt_tps=float(out.prompt_tps),
generation_tps=float(out.generation_tps),
@@ -173,22 +277,25 @@ def mlx_generate(
generation_tokens=int(out.generation_tokens),
peak_memory_usage=Memory.from_gb(out.peak_memory),
)
if out.finish_reason not in get_args(FinishReason):
# We don't raise here since an unexpected finish_reason is not fatal;
# just log the error and move on
if not stop_matched and out.finish_reason not in get_args(FinishReason):
logger.warning(
f"Model generated unexpected finish_reason: {out.finish_reason}"
)
yield GenerationResponse(
text=out.text,
text=text,
token=out.token,
finish_reason=cast(FinishReason | None, out.finish_reason),
logprob=token_logprob,
top_logprobs=top_logprobs,
finish_reason=finish_reason,
stats=stats,
)
if out.finish_reason is not None:
if is_done:
break
# Limit accumulated_text to what's needed for stop sequence detection
if max_stop_len > 0 and len(accumulated_text) > max_stop_len:
accumulated_text = accumulated_text[-max_stop_len:]
# TODO: Do we want an mx_barrier?
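The stop-sequence handling above emits only the portion of the current chunk that precedes the matched stop string, while accumulated_text is kept just long enough to catch sequences that span chunk boundaries. A stand-alone sketch of that index arithmetic with toy strings (function name and sample values are illustrative):

def trim_chunk(accumulated: str, chunk: str, stop_seq: str) -> tuple[str, bool]:
    # Mirrors the trimming in mlx_generate: `accumulated` already ends with `chunk`.
    if stop_seq not in accumulated:
        return chunk, False
    stop_index = accumulated.find(stop_seq)
    text_before_stop = accumulated[:stop_index]
    chunk_start = len(accumulated) - len(chunk)
    # Only the part of the current chunk that precedes the stop sequence is emitted.
    return text_before_stop[chunk_start:], True

assert trim_chunk("Hello ST" + "OP now", "OP now", "STOP") == ("", True)
assert trim_chunk("Hello" + " STOP", " STOP", "STOP") == (" ", True)
assert trim_chunk("Hel" + "lo", "lo", "STOP") == ("lo", False)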

View File

@@ -20,6 +20,7 @@ except ImportError:
from mlx_lm.models.cache import KVCache, QuantizedKVCache, RotatingKVCache
from mlx_lm.models.deepseek_v3 import DeepseekV3Model
from mlx_lm.models.gpt_oss import Model as GptOssModel
from mlx_lm.tokenizer_utils import TokenizerWrapper
from exo.worker.engines.mlx.constants import (
@@ -365,6 +366,8 @@ def apply_chat_template(
tools=chat_task_data.tools,
)
logger.info(prompt)
return prompt
@@ -396,6 +399,11 @@ def make_kv_cache(
) -> list[KVCache | RotatingKVCache | QuantizedKVCache]:
assert hasattr(model, "layers")
# TODO: Do this for all models
if hasattr(model, "make_cache") and isinstance(model, GptOssModel):
logger.info("Using MLX LM's make cache")
return model.make_cache() # type: ignore
if max_kv_size is None:
if KV_CACHE_BITS is None:
logger.info("Using default KV cache")

View File

@@ -1,12 +1,22 @@
import time
from collections.abc import Generator
from functools import cache
import mlx.core as mx
from mlx_lm.models.gpt_oss import Model as GptOssModel
from openai_harmony import ( # pyright: ignore[reportMissingTypeStubs]
HarmonyEncodingName,
Role,
StreamableParser,
load_harmony_encoding,
)
from exo.shared.types.api import ChatCompletionMessageText
from exo.shared.types.chunks import TokenChunk
from exo.shared.types.events import (
ChunkGenerated,
Event,
PrefillProgress,
RunnerStatusUpdated,
TaskAcknowledged,
TaskStatusUpdated,
@@ -152,12 +162,32 @@ def main(
assert task_params.messages[0].content is not None
_check_for_debug_prompts(task_params.messages[0].content)
# Define callback to send prefill progress events directly
def on_prefill_progress(processed: int, total: int) -> None:
if shard_metadata.device_rank == 0:
event_sender.send(
PrefillProgress(
command_id=command_id,
processed_tokens=processed,
total_tokens=total,
)
)
# Generate responses using the actual MLX generation
for response in mlx_generate(
mlx_generator = mlx_generate(
model=model,
tokenizer=tokenizer,
task=task_params,
):
on_prefill_progress=on_prefill_progress,
)
# GPT-OSS specific parsing to match other model formats.
if isinstance(model, GptOssModel):
mlx_generator = parse_gpt_oss(mlx_generator)
# TODO: Add tool call parser here
for response in mlx_generator:
match response:
case GenerationResponse():
if shard_metadata.device_rank == 0:
@@ -169,6 +199,8 @@ def main(
model=shard_metadata.model_meta.model_id,
text=response.text,
token_id=response.token,
logprob=response.logprob,
top_logprobs=response.top_logprobs,
finish_reason=response.finish_reason,
stats=response.stats,
),
@@ -207,6 +239,43 @@ def main(
break
@cache
def get_gpt_oss_encoding():
encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
return encoding
def parse_gpt_oss(
responses: Generator[GenerationResponse],
) -> Generator[GenerationResponse]:
encoding = get_gpt_oss_encoding()
stream = StreamableParser(encoding, role=Role.ASSISTANT)
thinking = False
for response in responses:
stream.process(response.token)
delta = stream.last_content_delta
ch = stream.current_channel
if ch == "analysis" and not thinking:
thinking = True
yield response.model_copy(update={"text": "<think>"})
if ch != "analysis" and thinking:
thinking = False
yield response.model_copy(update={"text": "</think>"})
if delta:
yield response.model_copy(update={"text": delta})
if response.finish_reason is not None:
if thinking:
yield response.model_copy(update={"text": "</think>"})
yield response
break
EXO_RUNNER_MUST_FAIL = "EXO RUNNER MUST FAIL"
EXO_RUNNER_MUST_OOM = "EXO RUNNER MUST OOM"
EXO_RUNNER_MUST_TIMEOUT = "EXO RUNNER MUST TIMEOUT"
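parse_gpt_oss above feeds each token through Harmony's StreamableParser and maps the "analysis" channel onto the <think>/</think> convention the dashboard already renders for other models. A simplified sketch of that state machine, operating on pre-parsed (channel, delta) pairs instead of real tokens; the function name and sample data are illustrative, not from this diff.

from collections.abc import Iterable, Iterator

def tag_analysis_channel(parsed: Iterable[tuple[str, str]]) -> Iterator[str]:
    # `parsed` stands in for the (current_channel, last_content_delta) pairs
    # that StreamableParser yields while processing tokens.
    thinking = False
    for channel, delta in parsed:
        if channel == "analysis" and not thinking:
            thinking = True
            yield "<think>"
        if channel != "analysis" and thinking:
            thinking = False
            yield "</think>"
        if delta:
            yield delta
    if thinking:
        # Close the tag if the stream ends while still in the analysis channel.
        yield "</think>"

# e.g. list(tag_analysis_channel([("analysis", "plan"), ("final", "Hello")]))
#      == ["<think>", "plan", "</think>", "Hello"]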

View File

@@ -1,4 +1,5 @@
import http.client
import time
from anyio import create_task_group, to_thread
from loguru import logger
@@ -6,6 +7,8 @@ from loguru import logger
from exo.shared.topology import Topology
from exo.shared.types.common import NodeId
BAD_STATUSLINE_ATTEMPTS = 3
async def check_reachability(
target_ip: str,
@@ -15,8 +18,9 @@ async def check_reachability(
) -> None:
"""Check if a node is reachable at the given IP and verify its identity."""
def _fetch_remote_node_id() -> NodeId | None:
connection = http.client.HTTPConnection(target_ip, 52415, timeout=1)
# TODO: use an async http client
def _fetch_remote_node_id(*, attempt: int = 1) -> NodeId | None:
connection = http.client.HTTPConnection(target_ip, 52415, timeout=3)
try:
connection.request("GET", "/node_id")
response = connection.getresponse()
@@ -32,7 +36,16 @@ async def check_reachability(
return NodeId(body) or None
except OSError:
return None
except http.client.HTTPException:
except http.client.BadStatusLine:
if attempt >= BAD_STATUSLINE_ATTEMPTS:
logger.warning(
f"BadStatusLine from {target_ip}, after {attempt} attempts, assuming connection to {expected_node_id} has dropped"
)
return None
time.sleep(1)
return _fetch_remote_node_id(attempt=attempt + 1)
except http.client.HTTPException as e:
logger.warning(f"HTTPException from {target_ip}: {type(e).__name__}: {e}")
return None
finally:
connection.close()