Compare commits

..

7 Commits

Author SHA1 Message Date
Ryuichi Leo Takashige
9c320d7757 Test LRU eviction 2026-01-23 20:43:51 +00:00
Ryuichi Leo Takashige
424d96c6ac Remove incorrect typing 2026-01-23 20:36:50 +00:00
Ryuichi Leo Takashige
2d42af8477 Add tests 2026-01-23 19:50:36 +00:00
Ryuichi Leo Takashige
a02b452e24 Try to limit memory consumption 2026-01-23 19:50:30 +00:00
Ryuichi Leo Takashige
7744420341 cleanup 2026-01-23 16:32:58 +00:00
Ryuichi Leo Takashige
b777c6f505 Merge remote-tracking branch 'origin/main' into fix-kv-prefix-cache
# Conflicts:
#	.mlx_typings/mlx_lm/tokenizer_utils.pyi
#	src/exo/worker/engines/mlx/generator/generate.py
#	src/exo/worker/runner/runner.py
2026-01-23 16:11:26 +00:00
David Hind
812a9f232e Fix KV prefix cache for prompt reuse
- Wire up KVPrefixCache to runner and generate
- Fix exact match to return deepcopy (was returning a reference; see the sketch below)
- Fix trim_prompt_cache argument (was using wrong calculation)
- Fix token slicing to use best_snapshot_length (not index)
- Add _cache_length() using .offset for compatibility with older mlx_lm
- Fix prefill() to use max_tokens=1 with trim (workaround for mlx_lm bug)
- Add clear() method for single-cache behavior
- Remove KEEP_KV_SIZE limit from prefix matching
- Add minimal logging for cache hits/misses
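
An illustrative condensation of the exact-match fix (not the literal patch; names such as cached_length and max_length are taken from the cache.py diff below):

    # was: return self.caches[i] directly, letting generation mutate the stored entry
    prompt_cache = deepcopy(self.caches[i])
    # trim so stream_generate can resume from the prompt's final token
    trim_prompt_cache(prompt_cache, cached_length - (max_length - 1))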

Fix type errors and KV cache implementation

Type fixes for CI:
- Add KVCacheType alias matching make_kv_cache return type
- Update function signatures to use consistent cache types
- Add explicit type annotations

KV cache fixes to actually reduce TTFT:
- get_kv_cache now prefills internally and returns only last token
- stream_generate receives 1 token on cache hit instead of full prompt
- Extract encode_prompt as standalone function for reuse

Refactor KV cache: move prefill to generate.py, add shared KVCacheType

Address PR feedback:
- Move KVCacheType to shared/types/mlx.py for reuse across codebase
- Move prefill logic from cache.py to generate.py
- get_kv_cache now only returns cache + remaining tokens (no prefill)
- Caller (mlx_generate) is responsible for prefilling (see the sketch below)
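
A rough sketch of the resulting call flow in mlx_generate (condensed from the generate.py diff below; exact arguments differ):

    caches, prompt_tokens, matched_index = kv_prefix_cache.get_kv_cache(
        model, tokenizer, prompt
    )
    prefill_tps = prefill(model, tokenizer, sampler, prompt_tokens[:-1], caches)
    for out in stream_generate(
        model=model,
        tokenizer=tokenizer,
        prompt=prompt_tokens[-1:],  # resume from the final prompt token
        prompt_cache=caches,
        sampler=sampler,
    ):
        ...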

Fix types: regenerate mlx stubs, remove type ignores

- Regenerate cache.pyi and tokenizer_utils.pyi stubs for latest mlx_lm
- Remove # type: ignore from cache.py (now fully typed)
- Remove unnecessary type ignores from generate.py
- Use mx.equal() instead of == for proper array typing

Fix encode_prompt to not add special tokens for chat-templated prompts

Chat templates (like Kimi-K2's <|im_user|>, <|im_middle|>, etc.) already
include their own structure markers. Adding BOS/EOS tokens on top of this
corrupts the prompt structure and can slow down prefill.

Use add_special_tokens=False since the chat template defines its own structure.
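
In code this amounts to a single flag (see encode_prompt in the cache.py diff below):

    tokens = tokenizer.encode(prompt, add_special_tokens=False)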

Add prefill logging with progress callbacks and timing stats
2026-01-23 15:38:28 +00:00
35 changed files with 1122 additions and 975 deletions

View File

@@ -216,8 +216,6 @@ export interface Message {
attachments?: MessageAttachment[];
ttftMs?: number; // Time to first token in ms (for assistant messages)
tps?: number; // Tokens per second (for assistant messages)
requestType?: "chat" | "image-generation" | "image-editing";
sourceImageDataUrl?: string; // For image editing regeneration
}
export interface Conversation {
@@ -1272,46 +1270,10 @@ class AppStore {
if (lastUserIndex === -1) return;
const lastUserMessage = this.messages[lastUserIndex];
const requestType = lastUserMessage.requestType || "chat";
const prompt = lastUserMessage.content;
// Remove any messages after the user message
this.messages = this.messages.slice(0, lastUserIndex + 1);
// Remove messages after user message (including the user message for image requests
// since generateImage/editImage will re-add it)
this.messages = this.messages.slice(0, lastUserIndex);
switch (requestType) {
case "image-generation":
await this.generateImage(prompt);
break;
case "image-editing":
if (lastUserMessage.sourceImageDataUrl) {
await this.editImage(prompt, lastUserMessage.sourceImageDataUrl);
} else {
// Can't regenerate edit without source image - restore user message and show error
this.messages.push(lastUserMessage);
const errorMessage = this.addMessage("assistant", "");
const idx = this.messages.findIndex((m) => m.id === errorMessage.id);
if (idx !== -1) {
this.messages[idx].content =
"Error: Cannot regenerate image edit - source image not found";
}
this.updateActiveConversation();
}
break;
case "chat":
default:
// Restore the user message for chat regeneration
this.messages.push(lastUserMessage);
await this.regenerateChatCompletion();
break;
}
}
/**
* Helper method to regenerate a chat completion response
*/
private async regenerateChatCompletion(): Promise<void> {
// Resend the message to get a new response
this.isLoading = true;
this.currentResponse = "";
@@ -1826,7 +1788,6 @@ class AppStore {
role: "user",
content: prompt,
timestamp: Date.now(),
requestType: "image-generation",
};
this.messages.push(userMessage);
@@ -2037,8 +1998,6 @@ class AppStore {
role: "user",
content: prompt,
timestamp: Date.now(),
requestType: "image-editing",
sourceImageDataUrl: imageDataUrl,
};
this.messages.push(userMessage);
@@ -2228,54 +2187,6 @@ class AppStore {
this.conversations.find((c) => c.id === this.activeConversationId) || null
);
}
/**
* Start a download on a specific node
*/
async startDownload(nodeId: string, shardMetadata: object): Promise<void> {
try {
const response = await fetch("/download/start", {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({
targetNodeId: nodeId,
shardMetadata: shardMetadata,
}),
});
if (!response.ok) {
const errorText = await response.text();
throw new Error(
`Failed to start download: ${response.status} - ${errorText}`,
);
}
} catch (error) {
console.error("Error starting download:", error);
throw error;
}
}
/**
* Delete a downloaded model from a specific node
*/
async deleteDownload(nodeId: string, modelId: string): Promise<void> {
try {
const response = await fetch(
`/download/${encodeURIComponent(nodeId)}/${encodeURIComponent(modelId)}`,
{
method: "DELETE",
},
);
if (!response.ok) {
const errorText = await response.text();
throw new Error(
`Failed to delete download: ${response.status} - ${errorText}`,
);
}
} catch (error) {
console.error("Error deleting download:", error);
throw error;
}
}
}
export const appStore = new AppStore();
@@ -2381,9 +2292,3 @@ export const setImageGenerationParams = (
) => appStore.setImageGenerationParams(params);
export const resetImageGenerationParams = () =>
appStore.resetImageGenerationParams();
// Download actions
export const startDownload = (nodeId: string, shardMetadata: object) =>
appStore.startDownload(nodeId, shardMetadata);
export const deleteDownload = (nodeId: string, modelId: string) =>
appStore.deleteDownload(nodeId, modelId);

View File

@@ -6,8 +6,6 @@
type DownloadProgress,
refreshState,
lastUpdate as lastUpdateStore,
startDownload,
deleteDownload,
} from "$lib/stores/app.svelte";
import HeaderNav from "$lib/components/HeaderNav.svelte";
@@ -30,7 +28,6 @@
etaMs: number;
status: "completed" | "downloading";
files: FileProgress[];
shardMetadata?: Record<string, unknown>;
};
type NodeEntry = {
@@ -272,12 +269,6 @@
}
}
// Extract shard_metadata for use with download actions
const shardMetadata = (downloadPayload.shard_metadata ??
downloadPayload.shardMetadata) as
| Record<string, unknown>
| undefined;
const entry: ModelEntry = {
modelId,
prettyName,
@@ -294,7 +285,6 @@
? "completed"
: "downloading",
files,
shardMetadata,
};
const existing = modelMap.get(modelId);
@@ -479,52 +469,6 @@
>
{pct.toFixed(1)}%
</span>
{#if model.status !== "completed" && model.shardMetadata}
<button
type="button"
class="text-exo-light-gray hover:text-exo-yellow transition-colors"
onclick={() =>
startDownload(node.nodeId, model.shardMetadata!)}
title="Start download"
>
<svg
class="w-4 h-4"
viewBox="0 0 20 20"
fill="none"
stroke="currentColor"
stroke-width="2"
>
<path
d="M10 3v10m0 0l-3-3m3 3l3-3M3 17h14"
stroke-linecap="round"
stroke-linejoin="round"
></path>
</svg>
</button>
{/if}
{#if model.status === "completed"}
<button
type="button"
class="text-exo-light-gray hover:text-red-400 transition-colors"
onclick={() =>
deleteDownload(node.nodeId, model.modelId)}
title="Delete download"
>
<svg
class="w-4 h-4"
viewBox="0 0 20 20"
fill="none"
stroke="currentColor"
stroke-width="2"
>
<path
d="M4 6h12M8 6V4h4v2m1 0v10a1 1 0 01-1 1H8a1 1 0 01-1-1V6h6"
stroke-linecap="round"
stroke-linejoin="round"
></path>
</svg>
</button>
{/if}
<button
type="button"
class="text-exo-light-gray hover:text-exo-yellow transition-colors"

View File

@@ -26,7 +26,7 @@ dependencies = [
"httpx>=0.28.1",
"tomlkit>=0.14.0",
"pillow>=11.0,<12.0", # compatibility with mflux
"mflux==0.15.4",
"mflux>=0.14.2",
"python-multipart>=0.0.21",
]

View File

@@ -1,284 +0,0 @@
import asyncio
from dataclasses import dataclass, field
from typing import Iterator
import anyio
from anyio import current_time
from anyio.abc import TaskGroup
from loguru import logger
from exo.download.download_utils import (
RepoDownloadProgress,
delete_model,
map_repo_download_progress_to_download_progress_data,
)
from exo.download.shard_downloader import ShardDownloader
from exo.shared.models.model_cards import ModelId
from exo.shared.types.commands import (
DeleteDownload,
ForwarderDownloadCommand,
StartDownload,
)
from exo.shared.types.common import NodeId, SessionId
from exo.shared.types.events import (
Event,
ForwarderEvent,
NodeDownloadProgress,
)
from exo.shared.types.worker.downloads import (
DownloadCompleted,
DownloadFailed,
DownloadOngoing,
DownloadPending,
DownloadProgress,
)
from exo.shared.types.worker.shards import ShardMetadata
from exo.utils.channels import Receiver, Sender, channel
@dataclass
class DownloadCoordinator:
node_id: NodeId
session_id: SessionId
shard_downloader: ShardDownloader
download_command_receiver: Receiver[ForwarderDownloadCommand]
local_event_sender: Sender[ForwarderEvent]
event_index_counter: Iterator[int]
# Local state
download_status: dict[ModelId, DownloadProgress] = field(default_factory=dict)
active_downloads: dict[ModelId, asyncio.Task[None]] = field(default_factory=dict)
# Internal event channel for forwarding (initialized in __post_init__)
event_sender: Sender[Event] = field(init=False)
event_receiver: Receiver[Event] = field(init=False)
_tg: TaskGroup = field(init=False)
def __post_init__(self) -> None:
self.event_sender, self.event_receiver = channel[Event]()
self._tg = anyio.create_task_group()
async def run(self) -> None:
logger.info("Starting DownloadCoordinator")
async with self._tg as tg:
tg.start_soon(self._command_processor)
tg.start_soon(self._forward_events)
tg.start_soon(self._emit_existing_download_progress)
def shutdown(self) -> None:
self._tg.cancel_scope.cancel()
async def _command_processor(self) -> None:
with self.download_command_receiver as commands:
async for cmd in commands:
# Only process commands targeting this node
if cmd.command.target_node_id != self.node_id:
continue
match cmd.command:
case StartDownload(shard_metadata=shard):
await self._start_download(shard)
case DeleteDownload(model_id=model_id):
await self._delete_download(model_id)
async def _start_download(self, shard: ShardMetadata) -> None:
model_id = shard.model_card.model_id
# Check if already downloading or complete
if model_id in self.download_status:
status = self.download_status[model_id]
if isinstance(status, (DownloadOngoing, DownloadCompleted)):
logger.debug(
f"Download for {model_id} already in progress or complete, skipping"
)
return
# Emit pending status
progress = DownloadPending(shard_metadata=shard, node_id=self.node_id)
self.download_status[model_id] = progress
await self.event_sender.send(NodeDownloadProgress(download_progress=progress))
# Check initial status from downloader
initial_progress = (
await self.shard_downloader.get_shard_download_status_for_shard(shard)
)
if initial_progress.status == "complete":
completed = DownloadCompleted(
shard_metadata=shard,
node_id=self.node_id,
total_bytes=initial_progress.total_bytes,
)
self.download_status[model_id] = completed
await self.event_sender.send(
NodeDownloadProgress(download_progress=completed)
)
return
# Start actual download
self._start_download_task(shard, initial_progress)
def _start_download_task(
self, shard: ShardMetadata, initial_progress: RepoDownloadProgress
) -> None:
model_id = shard.model_card.model_id
# Emit ongoing status
status = DownloadOngoing(
node_id=self.node_id,
shard_metadata=shard,
download_progress=map_repo_download_progress_to_download_progress_data(
initial_progress
),
)
self.download_status[model_id] = status
self.event_sender.send_nowait(NodeDownloadProgress(download_progress=status))
last_progress_time = 0.0
throttle_interval_secs = 1.0
async def download_progress_callback(
callback_shard: ShardMetadata, progress: RepoDownloadProgress
) -> None:
nonlocal last_progress_time
if progress.status == "complete":
completed = DownloadCompleted(
shard_metadata=callback_shard,
node_id=self.node_id,
total_bytes=progress.total_bytes,
)
self.download_status[callback_shard.model_card.model_id] = completed
await self.event_sender.send(
NodeDownloadProgress(download_progress=completed)
)
# Clean up active download tracking
if callback_shard.model_card.model_id in self.active_downloads:
del self.active_downloads[callback_shard.model_card.model_id]
elif (
progress.status == "in_progress"
and current_time() - last_progress_time > throttle_interval_secs
):
ongoing = DownloadOngoing(
node_id=self.node_id,
shard_metadata=callback_shard,
download_progress=map_repo_download_progress_to_download_progress_data(
progress
),
)
self.download_status[callback_shard.model_card.model_id] = ongoing
await self.event_sender.send(
NodeDownloadProgress(download_progress=ongoing)
)
last_progress_time = current_time()
self.shard_downloader.on_progress(download_progress_callback)
async def download_wrapper() -> None:
try:
await self.shard_downloader.ensure_shard(shard)
except Exception as e:
logger.error(f"Download failed for {model_id}: {e}")
failed = DownloadFailed(
shard_metadata=shard,
node_id=self.node_id,
error_message=str(e),
)
self.download_status[model_id] = failed
await self.event_sender.send(
NodeDownloadProgress(download_progress=failed)
)
finally:
if model_id in self.active_downloads:
del self.active_downloads[model_id]
task = asyncio.create_task(download_wrapper())
self.active_downloads[model_id] = task
async def _delete_download(self, model_id: ModelId) -> None:
# Cancel if active
if model_id in self.active_downloads:
logger.info(f"Cancelling active download for {model_id} before deletion")
self.active_downloads[model_id].cancel()
del self.active_downloads[model_id]
# Delete from disk
logger.info(f"Deleting model files for {model_id}")
deleted = await delete_model(model_id)
if deleted:
logger.info(f"Successfully deleted model {model_id}")
else:
logger.warning(f"Model {model_id} was not found on disk")
# Emit pending status to reset UI state, then remove from local tracking
if model_id in self.download_status:
current_status = self.download_status[model_id]
pending = DownloadPending(
shard_metadata=current_status.shard_metadata,
node_id=self.node_id,
)
await self.event_sender.send(
NodeDownloadProgress(download_progress=pending)
)
del self.download_status[model_id]
async def _forward_events(self) -> None:
with self.event_receiver as events:
async for event in events:
idx = next(self.event_index_counter)
fe = ForwarderEvent(
origin_idx=idx,
origin=self.node_id,
session=self.session_id,
event=event,
)
logger.debug(
f"DownloadCoordinator published event {idx}: {str(event)[:100]}"
)
await self.local_event_sender.send(fe)
async def _emit_existing_download_progress(self) -> None:
try:
while True:
logger.info(
"DownloadCoordinator: Fetching and emitting existing download progress..."
)
async for (
_,
progress,
) in self.shard_downloader.get_shard_download_status():
if progress.status == "complete":
status: DownloadProgress = DownloadCompleted(
node_id=self.node_id,
shard_metadata=progress.shard,
total_bytes=progress.total_bytes,
)
elif progress.status in ["in_progress", "not_started"]:
if progress.downloaded_bytes_this_session.in_bytes == 0:
status = DownloadPending(
node_id=self.node_id, shard_metadata=progress.shard
)
else:
status = DownloadOngoing(
node_id=self.node_id,
shard_metadata=progress.shard,
download_progress=map_repo_download_progress_to_download_progress_data(
progress
),
)
else:
continue
self.download_status[progress.shard.model_card.model_id] = status
await self.event_sender.send(
NodeDownloadProgress(download_progress=status)
)
logger.info(
"DownloadCoordinator: Done emitting existing download progress."
)
await anyio.sleep(5 * 60) # 5 minutes
except Exception as e:
logger.error(
f"DownloadCoordinator: Error emitting existing download progress: {e}"
)

View File

@@ -1,11 +1,10 @@
import argparse
import itertools
import multiprocessing as mp
import os
import resource
import signal
from dataclasses import dataclass, field
from typing import Iterator, Self
from typing import Self
import anyio
from anyio.abc import TaskGroup
@@ -13,8 +12,6 @@ from loguru import logger
from pydantic import PositiveInt
import exo.routing.topics as topics
from exo.download.coordinator import DownloadCoordinator
from exo.download.impl_shard_downloader import exo_shard_downloader
from exo.master.api import API # TODO: should API be in master?
from exo.master.main import Master
from exo.routing.router import Router, get_node_id_keypair
@@ -24,6 +21,7 @@ from exo.shared.logging import logger_cleanup, logger_setup
from exo.shared.types.common import NodeId, SessionId
from exo.utils.channels import Receiver, channel
from exo.utils.pydantic_ext import CamelCaseModel
from exo.worker.download.impl_shard_downloader import exo_shard_downloader
from exo.worker.main import Worker
@@ -31,7 +29,6 @@ from exo.worker.main import Worker
@dataclass
class Node:
router: Router
download_coordinator: DownloadCoordinator | None
worker: Worker | None
election: Election # Every node participates in the election, since we want some node to become master when no master candidates are present, even if that node isn't itself a master candidate.
election_result_receiver: Receiver[ElectionResult]
@@ -39,7 +36,6 @@ class Node:
api: API | None
node_id: NodeId
event_index_counter: Iterator[int]
_tg: TaskGroup = field(init=False, default_factory=anyio.create_task_group)
@classmethod
@@ -53,26 +49,8 @@ class Node:
await router.register_topic(topics.COMMANDS)
await router.register_topic(topics.ELECTION_MESSAGES)
await router.register_topic(topics.CONNECTION_MESSAGES)
await router.register_topic(topics.DOWNLOAD_COMMANDS)
logger.info(f"Starting node {node_id}")
# Create shared event index counter for Worker and DownloadCoordinator
event_index_counter = itertools.count()
# Create DownloadCoordinator (unless --no-downloads)
if not args.no_downloads:
download_coordinator = DownloadCoordinator(
node_id,
session_id,
exo_shard_downloader(),
download_command_receiver=router.receiver(topics.DOWNLOAD_COMMANDS),
local_event_sender=router.sender(topics.LOCAL_EVENTS),
event_index_counter=event_index_counter,
)
else:
download_coordinator = None
if args.spawn_api:
api = API(
node_id,
@@ -80,7 +58,6 @@ class Node:
port=args.api_port,
global_event_receiver=router.receiver(topics.GLOBAL_EVENTS),
command_sender=router.sender(topics.COMMANDS),
download_command_sender=router.sender(topics.DOWNLOAD_COMMANDS),
election_receiver=router.receiver(topics.ELECTION_MESSAGES),
)
else:
@@ -90,12 +67,11 @@ class Node:
worker = Worker(
node_id,
session_id,
exo_shard_downloader(),
connection_message_receiver=router.receiver(topics.CONNECTION_MESSAGES),
global_event_receiver=router.receiver(topics.GLOBAL_EVENTS),
local_event_sender=router.sender(topics.LOCAL_EVENTS),
command_sender=router.sender(topics.COMMANDS),
download_command_sender=router.sender(topics.DOWNLOAD_COMMANDS),
event_index_counter=event_index_counter,
)
else:
worker = None
@@ -123,25 +99,13 @@ class Node:
election_result_sender=er_send,
)
return cls(
router,
download_coordinator,
worker,
election,
er_recv,
master,
api,
node_id,
event_index_counter,
)
return cls(router, worker, election, er_recv, master, api, node_id)
async def run(self):
async with self._tg as tg:
signal.signal(signal.SIGINT, lambda _, __: self.shutdown())
tg.start_soon(self.router.run)
tg.start_soon(self.election.run)
if self.download_coordinator:
tg.start_soon(self.download_coordinator.run)
if self.worker:
tg.start_soon(self.worker.run)
if self.master:
@@ -206,27 +170,13 @@ class Node:
)
if result.is_new_master:
await anyio.sleep(0)
# Fresh counter for new session (buffer expects indices from 0)
self.event_index_counter = itertools.count()
if self.download_coordinator:
self.download_coordinator.shutdown()
self.download_coordinator = DownloadCoordinator(
self.node_id,
result.session_id,
exo_shard_downloader(),
download_command_receiver=self.router.receiver(
topics.DOWNLOAD_COMMANDS
),
local_event_sender=self.router.sender(topics.LOCAL_EVENTS),
event_index_counter=self.event_index_counter,
)
self._tg.start_soon(self.download_coordinator.run)
if self.worker:
self.worker.shutdown()
# TODO: add profiling etc to resource monitor
self.worker = Worker(
self.node_id,
result.session_id,
exo_shard_downloader(),
connection_message_receiver=self.router.receiver(
topics.CONNECTION_MESSAGES
),
@@ -235,10 +185,6 @@ class Node:
),
local_event_sender=self.router.sender(topics.LOCAL_EVENTS),
command_sender=self.router.sender(topics.COMMANDS),
download_command_sender=self.router.sender(
topics.DOWNLOAD_COMMANDS
),
event_index_counter=self.event_index_counter,
)
self._tg.start_soon(self.worker.run)
if self.api:
@@ -280,7 +226,6 @@ class Args(CamelCaseModel):
api_port: PositiveInt = 52415
tb_only: bool = False
no_worker: bool = False
no_downloads: bool = False
fast_synch: bool | None = None # None = auto, True = force on, False = force off
@classmethod
@@ -323,11 +268,6 @@ class Args(CamelCaseModel):
"--no-worker",
action="store_true",
)
parser.add_argument(
"--no-downloads",
action="store_true",
help="Disable the download coordinator (node won't download models)",
)
fast_synch_group = parser.add_mutually_exclusive_group()
fast_synch_group.add_argument(
"--fast-synch",

View File

@@ -44,7 +44,6 @@ from exo.shared.types.api import (
ChatCompletionResponse,
CreateInstanceParams,
CreateInstanceResponse,
DeleteDownloadResponse,
DeleteInstanceResponse,
ErrorInfo,
ErrorResponse,
@@ -62,8 +61,6 @@ from exo.shared.types.api import (
PlaceInstanceParams,
PlacementPreview,
PlacementPreviewResponse,
StartDownloadParams,
StartDownloadResponse,
StreamingChoiceResponse,
ToolCall,
)
@@ -78,16 +75,12 @@ from exo.shared.types.commands import (
ChatCompletion,
Command,
CreateInstance,
DeleteDownload,
DeleteInstance,
DownloadCommand,
ForwarderCommand,
ForwarderDownloadCommand,
ImageEdits,
ImageGeneration,
PlaceInstance,
SendInputChunk,
StartDownload,
TaskFinished,
)
from exo.shared.types.common import CommandId, Id, NodeId, SessionId
@@ -163,14 +156,12 @@ class API:
# Ideally this would be a MasterForwarderEvent but type system says no :(
global_event_receiver: Receiver[ForwarderEvent],
command_sender: Sender[ForwarderCommand],
download_command_sender: Sender[ForwarderDownloadCommand],
# This lets us pause the API if an election is running
election_receiver: Receiver[ElectionMessage],
) -> None:
self.state = State()
self._event_log: list[Event] = []
self.command_sender = command_sender
self.download_command_sender = download_command_sender
self.global_event_receiver = global_event_receiver
self.election_receiver = election_receiver
self.event_buffer: OrderedBuffer[Event] = OrderedBuffer[Event]()
@@ -269,8 +260,6 @@ class API:
self.app.get("/images/{image_id}")(self.get_image)
self.app.get("/state")(lambda: self.state)
self.app.get("/events")(lambda: self._event_log)
self.app.post("/download/start")(self.start_download)
self.app.delete("/download/{node_id}/{model_id:path}")(self.delete_download)
async def place_instance(self, payload: PlaceInstanceParams):
command = PlaceInstance(
@@ -356,9 +345,14 @@ class API:
) -> PlacementPreviewResponse:
seen: set[tuple[ModelId, Sharding, InstanceMeta, int]] = set()
previews: list[PlacementPreview] = []
required_nodes = set(node_ids) if node_ids else None
if len(list(self.state.topology.list_nodes())) == 0:
# Create filtered topology if node_ids specified
if node_ids and len(node_ids) > 0:
topology = self.state.topology.get_subgraph_from_nodes(node_ids)
else:
topology = self.state.topology
if len(list(topology.list_nodes())) == 0:
return PlacementPreviewResponse(previews=[])
cards = [card for card in MODEL_CARDS.values() if card.model_id == model_id]
@@ -371,9 +365,7 @@ class API:
instance_combinations.extend(
[
(sharding, instance_meta, i)
for i in range(
1, len(list(self.state.topology.list_nodes())) + 1
)
for i in range(1, len(list(topology.list_nodes())) + 1)
]
)
# TODO: PDD
@@ -391,9 +383,8 @@ class API:
),
node_memory=self.state.node_memory,
node_network=self.state.node_network,
topology=self.state.topology,
topology=topology,
current_instances=self.state.instances,
required_nodes=required_nodes,
)
except ValueError as exc:
if (model_card.model_id, sharding, instance_meta, 0) not in seen:
@@ -432,16 +423,14 @@ class API:
instance = new_instances[0]
shard_assignments = instance.shard_assignments
placement_node_ids = list(shard_assignments.node_to_runner.keys())
node_ids = list(shard_assignments.node_to_runner.keys())
memory_delta_by_node: dict[str, int] = {}
if placement_node_ids:
if node_ids:
total_bytes = model_card.storage_size.in_bytes
per_node = total_bytes // len(placement_node_ids)
remainder = total_bytes % len(placement_node_ids)
for index, node_id in enumerate(
sorted(placement_node_ids, key=str)
):
per_node = total_bytes // len(node_ids)
remainder = total_bytes % len(node_ids)
for index, node_id in enumerate(sorted(node_ids, key=str)):
extra = 1 if index < remainder else 0
memory_delta_by_node[str(node_id)] = per_node + extra
@@ -449,7 +438,7 @@ class API:
model_card.model_id,
sharding,
instance_meta,
len(placement_node_ids),
len(node_ids),
) not in seen:
previews.append(
PlacementPreview(
@@ -461,14 +450,7 @@ class API:
error=None,
)
)
seen.add(
(
model_card.model_id,
sharding,
instance_meta,
len(placement_node_ids),
)
)
seen.add((model_card.model_id, sharding, instance_meta, len(node_ids)))
return PlacementPreviewResponse(previews=previews)
@@ -1310,28 +1292,3 @@ class API:
await self.command_sender.send(
ForwarderCommand(origin=self.node_id, command=command)
)
async def _send_download(self, command: DownloadCommand):
await self.download_command_sender.send(
ForwarderDownloadCommand(origin=self.node_id, command=command)
)
async def start_download(
self, payload: StartDownloadParams
) -> StartDownloadResponse:
command = StartDownload(
target_node_id=payload.target_node_id,
shard_metadata=payload.shard_metadata,
)
await self._send_download(command)
return StartDownloadResponse(command_id=command.command_id)
async def delete_download(
self, node_id: NodeId, model_id: ModelId
) -> DeleteDownloadResponse:
command = DeleteDownload(
target_node_id=node_id,
model_id=ModelId(model_id),
)
await self._send_download(command)
return DeleteDownloadResponse(command_id=command.command_id)

View File

@@ -35,7 +35,7 @@ from exo.shared.types.worker.shards import Sharding
def random_ephemeral_port() -> int:
port = random.randint(49153, 65535)
return port - 1 if port <= 52415 else port
return port - 1 if port <= 52415 else 52414
def add_instance_to_placements(
@@ -54,18 +54,9 @@ def place_instance(
current_instances: Mapping[InstanceId, Instance],
node_memory: Mapping[NodeId, MemoryUsage],
node_network: Mapping[NodeId, NodeNetworkInfo],
required_nodes: set[NodeId] | None = None,
) -> dict[InstanceId, Instance]:
cycles = topology.get_cycles()
candidate_cycles = list(filter(lambda it: len(it) >= command.min_nodes, cycles))
# Filter to cycles containing all required nodes (subset matching)
if required_nodes:
candidate_cycles = [
cycle
for cycle in candidate_cycles
if required_nodes.issubset(cycle.node_ids)
]
cycles_with_sufficient_memory = filter_cycles_by_memory(
candidate_cycles, node_memory, command.model_card.storage_size
)

View File

@@ -3,7 +3,7 @@ from enum import Enum
from exo.routing.connection_message import ConnectionMessage
from exo.shared.election import ElectionMessage
from exo.shared.types.commands import ForwarderCommand, ForwarderDownloadCommand
from exo.shared.types.commands import ForwarderCommand
from exo.shared.types.events import (
ForwarderEvent,
)
@@ -45,6 +45,3 @@ ELECTION_MESSAGES = TypedTopic(
CONNECTION_MESSAGES = TypedTopic(
"connection_messages", PublishPolicy.Never, ConnectionMessage
)
DOWNLOAD_COMMANDS = TypedTopic(
"download_commands", PublishPolicy.Always, ForwarderDownloadCommand
)

View File

@@ -40,7 +40,6 @@ class ModelCard(CamelCaseModel):
supports_tensor: bool
tasks: list[ModelTask]
components: list[ComponentInfo] | None = None
quantization: int | None = None
@field_validator("tasks", mode="before")
@classmethod
@@ -414,7 +413,7 @@ MODEL_CARDS: dict[str, ModelCard] = {
),
}
_IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
_IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
"flux1-schnell": ModelCard(
model_id=ModelId("black-forest-labs/FLUX.1-schnell"),
storage_size=Memory.from_bytes(23782357120 + 9524621312),
@@ -429,7 +428,7 @@ _IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
storage_size=Memory.from_kb(0),
n_layers=12,
can_shard=False,
safetensors_index_filename=None,
safetensors_index_filename=None, # Single file
),
ComponentInfo(
component_name="text_encoder_2",
@@ -443,7 +442,7 @@ _IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
component_name="transformer",
component_path="transformer/",
storage_size=Memory.from_bytes(23782357120),
n_layers=57,
n_layers=57, # 19 transformer_blocks + 38 single_transformer_blocks
can_shard=True,
safetensors_index_filename="diffusion_pytorch_model.safetensors.index.json",
),
@@ -459,7 +458,7 @@ _IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
),
"flux1-dev": ModelCard(
model_id=ModelId("black-forest-labs/FLUX.1-dev"),
storage_size=Memory.from_bytes(23802816640 + 9524621312),
storage_size=Memory.from_bytes(23782357120 + 9524621312),
n_layers=57,
hidden_size=1,
supports_tensor=False,
@@ -471,7 +470,7 @@ _IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
storage_size=Memory.from_kb(0),
n_layers=12,
can_shard=False,
safetensors_index_filename=None,
safetensors_index_filename=None, # Single file
),
ComponentInfo(
component_name="text_encoder_2",
@@ -485,49 +484,7 @@ _IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
component_name="transformer",
component_path="transformer/",
storage_size=Memory.from_bytes(23802816640),
n_layers=57,
can_shard=True,
safetensors_index_filename="diffusion_pytorch_model.safetensors.index.json",
),
ComponentInfo(
component_name="vae",
component_path="vae/",
storage_size=Memory.from_kb(0),
n_layers=None,
can_shard=False,
safetensors_index_filename=None,
),
],
),
"flux1-krea-dev": ModelCard(
model_id=ModelId("black-forest-labs/FLUX.1-Krea-dev"),
storage_size=Memory.from_bytes(23802816640 + 9524621312), # Same as dev
n_layers=57,
hidden_size=1,
supports_tensor=False,
tasks=[ModelTask.TextToImage],
components=[
ComponentInfo(
component_name="text_encoder",
component_path="text_encoder/",
storage_size=Memory.from_kb(0),
n_layers=12,
can_shard=False,
safetensors_index_filename=None,
),
ComponentInfo(
component_name="text_encoder_2",
component_path="text_encoder_2/",
storage_size=Memory.from_bytes(9524621312),
n_layers=24,
can_shard=False,
safetensors_index_filename="model.safetensors.index.json",
),
ComponentInfo(
component_name="transformer",
component_path="transformer/",
storage_size=Memory.from_bytes(23802816640),
n_layers=57,
n_layers=57, # 19 transformer_blocks + 38 single_transformer_blocks
can_shard=True,
safetensors_index_filename="diffusion_pytorch_model.safetensors.index.json",
),
@@ -544,7 +501,7 @@ _IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
"qwen-image": ModelCard(
model_id=ModelId("Qwen/Qwen-Image"),
storage_size=Memory.from_bytes(16584333312 + 40860802176),
n_layers=60,
n_layers=60, # Qwen has 60 transformer blocks (all joint-style)
hidden_size=1,
supports_tensor=False,
tasks=[ModelTask.TextToImage],
@@ -552,10 +509,10 @@ _IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
ComponentInfo(
component_name="text_encoder",
component_path="text_encoder/",
storage_size=Memory.from_bytes(16584333312),
storage_size=Memory.from_kb(16584333312),
n_layers=12,
can_shard=False,
safetensors_index_filename=None,
safetensors_index_filename=None, # Single file
),
ComponentInfo(
component_name="transformer",
@@ -578,7 +535,7 @@ _IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
"qwen-image-edit-2509": ModelCard(
model_id=ModelId("Qwen/Qwen-Image-Edit-2509"),
storage_size=Memory.from_bytes(16584333312 + 40860802176),
n_layers=60,
n_layers=60, # Qwen has 60 transformer blocks (all joint-style)
hidden_size=1,
supports_tensor=False,
tasks=[ModelTask.ImageToImage],
@@ -586,10 +543,10 @@ _IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
ComponentInfo(
component_name="text_encoder",
component_path="text_encoder/",
storage_size=Memory.from_bytes(16584333312),
storage_size=Memory.from_kb(16584333312),
n_layers=12,
can_shard=False,
safetensors_index_filename=None,
safetensors_index_filename=None, # Single file
),
ComponentInfo(
component_name="transformer",
@@ -611,93 +568,6 @@ _IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
),
}
def _create_image_model_quant_variants(
base_name: str,
base_card: ModelCard,
) -> dict[str, ModelCard]:
"""Create quantized variants of an image model card.
Only the transformer component is quantized; text encoders stay at bf16.
Sizes are calculated exactly from the base card's component sizes.
"""
if base_card.components is None:
raise ValueError(f"Image model {base_name} must have components defined")
quantizations = [8, 6, 5, 4, 3]
num_transformer_bytes = next(
c.storage_size.in_bytes
for c in base_card.components
if c.component_name == "transformer"
)
transformer_bytes = Memory.from_bytes(num_transformer_bytes)
remaining_bytes = Memory.from_bytes(
sum(
c.storage_size.in_bytes
for c in base_card.components
if c.component_name != "transformer"
)
)
def with_transformer_size(new_size: Memory) -> list[ComponentInfo]:
assert base_card.components is not None
return [
ComponentInfo(
component_name=c.component_name,
component_path=c.component_path,
storage_size=new_size
if c.component_name == "transformer"
else c.storage_size,
n_layers=c.n_layers,
can_shard=c.can_shard,
safetensors_index_filename=c.safetensors_index_filename,
)
for c in base_card.components
]
variants = {
base_name: ModelCard(
model_id=base_card.model_id,
storage_size=transformer_bytes + remaining_bytes,
n_layers=base_card.n_layers,
hidden_size=base_card.hidden_size,
supports_tensor=base_card.supports_tensor,
tasks=base_card.tasks,
components=with_transformer_size(transformer_bytes),
quantization=None,
)
}
for quant in quantizations:
quant_transformer_bytes = Memory.from_bytes(
(num_transformer_bytes * quant) // 16
)
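# Worked example (illustrative, using the schnell transformer size above):
# 23782357120 bytes at 4-bit quantization -> 23782357120 * 4 // 16
# = 5945589280 bytes; the text encoders stay at bf16.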
total_bytes = remaining_bytes + quant_transformer_bytes
model_id = base_card.model_id + f"-{quant}bit"
variants[f"{base_name}-{quant}bit"] = ModelCard(
model_id=ModelId(model_id),
storage_size=total_bytes,
n_layers=base_card.n_layers,
hidden_size=base_card.hidden_size,
supports_tensor=base_card.supports_tensor,
tasks=base_card.tasks,
components=with_transformer_size(quant_transformer_bytes),
quantization=quant,
)
return variants
_image_model_cards: dict[str, ModelCard] = {}
for _base_name, _base_card in _IMAGE_BASE_MODEL_CARDS.items():
_image_model_cards |= _create_image_model_quant_variants(_base_name, _base_card)
_IMAGE_MODEL_CARDS = _image_model_cards
if EXO_ENABLE_IMAGE_MODELS:
MODEL_CARDS.update(_IMAGE_MODEL_CARDS)
@@ -751,7 +621,7 @@ class ConfigData(BaseModel):
async def get_config_data(model_id: ModelId) -> ConfigData:
"""Downloads and parses config.json for a model."""
from exo.download.download_utils import (
from exo.worker.download.download_utils import (
download_file_with_retry,
ensure_models_dir,
)
@@ -773,11 +643,11 @@ async def get_config_data(model_id: ModelId) -> ConfigData:
async def get_safetensors_size(model_id: ModelId) -> Memory:
"""Gets model size from safetensors index or falls back to HF API."""
from exo.download.download_utils import (
from exo.shared.types.worker.downloads import ModelSafetensorsIndex
from exo.worker.download.download_utils import (
download_file_with_retry,
ensure_models_dir,
)
from exo.shared.types.worker.downloads import ModelSafetensorsIndex
target_dir = (await ensure_models_dir()) / model_id.normalize()
await aios.makedirs(target_dir, exist_ok=True)

View File

@@ -248,8 +248,8 @@ class Topology:
) -> list[list[NodeId]]:
"""
Find cycles in the Thunderbolt topology where all nodes have TB bridge enabled.
Only returns cycles with >=2 nodes (2+ machines in a loop), as
1 node doesn't cause the broadcast storm problem.
Only returns cycles with >2 nodes (3+ machines in a loop), as cycles with
2 or fewer nodes don't cause the broadcast storm problem.
"""
enabled_nodes = {
node_id
@@ -257,7 +257,7 @@ class Topology:
if status.enabled
}
if len(enabled_nodes) < 2:
if len(enabled_nodes) < 3:
return []
thunderbolt_ips = _get_ips_with_interface_type(
@@ -288,7 +288,7 @@ class Topology:
return [
[graph[idx] for idx in cycle]
for cycle in rx.simple_cycles(graph)
if len(cycle) >= 2
if len(cycle) > 2
]

View File

@@ -7,11 +7,10 @@ from pydantic import BaseModel, Field, field_validator
from pydantic_core import PydanticUseDefault
from exo.shared.models.model_cards import ModelCard, ModelId
from exo.shared.types.common import CommandId, NodeId
from exo.shared.types.common import CommandId
from exo.shared.types.memory import Memory
from exo.shared.types.worker.instances import Instance, InstanceId, InstanceMeta
from exo.shared.types.worker.shards import Sharding, ShardMetadata
from exo.utils.pydantic_ext import CamelCaseModel
from exo.shared.types.worker.shards import Sharding
FinishReason = Literal[
"stop", "length", "tool_calls", "content_filter", "function_call", "error"
@@ -353,16 +352,3 @@ class ImageListItem(BaseModel, frozen=True):
class ImageListResponse(BaseModel, frozen=True):
data: list[ImageListItem]
class StartDownloadParams(CamelCaseModel):
target_node_id: NodeId
shard_metadata: ShardMetadata
class StartDownloadResponse(CamelCaseModel):
command_id: CommandId
class DeleteDownloadResponse(CamelCaseModel):
command_id: CommandId

View File

@@ -1,6 +1,6 @@
from pydantic import Field
from exo.shared.models.model_cards import ModelCard, ModelId
from exo.shared.models.model_cards import ModelCard
from exo.shared.types.api import (
ChatCompletionTaskParams,
ImageEditsInternalParams,
@@ -9,7 +9,7 @@ from exo.shared.types.api import (
from exo.shared.types.chunks import InputImageChunk
from exo.shared.types.common import CommandId, NodeId
from exo.shared.types.worker.instances import Instance, InstanceId, InstanceMeta
from exo.shared.types.worker.shards import Sharding, ShardMetadata
from exo.shared.types.worker.shards import Sharding
from exo.utils.pydantic_ext import CamelCaseModel, TaggedModel
@@ -62,19 +62,6 @@ class RequestEventLog(BaseCommand):
since_idx: int
class StartDownload(BaseCommand):
target_node_id: NodeId
shard_metadata: ShardMetadata
class DeleteDownload(BaseCommand):
target_node_id: NodeId
model_id: ModelId
DownloadCommand = StartDownload | DeleteDownload
Command = (
TestCommand
| RequestEventLog
@@ -92,8 +79,3 @@ Command = (
class ForwarderCommand(CamelCaseModel):
origin: NodeId
command: Command
class ForwarderDownloadCommand(CamelCaseModel):
origin: NodeId
command: DownloadCommand

View File

@@ -0,0 +1,11 @@
"""Shared types for MLX-related functionality."""
from mlx_lm.models.cache import (
KVCache,
QuantizedKVCache,
RotatingKVCache,
)
# Type alias for KV cache - matches make_kv_cache return type
# This list contains one cache entry per transformer layer
KVCacheType = list[KVCache | RotatingKVCache | QuantizedKVCache]

View File

@@ -1,32 +0,0 @@
import time
from typing import Generic, TypeVar
K = TypeVar("K")
class KeyedBackoff(Generic[K]):
"""Tracks exponential backoff state per key."""
def __init__(self, base: float = 0.5, cap: float = 10.0):
self._base = base
self._cap = cap
self._attempts: dict[K, int] = {}
self._last_time: dict[K, float] = {}
def should_proceed(self, key: K) -> bool:
"""Returns True if enough time has elapsed since last attempt."""
now = time.monotonic()
last = self._last_time.get(key, 0.0)
attempts = self._attempts.get(key, 0)
delay = min(self._cap, self._base * (2.0**attempts))
return now - last >= delay
def record_attempt(self, key: K) -> None:
"""Record that an attempt was made for this key."""
self._last_time[key] = time.monotonic()
self._attempts[key] = self._attempts.get(key, 0) + 1
def reset(self, key: K) -> None:
"""Reset backoff state for a key (e.g., on success)."""
self._attempts.pop(key, None)
self._last_time.pop(key, None)

View File

@@ -24,15 +24,7 @@ from pydantic import (
TypeAdapter,
)
from exo.download.huggingface_utils import (
filter_repo_objects,
get_allow_patterns,
get_auth_headers,
get_hf_endpoint,
get_hf_token,
)
from exo.shared.constants import EXO_MODELS_DIR
from exo.shared.models.model_cards import ModelTask
from exo.shared.types.common import ModelId
from exo.shared.types.memory import Memory
from exo.shared.types.worker.downloads import (
@@ -43,6 +35,13 @@ from exo.shared.types.worker.downloads import (
RepoFileDownloadProgress,
)
from exo.shared.types.worker.shards import ShardMetadata
from exo.worker.download.huggingface_utils import (
filter_repo_objects,
get_allow_patterns,
get_auth_headers,
get_hf_endpoint,
get_hf_token,
)
class HuggingFaceAuthenticationError(Exception):
@@ -482,11 +481,6 @@ async def resolve_allow_patterns(shard: ShardMetadata) -> list[str]:
return ["*"]
def is_image_model(shard: ShardMetadata) -> bool:
tasks = shard.model_card.tasks
return ModelTask.TextToImage in tasks or ModelTask.ImageToImage in tasks
async def get_downloaded_size(path: Path) -> int:
partial_path = path.with_suffix(path.suffix + ".partial")
if await aios.path.exists(path):
@@ -528,15 +522,6 @@ async def download_shard(
file_list, allow_patterns=allow_patterns, key=lambda x: x.path
)
)
# For image models, skip root-level safetensors files since weights
# are stored in component subdirectories (e.g., transformer/, vae/)
if is_image_model(shard):
filtered_file_list = [
f
for f in filtered_file_list
if "/" in f.path or not f.path.endswith(".safetensors")
]
file_progress: dict[str, RepoFileDownloadProgress] = {}
async def on_progress_wrapper(

View File

@@ -5,13 +5,13 @@ from typing import AsyncIterator, Callable
from loguru import logger
from exo.download.download_utils import RepoDownloadProgress, download_shard
from exo.download.shard_downloader import ShardDownloader
from exo.shared.models.model_cards import MODEL_CARDS, ModelCard, ModelId
from exo.shared.types.worker.shards import (
PipelineShardMetadata,
ShardMetadata,
)
from exo.worker.download.download_utils import RepoDownloadProgress, download_shard
from exo.worker.download.shard_downloader import ShardDownloader
def exo_shard_downloader(max_parallel_downloads: int = 8) -> ShardDownloader:

View File

@@ -5,13 +5,13 @@ from datetime import timedelta
from pathlib import Path
from typing import AsyncIterator, Callable
from exo.download.download_utils import RepoDownloadProgress
from exo.shared.models.model_cards import ModelCard, ModelId, ModelTask
from exo.shared.types.memory import Memory
from exo.shared.types.worker.shards import (
PipelineShardMetadata,
ShardMetadata,
)
from exo.worker.download.download_utils import RepoDownloadProgress
# TODO: the PipelineShardMetadata getting reinstantiated is a bit messy. Should this be a classmethod?

View File

@@ -6,10 +6,10 @@ import mlx.core as mx
from mflux.models.common.config.config import Config
from PIL import Image
from exo.download.download_utils import build_model_path
from exo.shared.types.api import AdvancedImageParams
from exo.shared.types.worker.instances import BoundInstance
from exo.shared.types.worker.shards import PipelineShardMetadata
from exo.worker.download.download_utils import build_model_path
from exo.worker.engines.image.config import ImageModelConfig
from exo.worker.engines.image.models import (
create_adapter_for_model,
@@ -71,10 +71,8 @@ class DistributedImageModel:
def from_bound_instance(
cls, bound_instance: BoundInstance
) -> "DistributedImageModel":
model_card = bound_instance.bound_shard.model_card
model_id = model_card.model_id
model_id = bound_instance.bound_shard.model_card.model_id
model_path = build_model_path(model_id)
quantize = model_card.quantization
shard_metadata = bound_instance.bound_shard
if not isinstance(shard_metadata, PipelineShardMetadata):
@@ -95,7 +93,6 @@ class DistributedImageModel:
local_path=model_path,
shard_metadata=shard_metadata,
group=group,
quantize=quantize,
)
def get_steps_for_quality(self, quality: Literal["low", "medium", "high"]) -> int:
@@ -143,7 +140,6 @@ class DistributedImageModel:
width=width,
image_path=image_path,
model_config=self._adapter.model.model_config, # pyright: ignore[reportAny]
guidance=guidance_override if guidance_override is not None else 4.0,
)
num_sync_steps = self._config.get_num_sync_steps(steps)

View File

@@ -33,7 +33,6 @@ _ADAPTER_REGISTRY: dict[str, AdapterFactory] = {
# Config registry: maps model ID patterns to configs
_CONFIG_REGISTRY: dict[str, ImageModelConfig] = {
"flux.1-schnell": FLUX_SCHNELL_CONFIG,
"flux.1-krea-dev": FLUX_DEV_CONFIG, # Must come before "flux.1-dev" for pattern matching
"flux.1-dev": FLUX_DEV_CONFIG,
"qwen-image-edit": QWEN_IMAGE_EDIT_CONFIG, # Must come before "qwen-image" for pattern matching
"qwen-image": QWEN_IMAGE_CONFIG,

View File

@@ -1,39 +1,74 @@
# type: ignore
# TODO: Fix this file, including types!
from copy import deepcopy
from typing import Callable
from typing import Any, cast
import mlx.core as mx
from mlx_lm import stream_generate
from mlx_lm.models.cache import _BaseCache, trim_prompt_cache
from mlx_lm.models.cache import trim_prompt_cache
from mlx_lm.tokenizer_utils import TokenizerWrapper
from exo.shared.types.mlx import KVCacheType
from exo.worker.engines.mlx import Model
from exo.worker.engines.mlx.constants import KEEP_KV_SIZE, KV_BITS, KV_GROUP_SIZE
from exo.worker.engines.mlx.utils_mlx import make_kv_cache
from exo.worker.runner.bootstrap import logger
# Fraction of device memory above which LRU eviction kicks in
_MEMORY_PRESSURE_THRESHOLD = 0.85
class KVPrefixCache:
def __init__(self):
# Only one prefix cache per runner.
self.prompts: list[mx.array] = [] # mx array of tokens (ints)
self.caches: list[list[_BaseCache]] = []
self.caches: list[KVCacheType] = []
self._last_used: list[int] = [] # monotonic counter of last access per entry
self._access_counter: int = 0
def clear(self):
"""Clear all cached prompts and caches."""
self.prompts.clear()
self.caches.clear()
self._last_used.clear()
def add_kv_cache(
self, tokenizer: TokenizerWrapper, prompt: str, cache: list[_BaseCache]
self, tokenizer: TokenizerWrapper, prompt: str, cache: KVCacheType
):
tokenized_prompt = self.encode_prompt(tokenizer, prompt)
"""Add a new cache entry. Evicts LRU entries if memory is high."""
self._evict_if_needed()
tokenized_prompt = encode_prompt(tokenizer, prompt)
self.prompts.append(tokenized_prompt)
self.caches.append(deepcopy(cache))
self._access_counter += 1
self._last_used.append(self._access_counter)
logger.info(f"KV cache added: {len(tokenized_prompt)} tokens")
def update_kv_cache(
self,
index: int,
tokenizer: TokenizerWrapper,
prompt: str,
cache: KVCacheType,
):
"""Update an existing cache entry in-place."""
tokenized_prompt = encode_prompt(tokenizer, prompt)
self.prompts[index] = tokenized_prompt
self.caches[index] = deepcopy(cache)
self._access_counter += 1
self._last_used[index] = self._access_counter
logger.info(f"KV cache updated (index {index}): {len(tokenized_prompt)} tokens")
def get_kv_cache(
self,
model: Model,
tokenizer: TokenizerWrapper,
sampler: Callable[[mx.array], mx.array],
prompt: str,
) -> list[_BaseCache]:
tokenized_prompt = self.encode_prompt(tokenizer, prompt)
) -> tuple[KVCacheType, mx.array, int | None]:
"""Get KV cache for prompt, returning remaining tokens to prefill.
Returns:
Tuple of (cache, remaining_tokens, matched_index) where:
- cache: KV cache to use for generation
- remaining_tokens: tokens that still need prefilling
- matched_index: index of the matched entry (None if no match)
"""
tokenized_prompt = encode_prompt(tokenizer, prompt)
max_length = len(tokenized_prompt)
best_snapshot_index, best_snapshot_length = None, 0
@@ -42,63 +77,102 @@ class KVPrefixCache:
length = _get_prefix_length(tokenized_prompt, cached_prompt)
if length == max_length:
return self.caches[i]
# Exact match - cached prompt starts with our entire prompt
# Trim cache to prompt length - 1, return last token for stream_generate
prompt_cache = deepcopy(self.caches[i])
cached_length = _cache_length(self.caches[i])
tokens_to_trim = cached_length - (max_length - 1)
if tokens_to_trim > 0:
trim_prompt_cache(cast(list[Any], prompt_cache), tokens_to_trim)
self._access_counter += 1
self._last_used[i] = self._access_counter
logger.info(f"KV cache exact match: {max_length} tokens (instant)")
return prompt_cache, tokenized_prompt[-1:], i
if length > best_snapshot_length:
best_snapshot_index, best_snapshot_length = i, length
if best_snapshot_index is not None:
prompt_cache = deepcopy(self.caches[best_snapshot_index])
trim_prompt_cache(prompt_cache, max_length - best_snapshot_length)
tokenized_prompt = tokenized_prompt[best_snapshot_index:]
else:
prompt_cache = make_kv_cache(
model,
# max_kv_size=MAX_KV_SIZE,
# keep=KEEP_KV_SIZE
new_tokens = max_length - best_snapshot_length
logger.info(
f"KV cache prefix match: {best_snapshot_length}/{max_length} tokens "
f"(reusing {best_snapshot_length}, need to prefill {new_tokens})"
)
prefill(model, tokenizer, sampler, tokenized_prompt, prompt_cache)
prompt_cache = deepcopy(self.caches[best_snapshot_index])
return prompt_cache
# Trim removes tokens from the end, so we trim (cached_length - prefix_length) to keep the prefix
cached_length = _cache_length(self.caches[best_snapshot_index])
tokens_to_trim = cached_length - best_snapshot_length
if tokens_to_trim > 0:
trim_prompt_cache(cast(list[Any], prompt_cache), tokens_to_trim)
def encode_prompt(self, tokenizer: TokenizerWrapper, prompt: str) -> mx.array:
add_special_tokens = tokenizer.bos_token is None or not prompt.startswith(
tokenizer.bos_token
)
tokenized_prompt = tokenizer.encode(
prompt, add_special_tokens=add_special_tokens
)
return mx.array(tokenized_prompt)
self._access_counter += 1
self._last_used[best_snapshot_index] = self._access_counter
remaining_tokens = tokenized_prompt[best_snapshot_length:]
return prompt_cache, remaining_tokens, best_snapshot_index
else:
prompt_cache = make_kv_cache(model)
if len(self.prompts) == 0:
logger.info(f"KV cache empty, need to prefill {max_length} tokens")
else:
logger.info(
f"KV cache no prefix match, need to prefill {max_length} tokens"
)
return prompt_cache, tokenized_prompt, None
def _evict_if_needed(self):
"""Evict least recently used entries while memory pressure is high."""
if len(self.caches) == 0:
return
active: int = mx.metal.get_active_memory()
limit = int(mx.metal.device_info()["max_recommended_working_set_size"])
if active < limit * _MEMORY_PRESSURE_THRESHOLD:
return
# Evict LRU entries until below threshold or only one entry left
while len(self.caches) > 0:
lru_index = self._last_used.index(min(self._last_used))
evicted_tokens = len(self.prompts[lru_index])
self.prompts.pop(lru_index)
self.caches.pop(lru_index)
self._last_used.pop(lru_index)
logger.info(
f"KV cache evicted LRU entry ({evicted_tokens} tokens) due to memory pressure"
)
active = mx.metal.get_active_memory()
if active < limit * _MEMORY_PRESSURE_THRESHOLD:
break
def encode_prompt(tokenizer: TokenizerWrapper, prompt: str) -> mx.array:
"""Encode a prompt string to token array.
For chat-templated prompts (which have their own structure markers like
<|im_user|>, <|im_middle|>, etc.), we should NOT add BOS/EOS tokens as
that would corrupt the prompt structure.
"""
# Chat templates define their own structure - don't add BOS/EOS
tokenized_prompt = tokenizer.encode(prompt, add_special_tokens=False)
return mx.array(tokenized_prompt)
def _cache_length(cache: KVCacheType) -> int:
"""Get the number of tokens in a KV cache."""
# Use .offset attribute which all cache types have (len() not implemented in older QuantizedKVCache)
return max(c.offset for c in cache) # type: ignore
def _get_prefix_length(prompt: mx.array, cached_prompt: mx.array) -> int:
n = min(int(prompt.shape[0]), int(cached_prompt.shape[0]), KEEP_KV_SIZE)
"""Find the length of the common prefix between two token arrays."""
n = min(int(prompt.shape[0]), int(cached_prompt.shape[0]))
if n == 0:
return 0
equal = (prompt[:n] == cached_prompt[:n]).astype(mx.int32)
equal = mx.equal(prompt[:n], cached_prompt[:n]).astype(mx.int32)
prefix_mask = mx.cumprod(equal) # stays 1 until first mismatch, then 0 forever
return int(mx.sum(prefix_mask).item())
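# Worked example (illustrative): prompt [7, 8, 9, 5] vs cached_prompt [7, 8, 1, 5]
# gives equal = [1, 1, 0, 1], cumprod = [1, 1, 0, 0], sum = 2,
# so the common prefix is 2 tokens.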
def prefill(
model: Model,
tokenizer: TokenizerWrapper,
sampler: Callable[[mx.array], mx.array],
prompt: mx.array,
cache: list[_BaseCache],
) -> None:
for _ in stream_generate(
model=model,
tokenizer=tokenizer,
prompt=prompt,
max_tokens=0,
sampler=sampler,
prompt_cache=cache,
prefill_step_size=2048,
kv_group_size=KV_GROUP_SIZE,
kv_bits=KV_BITS,
):
pass

View File

@@ -4,7 +4,7 @@
KV_GROUP_SIZE: int | None = 32
KV_BITS: int | None = None
ATTENTION_KV_BITS: int | None = 4
MAX_TOKENS: int = 8192
MAX_TOKENS: int = 32168
MAX_KV_SIZE: int | None = 3200
KEEP_KV_SIZE: int | None = 1600
QUANTIZE_MODEL_MODE: str | None = "affine"

View File

@@ -1,12 +1,12 @@
import time
from typing import Any, Callable, Generator, cast, get_args
import mlx.core as mx
from mlx_lm.generate import stream_generate
from mlx_lm.models.cache import KVCache
from mlx_lm.models.cache import trim_prompt_cache
from mlx_lm.sample_utils import make_sampler
from mlx_lm.tokenizer_utils import TokenizerWrapper
# from exo.engines.mlx.cache import KVPrefixCache
from exo.shared.types.api import (
BenchChatCompletionTaskParams,
ChatCompletionMessage,
@@ -14,11 +14,13 @@ from exo.shared.types.api import (
GenerationStats,
)
from exo.shared.types.memory import Memory
from exo.shared.types.mlx import KVCacheType
from exo.shared.types.tasks import ChatCompletionTaskParams
from exo.shared.types.worker.runner_response import (
GenerationResponse,
)
from exo.worker.engines.mlx import Model
from exo.worker.engines.mlx.cache import KVPrefixCache, encode_prompt
from exo.worker.engines.mlx.constants import KV_BITS, KV_GROUP_SIZE, MAX_TOKENS
from exo.worker.engines.mlx.utils_mlx import (
apply_chat_template,
@@ -29,20 +31,62 @@ from exo.worker.runner.bootstrap import logger
generation_stream = mx.new_stream(mx.default_device())
_MIN_PREFIX_HIT_TO_UPDATE = 1000
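# Hits that reuse at least this many prefix tokens update the matched cache
# entry in place after generation; smaller hits append a new entry instead
# (see the end of mlx_generate below).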
def maybe_quantize_kv_cache(
prompt_cache: list[KVCache | Any],
quantized_kv_start: int,
kv_group_size: int,
kv_bits: int | None,
) -> None:
if kv_bits is None:
return
for e, c in enumerate(prompt_cache):
if (
hasattr(c, "to_quantized") and c.offset >= quantized_kv_start # type: ignore
):
prompt_cache[e] = c.to_quantized(group_size=kv_group_size, bits=kv_bits)
def prefill(
model: Model,
tokenizer: TokenizerWrapper,
sampler: Callable[[mx.array], mx.array],
prompt_tokens: mx.array,
cache: KVCacheType,
) -> float:
"""Prefill the KV cache with prompt tokens.
This runs the model over the prompt tokens to populate the cache,
then trims off the extra generated token.
Returns:
tokens_per_sec
"""
num_tokens = len(prompt_tokens)
if num_tokens == 0:
return 0.0
logger.debug(f"Prefilling {num_tokens} tokens...")
start_time = time.perf_counter()
def progress_callback(processed: int, total: int) -> None:
elapsed = time.perf_counter() - start_time
tok_per_sec = processed / elapsed if elapsed > 0 else 0
logger.debug(
f"Prefill progress: {processed}/{total} tokens ({tok_per_sec:.1f} tok/s)"
)
# Use max_tokens=1 because max_tokens=0 does not work.
# We just throw away the generated token - we only care about filling the cache
for _ in stream_generate(
model=model,
tokenizer=tokenizer,
prompt=prompt_tokens,
max_tokens=1,
sampler=sampler,
prompt_cache=cache,
prefill_step_size=2048,
kv_group_size=KV_GROUP_SIZE,
kv_bits=KV_BITS,
prompt_progress_callback=progress_callback,
):
break # Stop after first iteration - cache is now filled
trim_prompt_cache(cast(list[Any], cache), 1)
elapsed = time.perf_counter() - start_time
tokens_per_sec = num_tokens / elapsed if elapsed > 0 else 0.0
logger.debug(
f"Prefill complete: {num_tokens} tokens in {elapsed:.2f}s "
f"({tokens_per_sec:.1f} tok/s)"
)
return tokens_per_sec
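# The split here: mlx_generate (below) prefills everything except the final
# prompt token, then hands only that token to stream_generate. A minimal
# sketch of the calling pattern (sketch only; sampler, make_kv_cache and
# encode_prompt as used elsewhere in this diff):
#
#     tokens = encode_prompt(tokenizer, prompt)   # full prompt as mx.array
#     caches = make_kv_cache(model=model)         # fresh per-layer KV caches
#     prefill(model, tokenizer, sampler, tokens[:-1], caches)
#     for out in stream_generate(
#         model=model,
#         tokenizer=tokenizer,
#         prompt=tokens[-1:],  # decoding resumes from the last prompt token
#         max_tokens=64,
#         sampler=sampler,
#         prompt_cache=caches,
#     ):
#         ...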
def warmup_inference(
@@ -120,6 +164,7 @@ def mlx_generate(
tokenizer: TokenizerWrapper,
task: ChatCompletionTaskParams,
prompt: str,
kv_prefix_cache: KVPrefixCache | None = None,
) -> Generator[GenerationResponse]:
# Ensure that generation stats only contains peak memory for this generation
mx.reset_peak_memory()
@@ -131,7 +176,22 @@ def mlx_generate(
if task.seed is not None:
mx.random.seed(task.seed)
caches = make_kv_cache(model=model)
# Do not use the prefix cache when benchmarking.
if is_bench:
kv_prefix_cache = None
# Use prefix cache if available, otherwise create fresh cache
prefix_hit_length = 0
matched_index: int | None = None
if kv_prefix_cache is None:
caches = make_kv_cache(model=model)
prompt_tokens = encode_prompt(tokenizer, prompt)
else:
caches, prompt_tokens, matched_index = kv_prefix_cache.get_kv_cache(
model, tokenizer, prompt
)
all_prompt_tokens = encode_prompt(tokenizer, prompt)
prefix_hit_length = len(all_prompt_tokens) - len(prompt_tokens)
logits_processors: list[Callable[[mx.array, mx.array], mx.array]] = []
if is_bench:
@@ -144,11 +204,19 @@ def mlx_generate(
top_p=task.top_p if task.top_p is not None else 1.0,
)
# Prefill cache with all tokens except the last one
prefill_tps = prefill(model, tokenizer, sampler, prompt_tokens[:-1], caches)
# stream_generate starts from the last token
last_token = prompt_tokens[-1:]
max_tokens = task.max_tokens or MAX_TOKENS
generated_text_parts: list[str] = []
generation_start_time = time.perf_counter()
for out in stream_generate(
model=model,
tokenizer=tokenizer,
prompt=prompt,
prompt=last_token,
max_tokens=max_tokens,
sampler=sampler,
logits_processors=logits_processors,
@@ -158,12 +226,13 @@ def mlx_generate(
kv_group_size=KV_GROUP_SIZE,
kv_bits=KV_BITS,
):
generated_text_parts.append(out.text)
logger.info(out.text)
stats: GenerationStats | None = None
if out.finish_reason is not None:
stats = GenerationStats(
prompt_tps=float(out.prompt_tps),
prompt_tps=float(prefill_tps or out.prompt_tps),
generation_tps=float(out.generation_tps),
prompt_tokens=int(out.prompt_tokens),
generation_tokens=int(out.generation_tokens),
@@ -185,6 +254,28 @@ def mlx_generate(
)
if out.finish_reason is not None:
# Log generation stats
generation_elapsed = time.perf_counter() - generation_start_time
generated_tokens = len(generated_text_parts)
generation_tps = (
generated_tokens / generation_elapsed if generation_elapsed > 0 else 0.0
)
logger.debug(
f"Generation complete: prefill {prompt_tokens} tokens @ "
f"{prefill_tps:.1f} tok/s, generated {generated_tokens} tokens @ "
f"{generation_tps:.1f} tok/s"
)
if kv_prefix_cache is not None:
full_prompt = prompt + "".join(generated_text_parts)
if (
matched_index is not None
and prefix_hit_length >= _MIN_PREFIX_HIT_TO_UPDATE
):
kv_prefix_cache.update_kv_cache(
matched_index, tokenizer, full_prompt, caches
)
else:
kv_prefix_cache.add_kv_cache(tokenizer, full_prompt, caches)
break
# TODO: Do we want an mx_barrier?

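cache.py itself is not part of this excerpt, so the interface mlx_generate relies on above is worth restating. A sketch inferred from the call sites (signatures assumed, not confirmed by this diff):

class KVPrefixCache:
    # Returns (caches, remaining_tokens, matched_index). On a hit, caches is
    # a copy of the stored cache trimmed to the shared prefix and
    # remaining_tokens is the unseen suffix (just the last token on an exact
    # match); on a miss, a fresh cache plus the full prompt tokens.
    def get_kv_cache(
        self, model: Model, tokenizer: TokenizerWrapper, prompt: str
    ) -> tuple[KVCacheType, mx.array, int | None]: ...

    # Store a new entry for this prompt (plus generated text).
    def add_kv_cache(
        self, tokenizer: TokenizerWrapper, prompt: str, caches: KVCacheType
    ) -> None: ...

    # Replace an existing entry in place after a sufficiently long prefix hit.
    def update_kv_cache(
        self, index: int, tokenizer: TokenizerWrapper, prompt: str,
        caches: KVCacheType,
    ) -> None: ...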
View File

@@ -41,7 +41,6 @@ import mlx.nn as nn
from mlx_lm.utils import load_model
from pydantic import RootModel
from exo.download.download_utils import build_model_path
from exo.shared.types.api import ChatCompletionMessageText
from exo.shared.types.common import Host
from exo.shared.types.memory import Memory
@@ -56,6 +55,7 @@ from exo.shared.types.worker.shards import (
ShardMetadata,
TensorShardMetadata,
)
from exo.worker.download.download_utils import build_model_path
from exo.worker.engines.mlx import Model
from exo.worker.engines.mlx.auto_parallel import (
TimeoutCallback,

View File

@@ -1,9 +1,8 @@
from datetime import datetime, timezone
from random import random
from typing import Iterator
import anyio
from anyio import CancelScope, create_task_group, fail_after
from anyio import CancelScope, create_task_group, current_time, fail_after
from anyio.abc import TaskGroup
from loguru import logger
@@ -11,12 +10,7 @@ from exo.routing.connection_message import ConnectionMessage, ConnectionMessageT
from exo.shared.apply import apply
from exo.shared.models.model_cards import ModelId
from exo.shared.types.api import ImageEditsInternalParams
from exo.shared.types.commands import (
ForwarderCommand,
ForwarderDownloadCommand,
RequestEventLog,
StartDownload,
)
from exo.shared.types.commands import ForwarderCommand, RequestEventLog
from exo.shared.types.common import CommandId, NodeId, SessionId
from exo.shared.types.events import (
Event,
@@ -24,6 +18,7 @@ from exo.shared.types.events import (
ForwarderEvent,
IndexedEvent,
InputChunkReceived,
NodeDownloadProgress,
NodeGatheredInfo,
TaskCreated,
TaskStatusUpdated,
@@ -41,12 +36,23 @@ from exo.shared.types.tasks import (
TaskStatus,
)
from exo.shared.types.topology import Connection, SocketConnection
from exo.shared.types.worker.downloads import (
DownloadCompleted,
DownloadFailed,
DownloadOngoing,
DownloadPending,
DownloadProgress,
)
from exo.shared.types.worker.runners import RunnerId
from exo.shared.types.worker.shards import ShardMetadata
from exo.utils.channels import Receiver, Sender, channel
from exo.utils.event_buffer import OrderedBuffer
from exo.utils.info_gatherer.info_gatherer import GatheredInfo, InfoGatherer
from exo.utils.info_gatherer.net_profile import check_reachable
from exo.utils.keyed_backoff import KeyedBackoff
from exo.worker.download.download_utils import (
map_repo_download_progress_to_download_progress_data,
)
from exo.worker.download.shard_downloader import RepoDownloadProgress, ShardDownloader
from exo.worker.plan import plan
from exo.worker.runner.runner_supervisor import RunnerSupervisor
@@ -56,6 +62,7 @@ class Worker:
self,
node_id: NodeId,
session_id: SessionId,
shard_downloader: ShardDownloader,
*,
connection_message_receiver: Receiver[ConnectionMessage],
global_event_receiver: Receiver[ForwarderEvent],
@@ -63,22 +70,23 @@ class Worker:
# This is for requesting updates. It doesn't need to be a general command sender right now,
# but I think it's the correct way to be thinking about commands
command_sender: Sender[ForwarderCommand],
download_command_sender: Sender[ForwarderDownloadCommand],
event_index_counter: Iterator[int],
):
self.node_id: NodeId = node_id
self.session_id: SessionId = session_id
self.shard_downloader: ShardDownloader = shard_downloader
self._pending_downloads: dict[RunnerId, ShardMetadata] = {}
self.global_event_receiver = global_event_receiver
self.local_event_sender = local_event_sender
self.event_index_counter = event_index_counter
self.local_event_index = 0
self.command_sender = command_sender
self.download_command_sender = download_command_sender
self.connection_message_receiver = connection_message_receiver
self.event_buffer = OrderedBuffer[Event]()
self.out_for_delivery: dict[EventId, ForwarderEvent] = {}
self.state: State = State()
self.download_status: dict[ModelId, DownloadProgress] = {}
self.runners: dict[RunnerId, RunnerSupervisor] = {}
self._tg: TaskGroup = create_task_group()
@@ -93,8 +101,6 @@ class Worker:
self.input_chunk_buffer: dict[CommandId, dict[int, str]] = {}
self.input_chunk_counts: dict[CommandId, int] = {}
self._download_backoff: KeyedBackoff[ModelId] = KeyedBackoff(base=0.5, cap=10.0)
async def run(self):
logger.info("Starting Worker")
@@ -105,6 +111,7 @@ class Worker:
tg.start_soon(info_gatherer.run)
tg.start_soon(self._forward_info, info_recv)
tg.start_soon(self.plan_step)
tg.start_soon(self._emit_existing_download_progress)
tg.start_soon(self._connection_message_event_writer)
tg.start_soon(self._resend_out_for_delivery)
tg.start_soon(self._event_applier)
@@ -114,7 +121,6 @@ class Worker:
# Actual shutdown code - waits for all tasks to complete before executing.
self.local_event_sender.close()
self.command_sender.close()
self.download_command_sender.close()
for runner in self.runners.values():
runner.shutdown()
@@ -173,9 +179,11 @@ class Worker:
async def plan_step(self):
while True:
await anyio.sleep(0.1)
# 3. based on the updated state, we plan & execute an operation.
task: Task | None = plan(
self.node_id,
self.runners,
self.download_status,
self.state.downloads,
self.state.instances,
self.state.runners,
@@ -199,26 +207,42 @@ class Worker:
)
)
case DownloadModel(shard_metadata=shard):
model_id = shard.model_card.model_id
if not self._download_backoff.should_proceed(model_id):
continue
self._download_backoff.record_attempt(model_id)
await self.download_command_sender.send(
ForwarderDownloadCommand(
origin=self.node_id,
command=StartDownload(
target_node_id=self.node_id,
shard_metadata=shard,
),
if shard.model_card.model_id not in self.download_status:
progress = DownloadPending(
shard_metadata=shard, node_id=self.node_id
)
self.download_status[shard.model_card.model_id] = progress
await self.event_sender.send(
NodeDownloadProgress(download_progress=progress)
)
initial_progress = (
await self.shard_downloader.get_shard_download_status_for_shard(
shard
)
)
await self.event_sender.send(
TaskStatusUpdated(
task_id=task.task_id, task_status=TaskStatus.Running
if initial_progress.status == "complete":
progress = DownloadCompleted(
shard_metadata=shard,
node_id=self.node_id,
total_bytes=initial_progress.total_bytes,
)
)
self.download_status[shard.model_card.model_id] = progress
await self.event_sender.send(
NodeDownloadProgress(download_progress=progress)
)
await self.event_sender.send(
TaskStatusUpdated(
task_id=task.task_id,
task_status=TaskStatus.Complete,
)
)
else:
await self.event_sender.send(
TaskStatusUpdated(
task_id=task.task_id, task_status=TaskStatus.Running
)
)
self._handle_shard_download_process(task, initial_progress)
case Shutdown(runner_id=runner_id):
try:
with fail_after(3):
@@ -363,17 +387,104 @@ class Worker:
self._tg.start_soon(runner.run)
return runner
def _handle_shard_download_process(
self,
task: DownloadModel,
initial_progress: RepoDownloadProgress,
):
"""Manages the shard download process with progress tracking."""
status = DownloadOngoing(
node_id=self.node_id,
shard_metadata=task.shard_metadata,
download_progress=map_repo_download_progress_to_download_progress_data(
initial_progress
),
)
self.download_status[task.shard_metadata.model_card.model_id] = status
self.event_sender.send_nowait(NodeDownloadProgress(download_progress=status))
last_progress_time = 0.0
throttle_interval_secs = 1.0
async def download_progress_callback(
shard: ShardMetadata, progress: RepoDownloadProgress
) -> None:
nonlocal last_progress_time
if progress.status == "complete":
status = DownloadCompleted(
shard_metadata=shard,
node_id=self.node_id,
total_bytes=progress.total_bytes,
)
self.download_status[shard.model_card.model_id] = status
await self.event_sender.send(
NodeDownloadProgress(download_progress=status)
)
await self.event_sender.send(
TaskStatusUpdated(
task_id=task.task_id, task_status=TaskStatus.Complete
)
)
elif (
progress.status == "in_progress"
and current_time() - last_progress_time > throttle_interval_secs
):
status = DownloadOngoing(
node_id=self.node_id,
shard_metadata=shard,
download_progress=map_repo_download_progress_to_download_progress_data(
progress
),
)
self.download_status[shard.model_card.model_id] = status
await self.event_sender.send(
NodeDownloadProgress(download_progress=status)
)
last_progress_time = current_time()
self.shard_downloader.on_progress(download_progress_callback)
async def download_with_error_handling() -> None:
try:
await self.shard_downloader.ensure_shard(task.shard_metadata)
except Exception as e:
error_message = str(e)
logger.error(
f"Download failed for {task.shard_metadata.model_card.model_id}: {error_message}"
)
failed_status = DownloadFailed(
node_id=self.node_id,
shard_metadata=task.shard_metadata,
error_message=error_message,
)
self.download_status[task.shard_metadata.model_card.model_id] = (
failed_status
)
await self.event_sender.send(
NodeDownloadProgress(download_progress=failed_status)
)
await self.event_sender.send(
TaskStatusUpdated(
task_id=task.task_id, task_status=TaskStatus.Failed
)
)
self._tg.start_soon(download_with_error_handling)
async def _forward_events(self) -> None:
with self.event_receiver as events:
async for event in events:
idx = next(self.event_index_counter)
fe = ForwarderEvent(
origin_idx=idx,
origin_idx=self.local_event_index,
origin=self.node_id,
session=self.session_id,
event=event,
)
logger.debug(f"Worker published event {idx}: {str(event)[:100]}")
logger.debug(
f"Worker published event {self.local_event_index}: {str(event)[:100]}"
)
self.local_event_index += 1
await self.local_event_sender.send(fe)
self.out_for_delivery[event.event_id] = fe
@@ -421,3 +532,42 @@ class Worker:
await self.event_sender.send(TopologyEdgeDeleted(conn=conn))
await anyio.sleep(10)
async def _emit_existing_download_progress(self) -> None:
try:
while True:
logger.debug("Fetching and emitting existing download progress...")
async for (
_,
progress,
) in self.shard_downloader.get_shard_download_status():
if progress.status == "complete":
status = DownloadCompleted(
node_id=self.node_id,
shard_metadata=progress.shard,
total_bytes=progress.total_bytes,
)
elif progress.status in ["in_progress", "not_started"]:
if progress.downloaded_bytes_this_session.in_bytes == 0:
status = DownloadPending(
node_id=self.node_id, shard_metadata=progress.shard
)
else:
status = DownloadOngoing(
node_id=self.node_id,
shard_metadata=progress.shard,
download_progress=map_repo_download_progress_to_download_progress_data(
progress
),
)
else:
continue
self.download_status[progress.shard.model_card.model_id] = status
await self.event_sender.send(
NodeDownloadProgress(download_progress=status)
)
logger.debug("Done emitting existing download progress.")
await anyio.sleep(5 * 60) # 5 minutes
except Exception as e:
logger.error(f"Error emitting existing download progress: {e}")

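The callback above forwards completion events immediately but throttles in_progress updates to one per second via anyio's current_time(). The same pattern in isolation (hypothetical standalone helper, not part of this PR):

from anyio import current_time

class ProgressThrottle:
    """Allow at most one update per interval; call inside a running event loop."""

    def __init__(self, interval_secs: float = 1.0) -> None:
        self.interval_secs = interval_secs
        self._last = 0.0

    def ready(self) -> bool:
        now = current_time()
        if now - self._last > self.interval_secs:
            self._last = now
            return True
        return False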
View File

@@ -2,6 +2,7 @@
from collections.abc import Mapping, Sequence
from exo.shared.models.model_cards import ModelId
from exo.shared.types.common import CommandId, NodeId
from exo.shared.types.tasks import (
ChatCompletion,
@@ -44,6 +45,9 @@ def plan(
node_id: NodeId,
# Runners is expected to be FRESH and so should not come from state
runners: Mapping[RunnerId, RunnerSupervisor],
# DL_status is expected to be FRESH and so should not come from state
download_status: Mapping[ModelId, DownloadProgress],
# gdls is not expected to be fresh
global_download_status: Mapping[NodeId, Sequence[DownloadProgress]],
instances: Mapping[InstanceId, Instance],
all_runners: Mapping[RunnerId, RunnerStatus], # all global
@@ -55,7 +59,7 @@ def plan(
return (
_kill_runner(runners, all_runners, instances)
or _create_runner(node_id, runners, instances)
or _model_needs_download(node_id, runners, global_download_status)
or _model_needs_download(runners, download_status)
or _init_distributed_backend(runners, all_runners)
or _load_model(runners, all_runners, global_download_status)
or _ready_to_warmup(runners, all_runners)
@@ -111,15 +115,9 @@ def _create_runner(
def _model_needs_download(
node_id: NodeId,
runners: Mapping[RunnerId, RunnerSupervisor],
global_download_status: Mapping[NodeId, Sequence[DownloadProgress]],
download_status: Mapping[ModelId, DownloadProgress],
) -> DownloadModel | None:
local_downloads = global_download_status.get(node_id, [])
download_status = {
dp.shard_metadata.model_card.model_id: dp for dp in local_downloads
}
for runner in runners.values():
model_id = runner.bound_instance.bound_shard.model_card.model_id
if isinstance(runner.status, RunnerIdle) and (

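The hunk above is cut off mid-condition, but the shape of the change is visible in the signature: _model_needs_download now receives a fresh ModelId-keyed mapping instead of rebuilding one from the global per-node lists on every call. An illustration of the resulting lookup (assumed, since the body is truncated here):

# Before: scan global_download_status[node_id] and build a dict each call.
# After: the caller passes fresh local state, so the check is a direct lookup.
progress = download_status.get(model_id)
already_downloaded = isinstance(progress, DownloadCompleted)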
View File

@@ -70,6 +70,7 @@ from exo.worker.engines.image import (
warmup_image_generator,
)
from exo.worker.engines.mlx import Model
from exo.worker.engines.mlx.cache import KVPrefixCache
from exo.worker.engines.mlx.generator.generate import mlx_generate, warmup_inference
from exo.worker.engines.mlx.utils_mlx import (
apply_chat_template,
@@ -103,6 +104,7 @@ def main(
model: Model | DistributedImageModel | None = None
tokenizer = None
group = None
kv_prefix_cache: KVPrefixCache | None = None
current_status: RunnerStatus = RunnerIdle()
logger.info("runner created")
@@ -171,6 +173,9 @@ def main(
f"Unknown model task(s): {shard_metadata.model_card.tasks}"
)
if ModelTask.TextGeneration in shard_metadata.model_card.tasks:
kv_prefix_cache = KVPrefixCache()
current_status = RunnerLoaded()
logger.info("runner loaded")
case StartWarmup() if isinstance(current_status, RunnerLoaded):
@@ -238,6 +243,7 @@ def main(
tokenizer=tokenizer,
task=task_params,
prompt=prompt,
kv_prefix_cache=kv_prefix_cache,
)
# GPT-OSS specific parsing to match other model formats.

View File

@@ -0,0 +1,537 @@
# type: ignore
import time
from typing import cast
from unittest.mock import patch
import mlx.core as mx
import pytest
from mlx_lm.models.cache import KVCache
from mlx_lm.sample_utils import make_sampler
from exo.shared.types.api import ChatCompletionMessage
from exo.shared.types.common import ModelId
from exo.shared.types.tasks import ChatCompletionTaskParams
from exo.worker.engines.mlx import Model
from exo.worker.engines.mlx.cache import (
KVPrefixCache,
_cache_length,
_get_prefix_length,
encode_prompt,
)
from exo.worker.engines.mlx.generator.generate import mlx_generate, prefill
from exo.worker.engines.mlx.utils_mlx import apply_chat_template, make_kv_cache
from exo.worker.tests.unittests.test_mlx.conftest import (
DEFAULT_GPT_OSS_CONFIG,
DEFAULT_GPT_OSS_MODEL_ID,
)
def _check_model_exists() -> bool:
return DEFAULT_GPT_OSS_CONFIG.model_path.exists()
class TestGetPrefixLength:
def test_identical_arrays(self):
a = mx.array([1, 2, 3, 4, 5])
b = mx.array([1, 2, 3, 4, 5])
assert _get_prefix_length(a, b) == 5
def test_no_common_prefix(self):
a = mx.array([1, 2, 3])
b = mx.array([4, 5, 6])
assert _get_prefix_length(a, b) == 0
def test_partial_prefix(self):
a = mx.array([1, 2, 3, 4, 5])
b = mx.array([1, 2, 3, 7, 8])
assert _get_prefix_length(a, b) == 3
def test_prompt_longer_than_cached(self):
a = mx.array([1, 2, 3, 4, 5])
b = mx.array([1, 2, 3])
assert _get_prefix_length(a, b) == 3
def test_cached_longer_than_prompt(self):
a = mx.array([1, 2, 3])
b = mx.array([1, 2, 3, 4, 5])
assert _get_prefix_length(a, b) == 3
def test_single_token_match(self):
a = mx.array([1, 2, 3])
b = mx.array([1, 5, 6])
assert _get_prefix_length(a, b) == 1
def test_empty_prompt(self):
a = mx.array([]).astype(mx.int32)
b = mx.array([1, 2, 3])
assert _get_prefix_length(a, b) == 0
def test_empty_cached(self):
a = mx.array([1, 2, 3])
b = mx.array([]).astype(mx.int32)
assert _get_prefix_length(a, b) == 0
def test_both_empty(self):
a = mx.array([]).astype(mx.int32)
b = mx.array([]).astype(mx.int32)
assert _get_prefix_length(a, b) == 0
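# These cases pin down _get_prefix_length as the count of leading equal
# tokens, capped at the shorter array. A sketch of one mx-native way to
# compute it (illustrative only; the real cache.py implementation may differ):
#
#     def _get_prefix_length(a: mx.array, b: mx.array) -> int:
#         n = min(len(a), len(b))
#         if n == 0:
#             return 0
#         eq = mx.equal(a[:n], b[:n]).astype(mx.int32)
#         # cumprod stays 1 while tokens match and is 0 from the first
#         # mismatch on, so its sum counts the shared prefix
#         return int(mx.cumprod(eq).sum().item())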
class TestKVPrefix:
def test_starts_empty(self):
cache = KVPrefixCache()
assert len(cache.prompts) == 0
assert len(cache.caches) == 0
def test_clear_empties_cache(self):
cache = KVPrefixCache()
cache.prompts.append(mx.array([1, 2, 3]))
cache.caches.append([KVCache()])
cache.clear()
assert len(cache.prompts) == 0
assert len(cache.caches) == 0
def test_clear_on_empty_cache(self):
cache = KVPrefixCache()
cache.clear()
assert len(cache.prompts) == 0
def _load_gpt_oss() -> tuple[Model, object]:
from mlx_lm.utils import load_model
from exo.worker.engines.mlx.utils_mlx import load_tokenizer_for_model_id
model_path = DEFAULT_GPT_OSS_CONFIG.model_path
model_id = ModelId(DEFAULT_GPT_OSS_MODEL_ID)
model, _ = load_model(model_path, lazy=False)
tokenizer = load_tokenizer_for_model_id(model_id, model_path)
return cast(Model, model), tokenizer
@pytest.mark.slow
@pytest.mark.skipif(
not _check_model_exists(),
reason=f"GPT-OSS model not found at {DEFAULT_GPT_OSS_CONFIG.model_path}",
)
class TestKVPrefixCacheWithModel:
@pytest.fixture(scope="class")
def model_and_tokenizer(self):
model, tokenizer = _load_gpt_oss()
return model, tokenizer
def test_prefill_populates_cache(self, model_and_tokenizer):
model, tokenizer = model_and_tokenizer
task = ChatCompletionTaskParams(
model=DEFAULT_GPT_OSS_MODEL_ID,
messages=[ChatCompletionMessage(role="user", content="Hello!!")],
max_tokens=1,
)
prompt = apply_chat_template(tokenizer, task)
tokens = encode_prompt(tokenizer, prompt)
cache = make_kv_cache(model)
prefill(model, tokenizer, make_sampler(0.0), tokens, cache)
# Cache should now hold the prompt tokens
assert _cache_length(cache) == len(tokens)
def test_add_and_get_exact_match(self, model_and_tokenizer):
model, tokenizer = model_and_tokenizer
task = ChatCompletionTaskParams(
model=DEFAULT_GPT_OSS_MODEL_ID,
messages=[ChatCompletionMessage(role="user", content="Test exact")],
max_tokens=1,
)
prompt = apply_chat_template(tokenizer, task)
tokens = encode_prompt(tokenizer, prompt)
cache = make_kv_cache(model)
prefill(model, tokenizer, make_sampler(0.0), tokens, cache)
kv_prefix_cache = KVPrefixCache()
kv_prefix_cache.add_kv_cache(tokenizer, prompt, cache)
assert len(kv_prefix_cache.prompts) == 1
stored_length = _cache_length(kv_prefix_cache.caches[0])
assert stored_length > 0
# Retrieve with same prompt: exact match
result_cache, remaining_tokens, matched_index = kv_prefix_cache.get_kv_cache(
model, tokenizer, prompt
)
assert matched_index == 0
# Exact match returns only last token
assert len(remaining_tokens) == 1
assert mx.array_equal(remaining_tokens, tokens[-1:])
def test_add_and_get_prefix_match(self, model_and_tokenizer):
"""get_kv_cache with a longer prompt sharing prefix should return partial match."""
model, tokenizer = model_and_tokenizer
short_task = ChatCompletionTaskParams(
model=DEFAULT_GPT_OSS_MODEL_ID,
messages=[ChatCompletionMessage(role="user", content="Hi")],
max_tokens=1,
)
short_prompt = apply_chat_template(tokenizer, short_task)
short_tokens = encode_prompt(tokenizer, short_prompt)
cache = make_kv_cache(model)
prefill(model, tokenizer, make_sampler(0.0), short_tokens, cache)
kv_prefix_cache = KVPrefixCache()
kv_prefix_cache.add_kv_cache(tokenizer, short_prompt, cache)
# Query with longer prompt that shares the chat template prefix
long_task = ChatCompletionTaskParams(
model=DEFAULT_GPT_OSS_MODEL_ID,
messages=[
ChatCompletionMessage(role="user", content="Hi there, how are you?")
],
max_tokens=1,
)
long_prompt = apply_chat_template(tokenizer, long_task)
long_tokens = encode_prompt(tokenizer, long_prompt)
# The prompts share a prefix (chat template preamble + "Hi")
expected_prefix = _get_prefix_length(long_tokens, short_tokens)
assert expected_prefix > 0, (
"Prompts should share a prefix from the chat template"
)
result_cache, remaining_tokens, matched_index = kv_prefix_cache.get_kv_cache(
model, tokenizer, long_prompt
)
assert matched_index == 0
# remaining_tokens should be the suffix after the shared prefix
assert len(remaining_tokens) == len(long_tokens) - expected_prefix
assert mx.array_equal(remaining_tokens, long_tokens[expected_prefix:])
def test_stored_cache_not_mutated_after_get_and_generation(
self, model_and_tokenizer
):
"""Getting a cache and then mutating it (as generation does) must not corrupt stored cache."""
model, tokenizer = model_and_tokenizer
task = ChatCompletionTaskParams(
model=DEFAULT_GPT_OSS_MODEL_ID,
messages=[ChatCompletionMessage(role="user", content="Mutation test")],
max_tokens=1,
)
prompt = apply_chat_template(tokenizer, task)
tokens = encode_prompt(tokenizer, prompt)
cache = make_kv_cache(model)
prefill(model, tokenizer, make_sampler(0.0), tokens, cache)
kv_prefix_cache = KVPrefixCache()
kv_prefix_cache.add_kv_cache(tokenizer, prompt, cache)
stored_length = _cache_length(kv_prefix_cache.caches[0])
# Get cache and mutate it (simulating what generation does)
result_cache, _, matched_index = kv_prefix_cache.get_kv_cache(
model, tokenizer, prompt
)
assert matched_index == 0
# Simulate generation: feed many additional tokens through the cache
head_dim = result_cache[0].keys.shape[-1]
num_heads = result_cache[0].keys.shape[1]
extra_keys = mx.random.normal((1, num_heads, 50, head_dim))
extra_values = mx.random.normal((1, num_heads, 50, head_dim))
for layer_cache in result_cache:
layer_cache.update_and_fetch(extra_keys, extra_values)
mx.eval([c.keys for c in result_cache])
# Stored cache must be unchanged
assert _cache_length(kv_prefix_cache.caches[0]) == stored_length
def test_stored_cache_survives_repeated_get_mutate_cycles(
self, model_and_tokenizer
):
"""Multiple get+mutate cycles (like repeated user requests) must not corrupt cache."""
model, tokenizer = model_and_tokenizer
task = ChatCompletionTaskParams(
model=DEFAULT_GPT_OSS_MODEL_ID,
messages=[ChatCompletionMessage(role="user", content="Repeat test")],
max_tokens=1,
)
prompt = apply_chat_template(tokenizer, task)
tokens = encode_prompt(tokenizer, prompt)
cache = make_kv_cache(model)
prefill(model, tokenizer, make_sampler(0.0), tokens, cache)
kv_prefix_cache = KVPrefixCache()
kv_prefix_cache.add_kv_cache(tokenizer, prompt, cache)
stored_length = _cache_length(kv_prefix_cache.caches[0])
for i in range(3):
result_cache, _, _ = kv_prefix_cache.get_kv_cache(model, tokenizer, prompt)
head_dim = result_cache[0].keys.shape[-1]
num_heads = result_cache[0].keys.shape[1]
extra = mx.random.normal((1, num_heads, 30, head_dim))
for layer_cache in result_cache:
layer_cache.update_and_fetch(extra, extra)
mx.eval([c.keys for c in result_cache])
assert _cache_length(kv_prefix_cache.caches[0]) == stored_length, (
f"Failed on loop {i}"
)
def test_mlx_generate_populates_cache(self, model_and_tokenizer):
"""mlx_generate should save the cache after generation completes."""
model, tokenizer = model_and_tokenizer
kv_prefix_cache = KVPrefixCache()
task = ChatCompletionTaskParams(
model=DEFAULT_GPT_OSS_MODEL_ID,
messages=[ChatCompletionMessage(role="user", content="Hello")],
max_tokens=5,
)
prompt = apply_chat_template(tokenizer, task)
prompt_tokens = encode_prompt(tokenizer, prompt)
# Consume the entire generator so the cache-saving code after yield runs
generated_tokens = 0
for _response in mlx_generate(
model=model,
tokenizer=tokenizer,
task=task,
prompt=prompt,
kv_prefix_cache=kv_prefix_cache,
):
generated_tokens += 1
assert len(kv_prefix_cache.prompts) == 1
assert len(kv_prefix_cache.caches) == 1
# Cache should contain prompt + generated tokens
expected_length = len(prompt_tokens) + generated_tokens
assert _cache_length(kv_prefix_cache.caches[0]) == expected_length
def test_mlx_generate_second_call_gets_prefix_hit(self, model_and_tokenizer):
"""Second mlx_generate call with same prompt should get a prefix hit from stored cache."""
model, tokenizer = model_and_tokenizer
kv_prefix_cache = KVPrefixCache()
task = ChatCompletionTaskParams(
model=DEFAULT_GPT_OSS_MODEL_ID,
messages=[ChatCompletionMessage(role="user", content="Reuse test")],
max_tokens=5,
)
prompt = apply_chat_template(tokenizer, task)
prompt_tokens = encode_prompt(tokenizer, prompt)
# First generation populates cache
for _response in mlx_generate(
model=model,
tokenizer=tokenizer,
task=task,
prompt=prompt,
kv_prefix_cache=kv_prefix_cache,
):
pass
assert len(kv_prefix_cache.prompts) == 1
# Second call should find a prefix match (the stored cache contains
# prompt + generated tokens, which shares the prompt prefix)
result_cache, remaining_tokens, matched_index = kv_prefix_cache.get_kv_cache(
model, tokenizer, prompt
)
# The stored cache is longer than the prompt (it includes generated tokens),
# so this is a prefix match where our prompt is fully contained
assert matched_index == 0
# Exact match: remaining_tokens is just the last token
assert len(remaining_tokens) == 1
assert mx.array_equal(remaining_tokens, prompt_tokens[-1:])
def test_mlx_generate_long_prompt_updates_cache_in_place(self, model_and_tokenizer):
"""With a prompt > 1000 tokens, second generation should update the cache entry in-place."""
model, tokenizer = model_and_tokenizer
kv_prefix_cache = KVPrefixCache()
# Build a long user message (> 1000 tokens) to exceed _MIN_PREFIX_HIT_TO_UPDATE
base_text = "The quick brown fox jumps over the lazy dog. "
base_tokens = tokenizer.encode(base_text)
repeats = (1200 // len(base_tokens)) + 2
long_content = base_text * repeats
task1 = ChatCompletionTaskParams(
model=DEFAULT_GPT_OSS_MODEL_ID,
messages=[ChatCompletionMessage(role="user", content=long_content)],
max_tokens=5,
)
prompt1 = apply_chat_template(tokenizer, task1)
prompt1_tokens = encode_prompt(tokenizer, prompt1)
assert len(prompt1_tokens) > 1000, (
"Prompt must exceed _MIN_PREFIX_HIT_TO_UPDATE"
)
# First generation populates the cache (must prefill all tokens)
t0 = time.perf_counter()
for _response in mlx_generate(
model=model,
tokenizer=tokenizer,
task=task1,
prompt=prompt1,
kv_prefix_cache=kv_prefix_cache,
):
pass
first_gen_time = time.perf_counter() - t0
assert len(kv_prefix_cache.prompts) == 1
first_cache_length = _cache_length(kv_prefix_cache.caches[0])
# Second generation: same long prompt + extra content (simulating multi-turn)
task2 = ChatCompletionTaskParams(
model=DEFAULT_GPT_OSS_MODEL_ID,
messages=[
ChatCompletionMessage(role="user", content=long_content),
ChatCompletionMessage(role="assistant", content="Sure, I can help."),
ChatCompletionMessage(role="user", content="Tell me more."),
],
max_tokens=5,
)
prompt2 = apply_chat_template(tokenizer, task2)
prompt2_tokens = encode_prompt(tokenizer, prompt2)
# Verify the prompts share a long prefix
prefix_len = _get_prefix_length(prompt2_tokens, prompt1_tokens)
assert prefix_len > 1000, "Prompts must share > 1000 token prefix"
# Second generation should reuse the cached prefix (only prefill new tokens)
t0 = time.perf_counter()
for _response in mlx_generate(
model=model,
tokenizer=tokenizer,
task=task2,
prompt=prompt2,
kv_prefix_cache=kv_prefix_cache,
):
pass
second_gen_time = time.perf_counter() - t0
# Second generation should be significantly faster due to the prefix cache
# hit; note this timing assertion may be flaky on slow machines.
assert second_gen_time < first_gen_time * 0.5, (
f"Expected prefix cache speedup: "
f"first={first_gen_time:.2f}s, second={second_gen_time:.2f}s"
)
# With prefix_hit > 1000, should update in-place (not add a second entry)
assert len(kv_prefix_cache.prompts) == 1
# Updated cache should be longer (prompt2 + generated > prompt1 + generated)
updated_cache_length = _cache_length(kv_prefix_cache.caches[0])
assert updated_cache_length > first_cache_length
def test_mlx_generate_stored_cache_not_mutated(self, model_and_tokenizer):
"""After mlx_generate saves a cache, a second generation must not corrupt the stored copy."""
model, tokenizer = model_and_tokenizer
kv_prefix_cache = KVPrefixCache()
task = ChatCompletionTaskParams(
model=DEFAULT_GPT_OSS_MODEL_ID,
messages=[ChatCompletionMessage(role="user", content="Immutable test")],
max_tokens=5,
)
prompt = apply_chat_template(tokenizer, task)
# First generation populates cache
for _response in mlx_generate(
model=model,
tokenizer=tokenizer,
task=task,
prompt=prompt,
kv_prefix_cache=kv_prefix_cache,
):
pass
first_cache_length = _cache_length(kv_prefix_cache.caches[0])
# Second generation gets the cache and mutates it during generation
for _response in mlx_generate(
model=model,
tokenizer=tokenizer,
task=task,
prompt=prompt,
kv_prefix_cache=kv_prefix_cache,
):
pass
# The first stored cache must not have been mutated by the second generation
assert _cache_length(kv_prefix_cache.caches[0]) == first_cache_length
def test_evicts_lru_entry_under_memory_pressure(self, model_and_tokenizer):
"""Under memory pressure, adding a new cache entry evicts the least recently used one."""
model, tokenizer = model_and_tokenizer
kv_prefix_cache = KVPrefixCache()
# Add three cache entries with different prompts
prompts = ["First entry", "Second entry", "Third entry"]
for i, content in enumerate(prompts):
task = ChatCompletionTaskParams(
model=DEFAULT_GPT_OSS_MODEL_ID,
messages=[ChatCompletionMessage(role="user", content=content)],
max_tokens=1,
)
prompt = apply_chat_template(tokenizer, task)
tokens = encode_prompt(tokenizer, prompt)
cache = make_kv_cache(model)
prefill(model, tokenizer, make_sampler(0.0), tokens, cache)
kv_prefix_cache.add_kv_cache(tokenizer, prompt, cache)
# Stagger _last_used so LRU order is deterministic
kv_prefix_cache._last_used[i] = float(i)
assert len(kv_prefix_cache.prompts) == 3
# Access the third entry to make it most recently used
kv_prefix_cache._last_used[2] = 100.0
# Entry 0 (_last_used=0.0) is LRU, entry 1 (_last_used=1.0) is next
# Simulate memory pressure: active memory exceeds threshold
fake_limit = 1000
fake_active = int(fake_limit * 0.90) # Above _MEMORY_PRESSURE_THRESHOLD (0.85)
with (
patch(
"exo.worker.engines.mlx.cache.mx.metal.get_active_memory",
return_value=fake_active,
),
patch(
"exo.worker.engines.mlx.cache.mx.metal.device_info",
return_value={"max_recommended_working_set_size": fake_limit},
),
):
# Trigger eviction by adding a new entry
task = ChatCompletionTaskParams(
model=DEFAULT_GPT_OSS_MODEL_ID,
messages=[ChatCompletionMessage(role="user", content="New entry")],
max_tokens=1,
)
prompt = apply_chat_template(tokenizer, task)
tokens = encode_prompt(tokenizer, prompt)
cache = make_kv_cache(model)
prefill(model, tokenizer, make_sampler(0.0), tokens, cache)
kv_prefix_cache.add_kv_cache(tokenizer, prompt, cache)
# LRU entries should have been evicted (entries 0, 1, 2 in order of _last_used)
# Since fake_active stays above threshold after each eviction (we don't change it),
# all old entries get evicted, leaving only the newly added one
assert len(kv_prefix_cache.prompts) == 1
# The surviving entry should be the newly added one
new_tokens = encode_prompt(tokenizer, prompt)
assert _get_prefix_length(kv_prefix_cache.prompts[0], new_tokens) == len(
new_tokens
)

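The eviction test encodes the policy without showing cache.py: when active Metal memory exceeds roughly 85% of the recommended working-set size, add_kv_cache evicts entries in ascending _last_used order until the pressure clears (or only the new entry remains). A sketch of that loop under those assumptions (re-keying of _last_used after a pop is elided):

_MEMORY_PRESSURE_THRESHOLD = 0.85  # fraction of max_recommended_working_set_size

def _evict_lru_until_under_pressure(cache: KVPrefixCache) -> None:
    limit = mx.metal.device_info()["max_recommended_working_set_size"]
    while (
        cache.prompts
        and mx.metal.get_active_memory() > _MEMORY_PRESSURE_THRESHOLD * limit
    ):
        # Evict the entry with the smallest _last_used timestamp first
        lru = min(range(len(cache.prompts)), key=lambda i: cache._last_used.get(i, 0.0))
        cache.prompts.pop(lru)
        cache.caches.pop(lru)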
View File

@@ -11,12 +11,12 @@ from pathlib import Path
import pytest
from exo.download.download_utils import (
from exo.shared.models.model_cards import MODEL_CARDS, ModelCard, ModelId
from exo.worker.download.download_utils import (
download_file_with_retry,
ensure_models_dir,
fetch_file_list_with_cache,
)
from exo.shared.models.model_cards import MODEL_CARDS, ModelCard, ModelId
from exo.worker.engines.mlx.utils_mlx import (
get_eos_token_ids_for_model,
load_tokenizer_for_model_id,

View File

@@ -1,5 +1,5 @@
import exo.worker.plan as plan_mod
from exo.shared.types.common import NodeId
from exo.shared.types.common import ModelId, NodeId
from exo.shared.types.memory import Memory
from exo.shared.types.tasks import LoadModel
from exo.shared.types.worker.downloads import DownloadCompleted, DownloadProgress
@@ -45,9 +45,13 @@ def test_plan_requests_download_when_waiting_and_shard_not_downloaded():
instances = {INSTANCE_1_ID: instance}
all_runners = {RUNNER_1_ID: RunnerIdle()}
# No entry for this shard -> should trigger DownloadModel
download_status: dict[ModelId, DownloadProgress] = {}
result = plan_mod.plan(
node_id=NODE_A,
runners=runners, # type: ignore
download_status=download_status,
global_download_status={NODE_A: []},
instances=instances,
all_runners=all_runners,
@@ -88,6 +92,14 @@ def test_plan_loads_model_when_all_shards_downloaded_and_waiting():
RUNNER_2_ID: RunnerConnected(),
}
# Local node has already marked its shard as downloaded (not actually used by _load_model)
local_download_status = {
MODEL_A_ID: DownloadCompleted(
shard_metadata=shard1, node_id=NODE_A, total_bytes=Memory()
)
}
# Global view has completed downloads for both nodes
global_download_status = {
NODE_A: [
DownloadCompleted(
@@ -104,6 +116,7 @@ def test_plan_loads_model_when_all_shards_downloaded_and_waiting():
result = plan_mod.plan(
node_id=NODE_A,
runners=runners, # type: ignore
download_status=local_download_status,
global_download_status=global_download_status,
instances=instances,
all_runners=all_runners,
@@ -135,19 +148,23 @@ def test_plan_does_not_request_download_when_shard_already_downloaded():
instances = {INSTANCE_1_ID: instance}
all_runners = {RUNNER_1_ID: RunnerIdle()}
# Global state shows shard is downloaded for NODE_A
# Local status claims the shard is downloaded already
local_download_status = {
MODEL_A_ID: DownloadCompleted(
shard_metadata=shard, node_id=NODE_A, total_bytes=Memory()
)
}
# Global view hasn't caught up yet (no completed shards recorded for NODE_A)
global_download_status: dict[NodeId, list[DownloadProgress]] = {
NODE_A: [
DownloadCompleted(
shard_metadata=shard, node_id=NODE_A, total_bytes=Memory()
)
],
NODE_A: [],
NODE_B: [],
}
result = plan_mod.plan(
node_id=NODE_A,
runners=runners, # type: ignore
download_status=local_download_status,
global_download_status=global_download_status,
instances=instances,
all_runners=all_runners,
@@ -185,6 +202,12 @@ def test_plan_does_not_load_model_until_all_shards_downloaded_globally():
RUNNER_2_ID: RunnerConnected(),
}
# Only NODE_A's shard is recorded as downloaded globally
local_download_status = {
MODEL_A_ID: DownloadCompleted(
shard_metadata=shard1, node_id=NODE_A, total_bytes=Memory()
)
}
global_download_status = {
NODE_A: [
DownloadCompleted(
@@ -197,6 +220,7 @@ def test_plan_does_not_load_model_until_all_shards_downloaded_globally():
result = plan_mod.plan(
node_id=NODE_A,
runners=runners, # type: ignore
download_status=local_download_status,
global_download_status=global_download_status,
instances=instances,
all_runners=all_runners,
@@ -221,6 +245,7 @@ def test_plan_does_not_load_model_until_all_shards_downloaded_globally():
result = plan_mod.plan(
node_id=NODE_A,
runners=runners, # type: ignore
download_status=local_download_status,
global_download_status=global_download_status,
instances=instances,
all_runners=all_runners,

View File

@@ -47,7 +47,8 @@ def test_plan_kills_runner_when_instance_missing():
result = plan_mod.plan(
node_id=NODE_A,
runners=runners, # type: ignore[arg-type]
runners=runners, # type: ignore
download_status={},
global_download_status={NODE_A: []},
instances=instances,
all_runners=all_runners,
@@ -86,7 +87,8 @@ def test_plan_kills_runner_when_sibling_failed():
result = plan_mod.plan(
node_id=NODE_A,
runners=runners, # type: ignore[arg-type]
runners=runners, # type: ignore
download_status={},
global_download_status={NODE_A: []},
instances=instances,
all_runners=all_runners,
@@ -118,6 +120,7 @@ def test_plan_creates_runner_when_missing_for_node():
result = plan_mod.plan(
node_id=NODE_A,
runners=runners,
download_status={},
global_download_status={NODE_A: []},
instances=instances,
all_runners=all_runners,
@@ -155,7 +158,8 @@ def test_plan_does_not_create_runner_when_supervisor_already_present():
result = plan_mod.plan(
node_id=NODE_A,
runners=runners, # type: ignore[arg-type]
runners=runners, # type: ignore
download_status={},
global_download_status={NODE_A: []},
instances=instances,
all_runners=all_runners,
@@ -185,6 +189,7 @@ def test_plan_does_not_create_runner_for_unassigned_node():
result = plan_mod.plan(
node_id=NODE_A,
runners=runners, # type: ignore
download_status={},
global_download_status={NODE_A: []},
instances=instances,
all_runners=all_runners,

View File

@@ -65,6 +65,7 @@ def test_plan_forwards_pending_chat_completion_when_runner_ready():
result = plan_mod.plan(
node_id=NODE_A,
runners=runners, # type: ignore
download_status={},
global_download_status={NODE_A: []},
instances=instances,
all_runners=all_runners,
@@ -112,6 +113,7 @@ def test_plan_does_not_forward_chat_completion_if_any_runner_not_ready():
result = plan_mod.plan(
node_id=NODE_A,
runners=runners, # type: ignore
download_status={},
global_download_status={NODE_A: [], NODE_B: []},
instances=instances,
all_runners=all_runners,
@@ -156,6 +158,7 @@ def test_plan_does_not_forward_tasks_for_other_instances():
result = plan_mod.plan(
node_id=NODE_A,
runners=runners, # type: ignore
download_status={},
global_download_status={NODE_A: []},
instances=instances,
all_runners=all_runners,
@@ -218,6 +221,7 @@ def test_plan_ignores_non_pending_or_non_chat_tasks():
result = plan_mod.plan(
node_id=NODE_A,
runners=runners, # type: ignore
download_status={},
global_download_status={NODE_A: [], NODE_B: []},
instances=instances,
all_runners=all_runners,
@@ -257,6 +261,7 @@ def test_plan_returns_none_when_nothing_to_do():
result = plan_mod.plan(
node_id=NODE_A,
runners=runners, # type: ignore
download_status={},
global_download_status={NODE_A: [], NODE_B: []},
instances=instances,
all_runners=all_runners,

View File

@@ -57,6 +57,7 @@ def test_plan_starts_warmup_for_accepting_rank_when_all_loaded_or_warming():
result = plan_mod.plan(
node_id=NODE_B,
runners=runners, # type: ignore
download_status={},
global_download_status={NODE_A: []},
instances=instances,
all_runners=all_runners,
@@ -98,6 +99,7 @@ def test_plan_starts_warmup_for_rank_zero_after_others_warming():
result = plan_mod.plan(
node_id=NODE_A,
runners=runners, # type: ignore
download_status={},
global_download_status={NODE_A: []},
instances=instances,
all_runners=all_runners,
@@ -138,6 +140,7 @@ def test_plan_does_not_start_warmup_for_non_zero_rank_until_all_loaded_or_warmin
result = plan_mod.plan(
node_id=NODE_B,
runners=runners, # type: ignore
download_status={},
global_download_status={NODE_A: [], NODE_B: []},
instances=instances,
all_runners=all_runners,
@@ -182,6 +185,7 @@ def test_plan_does_not_start_warmup_for_rank_zero_until_others_warming():
result = plan_mod.plan(
node_id=NODE_A,
runners=runners, # type: ignore
download_status={},
global_download_status={NODE_A: []},
instances=instances,
all_runners=all_runners,
@@ -198,6 +202,7 @@ def test_plan_does_not_start_warmup_for_rank_zero_until_others_warming():
result = plan_mod.plan(
node_id=NODE_A,
runners=runners, # type: ignore
download_status={},
global_download_status={NODE_A: []},
instances=instances,
all_runners=all_runners,
@@ -241,6 +246,7 @@ def test_plan_starts_warmup_for_connecting_rank_after_others_warming():
result = plan_mod.plan(
node_id=NODE_B,
runners=runners, # type: ignore
download_status={},
global_download_status={NODE_B: []},
instances=instances,
all_runners=all_runners,
@@ -283,6 +289,7 @@ def test_plan_does_not_start_warmup_for_accepting_rank_until_all_loaded_or_warmi
result = plan_mod.plan(
node_id=NODE_A,
runners=runners, # type: ignore
download_status={},
global_download_status={NODE_A: [], NODE_B: []},
instances=instances,
all_runners=all_runners,
@@ -324,6 +331,7 @@ def test_plan_does_not_start_warmup_for_connecting_rank_until_others_warming():
result = plan_mod.plan(
node_id=NODE_A,
runners=runners, # type: ignore
download_status={},
global_download_status={NODE_A: [], NODE_B: []},
instances=instances,
all_runners=all_runners,

View File

@@ -11,10 +11,6 @@ from hypercorn.asyncio import serve # pyright: ignore[reportUnknownVariableType
from loguru import logger
from pydantic import BaseModel
from exo.download.impl_shard_downloader import (
build_full_shard,
exo_shard_downloader,
)
from exo.shared.logging import InterceptLogger, logger_setup
from exo.shared.models.model_cards import MODEL_CARDS, ModelId
from exo.shared.types.api import ChatCompletionMessage, ChatCompletionTaskParams
@@ -40,6 +36,10 @@ from exo.shared.types.worker.runners import RunnerId, ShardAssignments
from exo.shared.types.worker.shards import PipelineShardMetadata, TensorShardMetadata
from exo.utils.channels import MpReceiver, MpSender, channel, mp_channel
from exo.utils.info_gatherer.info_gatherer import GatheredInfo, InfoGatherer
from exo.worker.download.impl_shard_downloader import (
build_full_shard,
exo_shard_downloader,
)
from exo.worker.runner.bootstrap import entrypoint

uv.lock (generated)
View File

@@ -412,7 +412,7 @@ requires-dist = [
{ name = "huggingface-hub", specifier = ">=0.33.4" },
{ name = "hypercorn", specifier = ">=0.18.0" },
{ name = "loguru", specifier = ">=0.7.3" },
{ name = "mflux", specifier = "==0.15.4" },
{ name = "mflux", specifier = ">=0.14.2" },
{ name = "mlx", marker = "sys_platform == 'darwin'", specifier = "==0.30.3" },
{ name = "mlx", extras = ["cpu"], marker = "sys_platform == 'linux'", specifier = "==0.30.3" },
{ name = "mlx-lm", git = "https://github.com/AlexCheema/mlx-lm.git?rev=fix-transformers-5.0.0rc2" },
@@ -458,6 +458,16 @@ dev = [
{ name = "pytest-asyncio", specifier = ">=1.0.0" },
]
[[package]]
name = "tomlkit"
version = "0.14.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/c3/af/14b24e41977adb296d6bd1fb59402cf7d60ce364f90c890bd2ec65c43b5a/tomlkit-0.14.0.tar.gz", hash = "sha256:cf00efca415dbd57575befb1f6634c4f42d2d87dbba376128adb42c121b87064", size = 187167 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/b5/11/87d6d29fb5d237229d67973a6c9e06e048f01cf4994dee194ab0ea841814/tomlkit-0.14.0-py3-none-any.whl", hash = "sha256:592064ed85b40fa213469f81ac584f67a4f2992509a7c3ea2d632208623a3680", size = 39310 },
]
[[package]]
name = "fastapi"
version = "0.128.0"
@@ -987,7 +997,7 @@ wheels = [
[[package]]
name = "mflux"
version = "0.15.4"
version = "0.15.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "filelock", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -1013,9 +1023,9 @@ dependencies = [
{ name = "twine", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "urllib3", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/a6/f8/95322db7a865e4df6bad108b1c99aa7fbe211aac3f298f3ad696c2744a39/mflux-0.15.4.tar.gz", hash = "sha256:138e1aedae86e13eafeb8faec017945fcdcca42c3234daabcd81a83c9a202ace", size = 741228, upload-time = "2026-01-20T15:39:26.807Z" }
sdist = { url = "https://files.pythonhosted.org/packages/23/c5/dd12e16714702255d89b7ccc6f217c405a9fdcf2af950a2236892c50a219/mflux-0.15.3.tar.gz", hash = "sha256:e32ea66a81aad4f77eea2415b17c27fc3d9ce662a842565c62871ff570f4ef2f", size = 740701, upload-time = "2026-01-19T22:54:59.066Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/8e/be/81cf4ce2d1933b9b210c028a05ac95e958008c0d43e377a5f2757b7f2d4d/mflux-0.15.4-py3-none-any.whl", hash = "sha256:f04d9b1d7c5cd67880f483ab29fb2097648a25459eef9c5ee6480fad46de5e82", size = 987644, upload-time = "2026-01-20T15:39:24.817Z" },
{ url = "https://files.pythonhosted.org/packages/cf/9f/a673ee12877a0943a4059c51b5beb6cf909c92f25384365cf8beeb475159/mflux-0.15.3-py3-none-any.whl", hash = "sha256:631cfcc038f27e9bd0ff76c25c2bc7373562b8f64cf0ce961fc268a246fa699e", size = 987270, upload-time = "2026-01-19T22:54:57.155Z" },
]
[[package]]
@@ -2217,15 +2227,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/44/6f/7120676b6d73228c96e17f1f794d8ab046fc910d781c8d151120c3f1569e/toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b", size = 16588, upload-time = "2020-11-01T01:40:20.672Z" },
]
[[package]]
name = "tomlkit"
version = "0.14.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/c3/af/14b24e41977adb296d6bd1fb59402cf7d60ce364f90c890bd2ec65c43b5a/tomlkit-0.14.0.tar.gz", hash = "sha256:cf00efca415dbd57575befb1f6634c4f42d2d87dbba376128adb42c121b87064", size = 187167, upload-time = "2026-01-13T01:14:53.304Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/b5/11/87d6d29fb5d237229d67973a6c9e06e048f01cf4994dee194ab0ea841814/tomlkit-0.14.0-py3-none-any.whl", hash = "sha256:592064ed85b40fa213469f81ac584f67a4f2992509a7c3ea2d632208623a3680", size = 39310, upload-time = "2026-01-13T01:14:51.965Z" },
]
[[package]]
name = "torch"
version = "2.9.1"