Compare commits


12 Commits

Author SHA1 Message Date
Alex Cheema
9ce0d4602d Revert "Fix kv prefix cache (#1262)"
This reverts commit cd8c01b7c8.
2026-01-26 14:07:15 -08:00
Alex Cheema
4f24e33d30 Merge branch 'main' into releases/v1.0.65 2026-01-26 14:01:15 -08:00
Alex Cheema
bd4f0bf048 Fix download speed/ETA display for re-downloads (#1294)
## Motivation

After the download verification fix, when files are re-downloaded due to
upstream changes (size mismatch), the download progress displays
correctly (completion %, bytes, file counts), but speed shows 0 B/s and
ETA shows "--" for both overall and per-file progress.

## Changes

- Modified `on_progress_wrapper` in `src/exo/download/download_utils.py`
to detect re-download scenarios
- Added re-download detection: when `curr_bytes < previous_downloaded`,
the file was deleted and download restarted
- On re-download: reset `start_time` to current time and set
`downloaded_this_session = curr_bytes`
- Added two tests to `test_download_verification.py` covering
re-download and continuing download scenarios

## Why It Works

The bug occurred because:
1. `file_progress` is initialized with the OLD local file size (e.g.,
1.5GB)
2. When `_download_file` detects size mismatch, it deletes the file and
starts fresh
3. Progress callback receives small `curr_bytes` (e.g., 8KB) but
compares against old size
4. `downloaded_this_session = 0 + (8KB - 1.5GB) = -1.5GB` (negative!)
5. Negative session bytes → 0 or negative speed → ETA shows "--"

The fix detects when `curr_bytes < previous_downloaded` (indicating
re-download started) and resets tracking to treat it as a fresh
download.
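The arithmetic above can be reproduced in isolation. This is a standalone sketch of the session-byte bookkeeping; the function names are illustrative, not the actual exo API:

```python
def session_bytes_buggy(prev_downloaded: int, prev_session: int, curr_bytes: int) -> int:
  # Old behavior: blindly accumulate the delta since the last callback.
  return prev_session + (curr_bytes - prev_downloaded)

def session_bytes_fixed(prev_downloaded: int, prev_session: int, curr_bytes: int) -> int:
  # Fixed behavior: curr_bytes dropping below the previously recorded downloaded
  # total means the file was deleted and the download restarted.
  if curr_bytes < prev_downloaded:
    return curr_bytes  # treat as a fresh download
  return prev_session + (curr_bytes - prev_downloaded)

old_size = 1_500_000_000  # stale 1.5 GB local file
curr = 8_192              # 8 KB into the re-download

buggy = session_bytes_buggy(old_size, 0, curr)  # negative -> speed 0, ETA "--"
fixed = session_bytes_fixed(old_size, 0, curr)  # equals curr_bytes -> sane speed/ETA
```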

## Test Plan

### Manual Testing
- Download a model, modify a file to change its size, restart exo,
verify speed/ETA display correctly during re-download

### Automated Testing
- Added `TestProgressResetOnRedownload` class with two tests:
- `test_progress_resets_correctly_on_redownload`: Verifies progress
resets correctly when re-download starts
- `test_progress_accumulates_on_continuing_download`: Verifies
continuing downloads still accumulate correctly
- All 11 download tests pass
- Type checking (basedpyright): 0 errors
- Linting (ruff): All checks passed

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-26 21:56:58 +00:00
rltakashige
cd8c01b7c8 Fix kv prefix cache (#1262)
## Motivation

OpenCode sends very large prompts, most of which are repeated on the
next call.

## Changes

Add prefix caching, reducing average time in prefill (in testing) from
40 seconds to 4. This massively improves user experience.

It also evicts entries from this prefix cache in an LRU-style manner.

## Why It Works

We no longer prefill repeated prompt prefixes; instead we reuse the KV cache
stored in memory. A future update could back the prefix cache with disk
storage to make it larger.

## Test Plan

### Manual Testing
Tested speedup on OpenCode

### Automated Testing
Added a lot of tests

---------

Co-authored-by: David Hind <davehind@yahoo.co.uk>
2026-01-26 20:13:58 +00:00
Alex Cheema
a9ee2204ef Merge remote-tracking branch 'origin/main' into releases/v1.0.65 2026-01-26 11:59:41 -08:00
rltakashige
59e991ce15 Only ignore message if actually empty (#1292)
2026-01-26 19:33:23 +00:00
ciaranbor
ffba340e70 Ciaran/image quantization (#1272)
## Motivation

Enable users to select and use quantized variants (8-bit, 4-bit) of
image models

## Changes

Use exolabs HF org for image models

## Why It Works

Quantized versions have been uploaded to exolabs HF org

## Test Plan

Loaded and ran different quantized variants. Confirmed lower memory
usage and different outputs for the same seed. Verified chat completion
still works.
2026-01-26 19:25:05 +00:00
Alex Cheema
054b296a51 Merge remote-tracking branch 'origin/main' into releases/v1.0.65 2026-01-26 09:59:18 -08:00
rltakashige
9968abe816 Leo/fix basic model shard (#1291)
## Motivation

Some models, on some configurations, hit several issues that left them stuck
on loading.

## Changes

Several of the loading issues were in upstream mlx-lm shard loading for
tensor parallel.
GLM 4.7 Flash now uses GLM 4.7 Lite.
The remaining issues came from MLX memory not being released before calling
`mx.eval(model)`, causing the system to run out of memory.

## Test Plan

### Manual Testing
Done a bunch (thanks @AlexCheema), hopefully exhaustive. 

### Automated Testing
A bunch of automated testing is imminent but not landed yet.

---------

Co-authored-by: Alex Cheema <alexcheema123@gmail.com>
2026-01-26 17:49:09 +00:00
Alex Cheema
0e30b0830f Fix download system for upstream file changes (#1290)
## Motivation

When upstream files change on Hugging Face, exo's download system
doesn't detect the change and downloads get stuck. The only workaround
is deleting `~/.exo/models/` and the cache.

Root causes:
1. Existing files are never re-verified against remote metadata
2. File list cache is never invalidated, causing stale sizes to be used

## Changes

1. **Verify existing files against remote size** (`_download_file`):
Before returning early for existing files, verify the local file size
matches remote. If mismatched, delete and re-download. If network fails
(offline), fall back to trusting local file.

2. **Always try fresh file list first** (`fetch_file_list_with_cache`):
Always attempt to fetch fresh data from Hugging Face. On success, update
the cache. On failure, fall back to cached data if available.

3. **Clear cache on model delete** (`delete_model`): When a model is
deleted, also delete its cache entry to prevent stale metadata.

## Why It Works

- **Online**: Stale local files are detected via size mismatch and
re-downloaded. Fresh file list is always fetched and cache is updated.
- **Offline with cache**: Existing files are trusted. Cached file list
is used as fallback.
- **Offline without cache**: Fails gracefully (can't download without
knowing what files to get).

The size check is O(1) so there's no performance impact. Hash
verification still happens after download completes (existing behavior).

## Test Plan

### Manual Testing
- Download a model, manually modify a local file's content, restart exo,
verify it re-downloads

### Automated Testing
Added 9 new tests in
`src/exo/download/tests/test_download_verification.py`:
- Re-download when file size changes upstream
- Skip download when file size matches
- Offline fallback uses local file
- Fetch fresh file list and update cache
- Fall back to cache when fetch fails
- Error propagates when no cache exists
- Model delete clears cache
- Delete when only cache exists
- Delete nonexistent model

All tests pass: `uv run pytest src/exo/download/tests/ -v`

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-26 09:14:58 -08:00
Alex Cheema
44453c4c8b Remove change-detection checks from info gatherer monitors (#1283)
## Summary
- When a node times out, its info gets cleared from state. The monitor
functions only sent data when something changed, leaving no mechanism to
re-populate this info after a timeout.
- Removes change-detection checks from `_monitor_misc`,
`_monitor_system_profiler_thunderbolt_data`, `_watch_system_info`, and
`_monitor_thunderbolt_bridge_status` so data is sent periodically
regardless of whether it changed.
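The shape of the change can be sketched as follows. This is hypothetical code: the real monitors are long-running loops in the info gatherer, and the `ticks` bound, callback signatures, and names here exist only for demonstration:

```python
import asyncio

async def monitor_old(get_info, send, interval: float, ticks: int) -> None:
  # Before: a change-detection gate. Once a peer times us out and clears our
  # info, an unchanged value is never re-sent, so the peer stays empty.
  last = None
  for _ in range(ticks):
    info = get_info()
    if info != last:
      send(info)
      last = info
    await asyncio.sleep(interval)

async def monitor_new(get_info, send, interval: float, ticks: int) -> None:
  # After: send every tick regardless of change, so cleared state re-populates.
  for _ in range(ticks):
    send(get_info())
    await asyncio.sleep(interval)
```

The cost is redundant sends of unchanged data; the benefit is that state cleared by a timeout converges back within one interval.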

## Test plan
- [ ] Verify type checker passes: `uv run basedpyright`
- [ ] Verify linter passes: `uv run ruff check`
- [ ] Verify tests pass: `uv run pytest`
- [ ] Manually test that node info is re-populated after a timeout by
observing cluster behavior

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-26 12:23:22 +00:00
Jake Hillion
1290e8ed9f dashboard: fix prettier-svelte rebuilding on every file change
The prettier-svelte package was rebuilding whenever any file in the
repository changed because dashboardStubSrc referenced inputs.self
directly. Since inputs.self's store path hash is computed from the
entire repository contents, any file modification invalidated the
derivation.

Added dashboardLockfileSrc using lib.cleanSourceWith to filter
inputs.self to only include package.json and package-lock.json from
the dashboard directory. Updated dashboardStubSrc to reference this
filtered source instead of inputs.self directly.

This ensures prettier-svelte only rebuilds when the lockfiles actually
change, significantly improving build caching for unrelated changes.

Test plan:
- Built prettier-svelte with nix build .#prettier-svelte
- Modified src/exo/main.py and rebuilt - same store path (no rebuild)
- Modified dashboard/package.json and rebuilt - different store path (rebuild triggered)
- Ran nix flake check successfully
2026-01-26 12:02:05 +00:00
10 changed files with 762 additions and 82 deletions


@@ -3,12 +3,28 @@
   perSystem =
     { pkgs, lib, ... }:
     let
+      # Filter source to ONLY include package.json and package-lock.json
+      # This ensures prettier-svelte only rebuilds when lockfiles change
+      dashboardLockfileSrc = lib.cleanSourceWith {
+        src = inputs.self;
+        filter =
+          path: type:
+          let
+            baseName = builtins.baseNameOf path;
+            isDashboardDir = baseName == "dashboard" && type == "directory";
+            isPackageFile =
+              (lib.hasInfix "/dashboard/" path || lib.hasSuffix "/dashboard" (builtins.dirOf path))
+              && (baseName == "package.json" || baseName == "package-lock.json");
+          in
+          isDashboardDir || isPackageFile;
+      };
       # Stub source with lockfiles and minimal files for build to succeed
       # This allows prettier-svelte to avoid rebuilding when dashboard source changes
       dashboardStubSrc = pkgs.runCommand "dashboard-stub-src" { } ''
         mkdir -p $out
-        cp ${inputs.self}/dashboard/package.json $out/
-        cp ${inputs.self}/dashboard/package-lock.json $out/
+        cp ${dashboardLockfileSrc}/dashboard/package.json $out/
+        cp ${dashboardLockfileSrc}/dashboard/package-lock.json $out/
         # Minimal files so vite build succeeds (produces empty output)
         echo '<!DOCTYPE html><html><head></head><body></body></html>' > $out/index.html
         mkdir -p $out/src


@@ -19,7 +19,7 @@ dependencies = [
   "anyio==4.11.0",
   "mlx @ git+https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git; sys_platform == 'darwin'",
   "mlx[cpu]==0.30.3; sys_platform == 'linux'",
-  "mlx-lm @ git+https://github.com/AlexCheema/mlx-lm.git@fix-transformers-5.0.0rc2",
+  "mlx-lm==0.30.5",
   "tiktoken>=0.12.0", # required for kimi k2 tokenizer
   "hypercorn>=0.18.0",
   "openai-harmony>=0.0.8",


@@ -121,11 +121,20 @@ async def ensure_models_dir() -> Path:
 async def delete_model(model_id: ModelId) -> bool:
-  model_dir = await ensure_models_dir() / model_id.normalize()
-  if not await aios.path.exists(model_dir):
-    return False
-  await asyncio.to_thread(shutil.rmtree, model_dir, ignore_errors=False)
-  return True
+  models_dir = await ensure_models_dir()
+  model_dir = models_dir / model_id.normalize()
+  cache_dir = models_dir / "caches" / model_id.normalize()
+  deleted = False
+  if await aios.path.exists(model_dir):
+    await asyncio.to_thread(shutil.rmtree, model_dir, ignore_errors=False)
+    deleted = True
+  # Also clear cache
+  if await aios.path.exists(cache_dir):
+    await asyncio.to_thread(shutil.rmtree, cache_dir, ignore_errors=False)
+  return deleted


 async def seed_models(seed_dir: str | Path):
@@ -151,16 +160,28 @@ async def fetch_file_list_with_cache(
   target_dir = (await ensure_models_dir()) / "caches" / model_id.normalize()
   await aios.makedirs(target_dir, exist_ok=True)
   cache_file = target_dir / f"{model_id.normalize()}--{revision}--file_list.json"
-  if await aios.path.exists(cache_file):
-    async with aiofiles.open(cache_file, "r") as f:
-      return TypeAdapter(list[FileListEntry]).validate_json(await f.read())
-  file_list = await fetch_file_list_with_retry(
-    model_id, revision, recursive=recursive
-  )
-  await aios.makedirs(cache_file.parent, exist_ok=True)
-  async with aiofiles.open(cache_file, "w") as f:
-    await f.write(TypeAdapter(list[FileListEntry]).dump_json(file_list).decode())
-  return file_list
+  # Always try fresh first
+  try:
+    file_list = await fetch_file_list_with_retry(
+      model_id, revision, recursive=recursive
+    )
+    # Update cache with fresh data
+    async with aiofiles.open(cache_file, "w") as f:
+      await f.write(
+        TypeAdapter(list[FileListEntry]).dump_json(file_list).decode()
+      )
+    return file_list
+  except Exception as e:
+    # Fetch failed - try cache fallback
+    if await aios.path.exists(cache_file):
+      logger.warning(
+        f"Failed to fetch file list for {model_id}, using cached data: {e}"
+      )
+      async with aiofiles.open(cache_file, "r") as f:
+        return TypeAdapter(list[FileListEntry]).validate_json(await f.read())
+    # No cache available, propagate the error
+    raise


 async def fetch_file_list_with_retry(
@@ -332,8 +353,28 @@ async def _download_file(
   target_dir: Path,
   on_progress: Callable[[int, int, bool], None] = lambda _, __, ___: None,
 ) -> Path:
-  if await aios.path.exists(target_dir / path):
-    return target_dir / path
+  target_path = target_dir / path
+  if await aios.path.exists(target_path):
+    local_size = (await aios.stat(target_path)).st_size
+    # Try to verify against remote, but allow offline operation
+    try:
+      remote_size, _ = await file_meta(model_id, revision, path)
+      if local_size != remote_size:
+        logger.info(
+          f"File {path} size mismatch (local={local_size}, remote={remote_size}), re-downloading"
+        )
+        await aios.remove(target_path)
+      else:
+        return target_path
+    except Exception as e:
+      # Offline or network error - trust local file
+      logger.debug(
+        f"Could not verify {path} against remote (offline?): {e}, using local file"
+      )
+      return target_path
   await aios.makedirs((target_dir / path).parent, exist_ok=True)
   length, etag = await file_meta(model_id, revision, path)
   remote_hash = etag[:-5] if etag.endswith("-gzip") else etag
@@ -542,17 +583,26 @@ async def download_shard(
   async def on_progress_wrapper(
     file: FileListEntry, curr_bytes: int, total_bytes: int, is_renamed: bool
   ) -> None:
-    start_time = (
-      file_progress[file.path].start_time
-      if file.path in file_progress
-      else time.time()
-    )
-    downloaded_this_session = (
-      file_progress[file.path].downloaded_this_session.in_bytes
-      + (curr_bytes - file_progress[file.path].downloaded.in_bytes)
-      if file.path in file_progress
-      else curr_bytes
-    )
+    previous_progress = file_progress.get(file.path)
+    # Detect re-download: curr_bytes < previous downloaded means file was deleted and restarted
+    is_redownload = (
+      previous_progress is not None
+      and curr_bytes < previous_progress.downloaded.in_bytes
+    )
+    if is_redownload or previous_progress is None:
+      # Fresh download or re-download: reset tracking
+      start_time = time.time()
+      downloaded_this_session = curr_bytes
+    else:
+      # Continuing download: accumulate
+      start_time = previous_progress.start_time
+      downloaded_this_session = (
+        previous_progress.downloaded_this_session.in_bytes
+        + (curr_bytes - previous_progress.downloaded.in_bytes)
+      )
     speed = (
       downloaded_this_session / (time.time() - start_time)
       if time.time() - start_time > 0



@@ -0,0 +1,451 @@
"""Tests for download verification and cache behavior."""

import time
from collections.abc import AsyncIterator
from datetime import timedelta
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch

import aiofiles
import aiofiles.os as aios
import pytest
from pydantic import TypeAdapter

from exo.download.download_utils import (
  delete_model,
  fetch_file_list_with_cache,
)
from exo.shared.types.common import ModelId
from exo.shared.types.memory import Memory
from exo.shared.types.worker.downloads import FileListEntry, RepoFileDownloadProgress


@pytest.fixture
def model_id() -> ModelId:
  return ModelId("test-org/test-model")


@pytest.fixture
async def temp_models_dir(tmp_path: Path) -> AsyncIterator[Path]:
  """Set up a temporary models directory for testing."""
  models_dir = tmp_path / "models"
  await aios.makedirs(models_dir, exist_ok=True)
  with patch("exo.download.download_utils.EXO_MODELS_DIR", models_dir):
    yield models_dir


class TestFileVerification:
  """Tests for file size verification in _download_file."""

  async def test_redownload_when_file_size_changes_upstream(
    self, model_id: ModelId, tmp_path: Path
  ) -> None:
    """Test that files with mismatched sizes are re-downloaded."""
    # Import inside test to allow patching
    from exo.download.download_utils import (
      _download_file,  # pyright: ignore[reportPrivateUsage]
    )

    target_dir = tmp_path / "downloads"
    await aios.makedirs(target_dir, exist_ok=True)

    # Create a local file with wrong size
    local_file = target_dir / "test.safetensors"
    async with aiofiles.open(local_file, "wb") as f:
      await f.write(b"local content")  # 13 bytes

    remote_size = 1000  # Different from local
    remote_hash = "abc123"

    with (
      patch(
        "exo.download.download_utils.file_meta",
        new_callable=AsyncMock,
        return_value=(remote_size, remote_hash),
      ) as mock_file_meta,
      patch(
        "exo.download.download_utils.create_http_session"
      ) as mock_session_factory,
    ):
      # Set up mock HTTP response for re-download
      mock_response = MagicMock()
      mock_response.status = 200
      mock_response.content.read = AsyncMock(  # pyright: ignore[reportAny]
        side_effect=[b"x" * remote_size, b""]
      )
      mock_session = MagicMock()
      mock_session.get.return_value.__aenter__ = AsyncMock(  # pyright: ignore[reportAny]
        return_value=mock_response
      )
      mock_session.get.return_value.__aexit__ = AsyncMock(  # pyright: ignore[reportAny]
        return_value=None
      )
      mock_session_factory.return_value.__aenter__ = AsyncMock(  # pyright: ignore[reportAny]
        return_value=mock_session
      )
      mock_session_factory.return_value.__aexit__ = AsyncMock(  # pyright: ignore[reportAny]
        return_value=None
      )

      # Mock calc_hash to return the expected hash
      with patch(
        "exo.download.download_utils.calc_hash",
        new_callable=AsyncMock,
        return_value=remote_hash,
      ):
        await _download_file(model_id, "main", "test.safetensors", target_dir)

      # file_meta should be called twice: once for verification, once for download
      assert mock_file_meta.call_count == 2

  async def test_skip_download_when_file_size_matches(
    self, model_id: ModelId, tmp_path: Path
  ) -> None:
    """Test that files with matching sizes are not re-downloaded."""
    from exo.download.download_utils import (
      _download_file,  # pyright: ignore[reportPrivateUsage]
    )

    target_dir = tmp_path / "downloads"
    await aios.makedirs(target_dir, exist_ok=True)

    # Create a local file
    local_file = target_dir / "test.safetensors"
    local_content = b"local content"
    async with aiofiles.open(local_file, "wb") as f:
      await f.write(local_content)

    remote_size = len(local_content)  # Same as local
    remote_hash = "abc123"

    with (
      patch(
        "exo.download.download_utils.file_meta",
        new_callable=AsyncMock,
        return_value=(remote_size, remote_hash),
      ) as mock_file_meta,
      patch(
        "exo.download.download_utils.create_http_session"
      ) as mock_session_factory,
    ):
      result = await _download_file(
        model_id, "main", "test.safetensors", target_dir
      )

      # Should return immediately without downloading
      assert result == local_file
      mock_file_meta.assert_called_once()
      mock_session_factory.assert_not_called()

  async def test_offline_fallback_uses_local_file(
    self, model_id: ModelId, tmp_path: Path
  ) -> None:
    """Test that local files are used when network is unavailable."""
    from exo.download.download_utils import (
      _download_file,  # pyright: ignore[reportPrivateUsage]
    )

    target_dir = tmp_path / "downloads"
    await aios.makedirs(target_dir, exist_ok=True)

    # Create a local file
    local_file = target_dir / "test.safetensors"
    async with aiofiles.open(local_file, "wb") as f:
      await f.write(b"local content")

    with (
      patch(
        "exo.download.download_utils.file_meta",
        new_callable=AsyncMock,
        side_effect=Exception("Network error"),
      ),
      patch(
        "exo.download.download_utils.create_http_session"
      ) as mock_session_factory,
    ):
      result = await _download_file(
        model_id, "main", "test.safetensors", target_dir
      )

      # Should return local file without attempting download
      assert result == local_file
      mock_session_factory.assert_not_called()


class TestFileListCache:
  """Tests for file list caching behavior."""

  async def test_fetch_fresh_and_update_cache(
    self, model_id: ModelId, tmp_path: Path
  ) -> None:
    """Test that fresh data is fetched and cache is updated."""
    models_dir = tmp_path / "models"
    file_list = [
      FileListEntry(type="file", path="model.safetensors", size=1000),
      FileListEntry(type="file", path="config.json", size=100),
    ]

    with (
      patch("exo.download.download_utils.EXO_MODELS_DIR", models_dir),
      patch(
        "exo.download.download_utils.fetch_file_list_with_retry",
        new_callable=AsyncMock,
        return_value=file_list,
      ) as mock_fetch,
    ):
      result = await fetch_file_list_with_cache(model_id, "main")

      assert result == file_list
      mock_fetch.assert_called_once()

      # Verify cache was written
      cache_file = (
        models_dir
        / "caches"
        / model_id.normalize()
        / f"{model_id.normalize()}--main--file_list.json"
      )
      assert await aios.path.exists(cache_file)
      async with aiofiles.open(cache_file, "r") as f:
        cached_data = TypeAdapter(list[FileListEntry]).validate_json(
          await f.read()
        )
      assert cached_data == file_list

  async def test_fallback_to_cache_when_fetch_fails(
    self, model_id: ModelId, tmp_path: Path
  ) -> None:
    """Test that cached data is used when fetch fails."""
    models_dir = tmp_path / "models"
    cache_dir = models_dir / "caches" / model_id.normalize()
    await aios.makedirs(cache_dir, exist_ok=True)

    # Create cache file
    cached_file_list = [
      FileListEntry(type="file", path="model.safetensors", size=1000),
    ]
    cache_file = cache_dir / f"{model_id.normalize()}--main--file_list.json"
    async with aiofiles.open(cache_file, "w") as f:
      await f.write(
        TypeAdapter(list[FileListEntry]).dump_json(cached_file_list).decode()
      )

    with (
      patch("exo.download.download_utils.EXO_MODELS_DIR", models_dir),
      patch(
        "exo.download.download_utils.fetch_file_list_with_retry",
        new_callable=AsyncMock,
        side_effect=Exception("Network error"),
      ),
    ):
      result = await fetch_file_list_with_cache(model_id, "main")
      assert result == cached_file_list

  async def test_error_propagates_when_no_cache(
    self, model_id: ModelId, tmp_path: Path
  ) -> None:
    """Test that errors propagate when fetch fails and no cache exists."""
    models_dir = tmp_path / "models"

    with (
      patch("exo.download.download_utils.EXO_MODELS_DIR", models_dir),
      patch(
        "exo.download.download_utils.fetch_file_list_with_retry",
        new_callable=AsyncMock,
        side_effect=Exception("Network error"),
      ),
      pytest.raises(Exception, match="Network error"),
    ):
      await fetch_file_list_with_cache(model_id, "main")


class TestModelDeletion:
  """Tests for model deletion including cache cleanup."""

  async def test_delete_model_clears_cache(
    self, model_id: ModelId, tmp_path: Path
  ) -> None:
    """Test that deleting a model also deletes its cache."""
    models_dir = tmp_path / "models"
    model_dir = models_dir / model_id.normalize()
    cache_dir = models_dir / "caches" / model_id.normalize()

    # Create model and cache directories
    await aios.makedirs(model_dir, exist_ok=True)
    await aios.makedirs(cache_dir, exist_ok=True)

    # Add some files
    async with aiofiles.open(model_dir / "model.safetensors", "w") as f:
      await f.write("model data")
    async with aiofiles.open(cache_dir / "file_list.json", "w") as f:
      await f.write("[]")

    with patch("exo.download.download_utils.EXO_MODELS_DIR", models_dir):
      result = await delete_model(model_id)

      assert result is True
      assert not await aios.path.exists(model_dir)
      assert not await aios.path.exists(cache_dir)

  async def test_delete_model_only_cache_exists(
    self, model_id: ModelId, tmp_path: Path
  ) -> None:
    """Test deleting when only cache exists (model already deleted)."""
    models_dir = tmp_path / "models"
    cache_dir = models_dir / "caches" / model_id.normalize()

    # Only create cache directory
    await aios.makedirs(cache_dir, exist_ok=True)
    async with aiofiles.open(cache_dir / "file_list.json", "w") as f:
      await f.write("[]")

    with patch("exo.download.download_utils.EXO_MODELS_DIR", models_dir):
      result = await delete_model(model_id)

      # Returns False because model dir didn't exist
      assert result is False
      # But cache should still be cleaned up
      assert not await aios.path.exists(cache_dir)

  async def test_delete_nonexistent_model(
    self, model_id: ModelId, tmp_path: Path
  ) -> None:
    """Test deleting a model that doesn't exist."""
    models_dir = tmp_path / "models"
    await aios.makedirs(models_dir, exist_ok=True)

    with patch("exo.download.download_utils.EXO_MODELS_DIR", models_dir):
      result = await delete_model(model_id)
      assert result is False


class TestProgressResetOnRedownload:
  """Tests for progress tracking when files are re-downloaded."""

  async def test_progress_resets_correctly_on_redownload(
    self, model_id: ModelId
  ) -> None:
    """Test that progress tracking resets when a file is re-downloaded.

    When a file is deleted and re-downloaded (due to size mismatch),
    the progress tracking should reset rather than calculating negative
    downloaded_this_session values.
    """
    # Simulate file_progress dict as it exists in download_shard
    file_progress: dict[str, RepoFileDownloadProgress] = {}

    # Initialize with old file progress (simulating existing large file)
    old_file_size = 1_500_000_000  # 1.5 GB
    file_progress["model.safetensors"] = RepoFileDownloadProgress(
      repo_id=model_id,
      repo_revision="main",
      file_path="model.safetensors",
      downloaded=Memory.from_bytes(old_file_size),
      downloaded_this_session=Memory.from_bytes(0),
      total=Memory.from_bytes(old_file_size),
      speed=0,
      eta=timedelta(0),
      status="not_started",
      start_time=time.time() - 10,  # Started 10 seconds ago
    )

    # Simulate the logic from on_progress_wrapper after re-download starts
    # This is the exact logic from the fixed on_progress_wrapper
    curr_bytes = 100_000  # 100 KB - new download just started
    previous_progress = file_progress.get("model.safetensors")

    # Detect re-download: curr_bytes < previous downloaded
    is_redownload = (
      previous_progress is not None
      and curr_bytes < previous_progress.downloaded.in_bytes
    )
    if is_redownload or previous_progress is None:
      # Fresh download or re-download: reset tracking
      start_time = time.time()
      downloaded_this_session = curr_bytes
    else:
      # Continuing download: accumulate
      start_time = previous_progress.start_time
      downloaded_this_session = (
        previous_progress.downloaded_this_session.in_bytes
        + (curr_bytes - previous_progress.downloaded.in_bytes)
      )

    # Key assertions
    assert is_redownload is True, "Should detect re-download scenario"
    assert downloaded_this_session == curr_bytes, (
      "downloaded_this_session should equal curr_bytes on re-download"
    )
    assert downloaded_this_session > 0, (
      "downloaded_this_session should be positive, not negative"
    )

    # Calculate speed (should be positive)
    elapsed = time.time() - start_time
    speed = downloaded_this_session / elapsed if elapsed > 0 else 0
    assert speed >= 0, "Speed should be non-negative"

  async def test_progress_accumulates_on_continuing_download(
    self, model_id: ModelId
  ) -> None:
    """Test that progress accumulates correctly for continuing downloads.

    When a download continues from where it left off (resume),
    the progress should accumulate correctly.
    """
    file_progress: dict[str, RepoFileDownloadProgress] = {}

    # Initialize with partial download progress
    initial_downloaded = 500_000  # 500 KB already downloaded
    start_time = time.time() - 5  # Started 5 seconds ago
    file_progress["model.safetensors"] = RepoFileDownloadProgress(
      repo_id=model_id,
      repo_revision="main",
      file_path="model.safetensors",
      downloaded=Memory.from_bytes(initial_downloaded),
      downloaded_this_session=Memory.from_bytes(initial_downloaded),
      total=Memory.from_bytes(1_000_000),
      speed=100_000,
      eta=timedelta(seconds=5),
      status="in_progress",
      start_time=start_time,
    )

    # Progress callback with more bytes downloaded
    curr_bytes = 600_000  # 600 KB - continuing download
    previous_progress = file_progress.get("model.safetensors")

    # This is NOT a re-download (curr_bytes > previous downloaded)
    is_redownload = (
      previous_progress is not None
      and curr_bytes < previous_progress.downloaded.in_bytes
    )
    if is_redownload or previous_progress is None:
      downloaded_this_session = curr_bytes
      used_start_time = time.time()
    else:
      used_start_time = previous_progress.start_time
      downloaded_this_session = (
        previous_progress.downloaded_this_session.in_bytes
        + (curr_bytes - previous_progress.downloaded.in_bytes)
      )

    # Key assertions
    assert is_redownload is False, (
      "Should NOT detect re-download for continuing download"
    )
    assert used_start_time == start_time, "Should preserve original start_time"
    expected_session = initial_downloaded + (curr_bytes - initial_downloaded)
    assert downloaded_this_session == expected_session, (
      f"Should accumulate: {downloaded_this_session} == {expected_session}"
    )
    assert downloaded_this_session == 600_000, (
      "downloaded_this_session should equal total downloaded so far"
    )


@@ -413,9 +413,9 @@ MODEL_CARDS: dict[str, ModelCard] = {
),
}
_IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
_IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
"flux1-schnell": ModelCard(
model_id=ModelId("black-forest-labs/FLUX.1-schnell"),
model_id=ModelId("exolabs/FLUX.1-schnell"),
storage_size=Memory.from_bytes(23782357120 + 9524621312),
n_layers=57,
hidden_size=1,
@@ -428,7 +428,7 @@ _IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
storage_size=Memory.from_kb(0),
n_layers=12,
can_shard=False,
safetensors_index_filename=None, # Single file
safetensors_index_filename=None,
),
ComponentInfo(
component_name="text_encoder_2",
@@ -442,7 +442,7 @@ _IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
component_name="transformer",
component_path="transformer/",
storage_size=Memory.from_bytes(23782357120),
n_layers=57, # 19 transformer_blocks + 38 single_transformer_blocks
n_layers=57,
can_shard=True,
safetensors_index_filename="diffusion_pytorch_model.safetensors.index.json",
),
@@ -457,7 +457,7 @@ _IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
],
),
"flux1-dev": ModelCard(
model_id=ModelId("black-forest-labs/FLUX.1-dev"),
model_id=ModelId("exolabs/FLUX.1-dev"),
storage_size=Memory.from_bytes(23782357120 + 9524621312),
n_layers=57,
hidden_size=1,
@@ -470,7 +470,7 @@ _IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
storage_size=Memory.from_kb(0),
n_layers=12,
can_shard=False,
safetensors_index_filename=None, # Single file
safetensors_index_filename=None,
),
ComponentInfo(
component_name="text_encoder_2",
@@ -484,7 +484,7 @@ _IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
component_name="transformer",
component_path="transformer/",
storage_size=Memory.from_bytes(23802816640),
n_layers=57, # 19 transformer_blocks + 38 single_transformer_blocks
n_layers=57,
can_shard=True,
safetensors_index_filename="diffusion_pytorch_model.safetensors.index.json",
),
@@ -499,7 +499,7 @@ _IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
],
),
"flux1-krea-dev": ModelCard(
model_id=ModelId("black-forest-labs/FLUX.1-Krea-dev"),
model_id=ModelId("exolabs/FLUX.1-Krea-dev"),
storage_size=Memory.from_bytes(23802816640 + 9524621312), # Same as dev
n_layers=57,
hidden_size=1,
@@ -541,9 +541,9 @@ _IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
],
),
"qwen-image": ModelCard(
model_id=ModelId("Qwen/Qwen-Image"),
model_id=ModelId("exolabs/Qwen-Image"),
storage_size=Memory.from_bytes(16584333312 + 40860802176),
n_layers=60, # Qwen has 60 transformer blocks (all joint-style)
n_layers=60,
hidden_size=1,
supports_tensor=False,
tasks=[ModelTask.TextToImage],
@@ -551,10 +551,10 @@ _IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
ComponentInfo(
component_name="text_encoder",
component_path="text_encoder/",
storage_size=Memory.from_kb(16584333312),
storage_size=Memory.from_bytes(16584333312),
n_layers=12,
can_shard=False,
-safetensors_index_filename=None, # Single file
+safetensors_index_filename=None,
),
ComponentInfo(
component_name="transformer",
@@ -575,9 +575,9 @@ _IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
],
),
"qwen-image-edit-2509": ModelCard(
-model_id=ModelId("Qwen/Qwen-Image-Edit-2509"),
+model_id=ModelId("exolabs/Qwen-Image-Edit-2509"),
storage_size=Memory.from_bytes(16584333312 + 40860802176),
-n_layers=60, # Qwen has 60 transformer blocks (all joint-style)
+n_layers=60,
hidden_size=1,
supports_tensor=False,
tasks=[ModelTask.ImageToImage],
@@ -585,10 +585,10 @@ _IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
ComponentInfo(
component_name="text_encoder",
component_path="text_encoder/",
-storage_size=Memory.from_kb(16584333312),
+storage_size=Memory.from_bytes(16584333312),
n_layers=12,
can_shard=False,
-safetensors_index_filename=None, # Single file
+safetensors_index_filename=None,
),
ComponentInfo(
component_name="transformer",
@@ -610,6 +610,92 @@ _IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
),
}
def _generate_image_model_quant_variants(
base_name: str,
base_card: ModelCard,
) -> dict[str, ModelCard]:
"""Create quantized variants of an image model card.
Only the transformer component is quantized; text encoders stay at bf16.
Sizes are calculated exactly from the base card's component sizes.
"""
if base_card.components is None:
raise ValueError(f"Image model {base_name} must have components defined")
# quantizations = [8, 6, 5, 4, 3]
quantizations = [8, 4]
num_transformer_bytes = next(
c.storage_size.in_bytes
for c in base_card.components
if c.component_name == "transformer"
)
transformer_bytes = Memory.from_bytes(num_transformer_bytes)
remaining_bytes = Memory.from_bytes(
sum(
c.storage_size.in_bytes
for c in base_card.components
if c.component_name != "transformer"
)
)
def with_transformer_size(new_size: Memory) -> list[ComponentInfo]:
assert base_card.components is not None
return [
ComponentInfo(
component_name=c.component_name,
component_path=c.component_path,
storage_size=new_size
if c.component_name == "transformer"
else c.storage_size,
n_layers=c.n_layers,
can_shard=c.can_shard,
safetensors_index_filename=c.safetensors_index_filename,
)
for c in base_card.components
]
variants = {
base_name: ModelCard(
model_id=base_card.model_id,
storage_size=transformer_bytes + remaining_bytes,
n_layers=base_card.n_layers,
hidden_size=base_card.hidden_size,
supports_tensor=base_card.supports_tensor,
tasks=base_card.tasks,
components=with_transformer_size(transformer_bytes),
)
}
for quant in quantizations:
quant_transformer_bytes = Memory.from_bytes(
(num_transformer_bytes * quant) // 16
)
total_bytes = remaining_bytes + quant_transformer_bytes
model_id = ModelId(base_card.model_id + f"-{quant}bit")
variants[f"{base_name}-{quant}bit"] = ModelCard(
model_id=model_id,
storage_size=total_bytes,
n_layers=base_card.n_layers,
hidden_size=base_card.hidden_size,
supports_tensor=base_card.supports_tensor,
tasks=base_card.tasks,
components=with_transformer_size(quant_transformer_bytes),
)
return variants
_image_model_cards: dict[str, ModelCard] = {}
for _base_name, _base_card in _IMAGE_BASE_MODEL_CARDS.items():
_image_model_cards |= _generate_image_model_quant_variants(_base_name, _base_card)
_IMAGE_MODEL_CARDS = _image_model_cards
if EXO_ENABLE_IMAGE_MODELS:
MODEL_CARDS.update(_IMAGE_MODEL_CARDS)
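
For reference, the sizing in `_generate_image_model_quant_variants` assumes the base weights are bf16 (16 bits per parameter), so an N-bit quant scales only the transformer's bytes by N/16 while the text-encoder bytes are unchanged. A minimal sketch of that arithmetic, using the flux1-dev byte counts from the card above (the helper name is illustrative, not from the source):

```python
# Hypothetical helper mirroring the variant-size calculation: only the
# transformer is quantized ((bytes * quant_bits) // 16 of its bf16 size);
# the remaining components keep their original byte counts.
def quant_variant_bytes(transformer_bytes: int, other_bytes: int, quant_bits: int) -> int:
    return other_bytes + (transformer_bytes * quant_bits) // 16

TRANSFORMER = 23782357120  # flux1-dev transformer bytes (from the card above)
OTHER = 9524621312         # text encoders etc. (from the card above)

full_size = quant_variant_bytes(TRANSFORMER, OTHER, 16)  # unquantized total
eight_bit = quant_variant_bytes(TRANSFORMER, OTHER, 8)   # transformer halves

assert full_size == TRANSFORMER + OTHER
assert eight_bit == OTHER + TRANSFORMER // 2
```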


@@ -349,13 +349,8 @@ class InfoGatherer:
async def _monitor_misc(self):
if self.misc_poll_interval is None:
return
-prev = await MiscData.gather()
-await self.info_sender.send(prev)
while True:
-curr = await MiscData.gather()
-if prev != curr:
-prev = curr
-await self.info_sender.send(curr)
+await self.info_sender.send(await MiscData.gather())
await anyio.sleep(self.misc_poll_interval)
async def _monitor_system_profiler_thunderbolt_data(self):
@@ -365,15 +360,12 @@ class InfoGatherer:
if iface_map is None:
return
-old_idents = []
while True:
data = await ThunderboltConnectivity.gather()
assert data is not None
idents = [it for i in data if (it := i.ident(iface_map)) is not None]
-if idents != old_idents:
-await self.info_sender.send(MacThunderboltIdentifiers(idents=idents))
-old_idents = idents
+await self.info_sender.send(MacThunderboltIdentifiers(idents=idents))
conns = [it for i in data if (it := i.conn()) is not None]
await self.info_sender.send(MacThunderboltConnections(conns=conns))
@@ -398,22 +390,17 @@ class InfoGatherer:
async def _watch_system_info(self):
if self.interface_watcher_interval is None:
return
-old_nics = []
while True:
nics = await get_network_interfaces()
-if nics != old_nics:
-old_nics = nics
-await self.info_sender.send(NodeNetworkInterfaces(ifaces=nics))
+await self.info_sender.send(NodeNetworkInterfaces(ifaces=nics))
await anyio.sleep(self.interface_watcher_interval)
async def _monitor_thunderbolt_bridge_status(self):
if self.thunderbolt_bridge_poll_interval is None:
return
-prev: ThunderboltBridgeInfo | None = None
while True:
curr = await ThunderboltBridgeInfo.gather()
-if curr is not None and prev != curr:
-prev = curr
+if curr is not None:
await self.info_sender.send(curr)
await anyio.sleep(self.thunderbolt_bridge_poll_interval)
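
The monitors above drop their change-detection state (`prev`, `old_idents`, `old_nics`) and now send a fresh snapshot on every poll tick. A minimal asyncio stand-in for that pattern (the names `monitor` and `fake_gather` are illustrative, not from the source, and asyncio substitutes for the project's anyio loop):

```python
import asyncio

async def monitor(gather, send, interval: float, ticks: int) -> None:
    # Simplified loop: gather and send every tick, no diff against the
    # previous snapshot (a bounded tick count stands in for `while True`).
    for _ in range(ticks):
        send(await gather())
        await asyncio.sleep(interval)

sent = []

async def fake_gather():
    return {"nics": ["en0"]}

asyncio.run(monitor(fake_gather, sent.append, 0.001, 3))
assert sent == [{"nics": ["en0"]}] * 3  # one message per poll, duplicates included
```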


@@ -19,6 +19,8 @@ from mlx_lm.models.deepseek_v32 import DeepseekV32MLP
from mlx_lm.models.deepseek_v32 import Model as DeepseekV32Model
from mlx_lm.models.glm4_moe import Model as Glm4MoeModel
from mlx_lm.models.glm4_moe import MoE
from mlx_lm.models.glm4_moe_lite import Glm4MoeLiteDecoderLayer, Glm4MoeLiteMLP
from mlx_lm.models.glm4_moe_lite import Model as GLM4MoeLiteModel
from mlx_lm.models.gpt_oss import GptOssMoeModel
from mlx_lm.models.gpt_oss import Model as GptOssModel
from mlx_lm.models.llama import Model as LlamaModel
@@ -334,15 +336,7 @@ def tensor_auto_parallel(
group=group,
)
if hasattr(model, "shard") and not isinstance(model, GptOssModel):
-try:
-model.shard(group) # type: ignore
-return patch_tensor_model(model)
-except (AttributeError, TypeError, NameError):
-pass
if isinstance(model, (LlamaModel, Ministral3Model)):
-logger.warning("shouldn't be hit - upstream sharding exists")
tensor_parallel_sharding_strategy = LlamaShardingStrategy(
group,
all_to_sharded_linear,
@@ -351,7 +345,6 @@ def tensor_auto_parallel(
sharded_to_all_linear_in_place,
)
elif isinstance(model, (DeepseekV3Model, DeepseekV32Model)):
-logger.warning("shouldn't be hit - upstream sharding exists")
tensor_parallel_sharding_strategy = DeepSeekShardingStrategy(
group,
all_to_sharded_linear,
@@ -367,6 +360,14 @@ def tensor_auto_parallel(
all_to_sharded_linear_in_place,
sharded_to_all_linear_in_place,
)
elif isinstance(model, GLM4MoeLiteModel):
tensor_parallel_sharding_strategy = GLM4MoeLiteShardingStrategy(
group,
all_to_sharded_linear,
sharded_to_all_linear,
all_to_sharded_linear_in_place,
sharded_to_all_linear_in_place,
)
elif isinstance(model, (Qwen3MoeModel, Glm4MoeModel, Qwen3NextModel)):
tensor_parallel_sharding_strategy = QwenShardingStrategy(
group,
@@ -441,7 +442,7 @@ class LlamaShardingStrategy(TensorParallelShardingStrategy):
layer.mlp.gate_proj = self.all_to_sharded_linear(layer.mlp.gate_proj)
layer.mlp.down_proj = self.sharded_to_all_linear(layer.mlp.down_proj)
layer.mlp.up_proj = self.all_to_sharded_linear(layer.mlp.up_proj)
mx.eval(layer)
return model
@@ -516,6 +517,8 @@ class DeepSeekShardingStrategy(TensorParallelShardingStrategy):
layer.mlp = ShardedDeepseekV3MoE(layer.mlp) # type: ignore
layer.mlp.sharding_group = self.group
mx.eval(layer)
return model
@@ -533,6 +536,84 @@ class ShardedDeepseekV3MoE(CustomMlxLayer):
return y
class GLM4MoeLiteShardingStrategy(TensorParallelShardingStrategy):
def shard_model(
self,
model: nn.Module,
timeout_seconds: float,
on_timeout: TimeoutCallback | None,
) -> nn.Module:
model = cast(GLM4MoeLiteModel, model)
for layer in model.layers: # type: ignore
layer = cast(Glm4MoeLiteDecoderLayer, layer)
eval_with_timeout(
layer.parameters(),
timeout_seconds / len(model.layers), # type: ignore
on_timeout,
)
if layer.self_attn.q_lora_rank is None: # type: ignore
layer.self_attn.q_proj = self.all_to_sharded_linear(
layer.self_attn.q_proj
)
else:
layer.self_attn.q_b_proj = self.all_to_sharded_linear(
layer.self_attn.q_b_proj
)
layer.self_attn.o_proj = self.sharded_to_all_linear(layer.self_attn.o_proj)
layer.self_attn.num_heads //= self.N
# Logic from upstream mlx
num_heads = layer.self_attn.num_heads
sh = self.group.rank() * num_heads
eh = sh + num_heads
def shard_heads(w: mx.array, sh: int = sh, eh: int = eh) -> mx.array:
return w[sh:eh]
layer.self_attn.embed_q.apply(shard_heads)
layer.self_attn.unembed_out.apply(shard_heads)
if isinstance(layer.mlp, Glm4MoeLiteMLP):
layer.mlp.gate_proj = self.all_to_sharded_linear(layer.mlp.gate_proj)
layer.mlp.down_proj = self.sharded_to_all_linear(layer.mlp.down_proj)
layer.mlp.up_proj = self.all_to_sharded_linear(layer.mlp.up_proj)
else:
if getattr(layer.mlp, "shared_experts", None) is not None:
self.all_to_sharded_linear_in_place(
layer.mlp.shared_experts.gate_proj
)
self.sharded_to_all_linear_in_place(
layer.mlp.shared_experts.down_proj
)
self.all_to_sharded_linear_in_place(
layer.mlp.shared_experts.up_proj
)
self.all_to_sharded_linear_in_place(layer.mlp.switch_mlp.gate_proj)
self.sharded_to_all_linear_in_place(layer.mlp.switch_mlp.down_proj)
self.all_to_sharded_linear_in_place(layer.mlp.switch_mlp.up_proj)
layer.mlp = ShardedGLM4MoeLiteMoE(layer.mlp) # type: ignore
layer.mlp.sharding_group = self.group # type: ignore
mx.eval(layer)
return model
class ShardedGLM4MoeLiteMoE(CustomMlxLayer):
def __init__(self, layer: _LayerCallable):
super().__init__(layer)
self.sharding_group: mx.distributed.Group | None = None
def __call__(self, x: mx.array) -> mx.array:
if self.sharding_group is not None:
x = sum_gradients(self.sharding_group)(x)
y = self.original_layer.__call__(x)
if self.sharding_group is not None:
y = mx.distributed.all_sum(y, group=self.sharding_group)
return y
class MiniMaxShardingStrategy(TensorParallelShardingStrategy):
def shard_model(
self,
@@ -566,7 +647,7 @@ class MiniMaxShardingStrategy(TensorParallelShardingStrategy):
)
layer.block_sparse_moe = ShardedQwenMoE(layer.block_sparse_moe) # pyright: ignore[reportAttributeAccessIssue, reportArgumentType]
layer.block_sparse_moe.sharding_group = self.group # pyright: ignore[reportAttributeAccessIssue]
mx.eval(layer)
return model
@@ -607,6 +688,7 @@ class QwenShardingStrategy(TensorParallelShardingStrategy):
layer.mlp.down_proj = self.sharded_to_all_linear(layer.mlp.down_proj)
layer.mlp.up_proj = self.all_to_sharded_linear(layer.mlp.up_proj)
mx.eval(layer)
return model
@@ -661,7 +743,7 @@ class GptOssShardingStrategy(TensorParallelShardingStrategy):
layer.mlp = ShardedGptOssMoE(layer.mlp) # type: ignore
layer.mlp.sharding_group = self.group # pyright: ignore[reportAttributeAccessIssue]
mx.eval(layer)
return model
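
The `shard_heads` closure in `GLM4MoeLiteShardingStrategy` above slices attention heads per rank: after `num_heads //= N`, rank `r` keeps the contiguous block starting at `r * num_heads`. A standalone sketch of that indexing (the function name is illustrative, not from the source):

```python
# Illustrative head-partitioning scheme: rank r of world_size keeps heads
# [r * per_rank, (r + 1) * per_rank), matching w[sh:eh] in shard_heads.
def head_range(total_heads: int, world_size: int, rank: int) -> range:
    per_rank = total_heads // world_size
    start = rank * per_rank
    return range(start, start + per_rank)

# e.g. 32 heads over 4 ranks: rank 1 keeps heads 8..15
assert list(head_range(32, 4, 1)) == list(range(8, 16))
```

The per-rank blocks tile the full head set exactly when `world_size` divides `total_heads`, which the `//=` in the strategy assumes.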


@@ -405,7 +405,11 @@ def apply_chat_template(
continue
message.content = "\n".join(c.text for c in message.content).strip()
-if message.content is None and message.thinking is None:
+if (
+message.content is None
+and message.thinking is None
+and message.tool_calls is None
+):
continue
# Null values are not valid when applying templates in tokenizer

uv.lock generated

@@ -415,7 +415,7 @@ requires-dist = [
{ name = "mlx", marker = "sys_platform == 'darwin'", git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git" },
{ name = "mflux", specifier = "==0.15.4" },
{ name = "mlx", extras = ["cpu"], marker = "sys_platform == 'linux'", specifier = "==0.30.3" },
-{ name = "mlx-lm", git = "https://github.com/AlexCheema/mlx-lm.git?rev=fix-transformers-5.0.0rc2" },
+{ name = "mlx-lm", specifier = "==0.30.5" },
{ name = "openai-harmony", specifier = ">=0.0.8" },
{ name = "pillow", specifier = ">=11.0,<12.0" },
{ name = "psutil", specifier = ">=7.0.0" },
@@ -1074,8 +1074,8 @@ wheels = [
[[package]]
name = "mlx-lm"
-version = "0.30.4"
-source = { git = "https://github.com/AlexCheema/mlx-lm.git?rev=fix-transformers-5.0.0rc2#a5daf2b894f31793dfaef0fdf9bc3ed683176ad6" }
+version = "0.30.5"
+source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "jinja2", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "mlx", version = "0.30.4.dev20260121+fbe306f9", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git#fbe306f92a47d9b887ee7af2e3af6f1b9e28e663" }, marker = "sys_platform == 'darwin'" },
@@ -1085,6 +1085,10 @@ dependencies = [
{ name = "sentencepiece", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "transformers", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/0b/90/4469d9f75f196e6255f59a89441abe0079925d30a001462e1c1c4bc4e6a1/mlx_lm-0.30.5.tar.gz", hash = "sha256:9e6cb258c65b766c6af25cb90958aef40acab67139f05839eef19864cb3154f6", size = 262367, upload-time = "2026-01-25T15:29:30.125Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/89/ba/66db6e1e5f1ef506655b562932f6bd8f72600116d5f31f92d71c1f200b3f/mlx_lm-0.30.5-py3-none-any.whl", hash = "sha256:a80bc8e3efdebe81813b0f6eb403fb66a7a15071e256f4e7102ada986acb75bb", size = 366716, upload-time = "2026-01-25T15:29:28.29Z" },
]
[[package]]
name = "more-itertools"
@@ -2273,7 +2277,7 @@ wheels = [
[[package]]
name = "transformers"
-version = "5.0.0rc2"
+version = "5.0.0rc3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "filelock", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -2288,9 +2292,9 @@ dependencies = [
{ name = "tqdm", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "typer-slim", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/94/e2/86b1bd5264272953370a5e50a91da38d7a53a87c5faf3fd3ff62d7353879/transformers-5.0.0rc2.tar.gz", hash = "sha256:9f2fa5e132433dd7eb910dc224b32de0baf758f3b6ffc918dbb632e0af85c07a", size = 8362532, upload-time = "2026-01-07T16:58:02.603Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/3f/a3/7c116a8d85f69ea7749cf4c2df79e64c35d028e5fc7ea0168f299d03b8c7/transformers-5.0.0rc3.tar.gz", hash = "sha256:a0315b92b7e087617ade42ec9e6e92ee7620541cc5d6a3331886c52cbe306f5c", size = 8388520, upload-time = "2026-01-14T16:49:02.952Z" }
wheels = [
-{ url = "https://files.pythonhosted.org/packages/b4/eb/9526a77354a2126f5b220f4792dc8494d573773c098dac6a5ad1fc7a5f17/transformers-5.0.0rc2-py3-none-any.whl", hash = "sha256:f8f2a14060ab11f20a0eec39d827af54c1589c327c5799d82808ae3f4167418a", size = 10067329, upload-time = "2026-01-07T16:57:59.617Z" },
+{ url = "https://files.pythonhosted.org/packages/1e/f2/ae2b8968764253bdf38a48dee3c299b8d0bedf7c8ffbe3449fca9bd95338/transformers-5.0.0rc3-py3-none-any.whl", hash = "sha256:383fad27f4f73092d330e45fae384681e5c8521e1dc1cf6cb1a297780e68bf2d", size = 10107087, upload-time = "2026-01-14T16:48:59.393Z" },
]
[[package]]