Compare commits

..

7 Commits

Author SHA1 Message Date
Alex Cheema
0c6e943cea replace model ID string matching with architecture-based checks in tokenizer loading
Use config.json model_type field instead of fragile substring matching on
model IDs for architecture detection in tokenizer loading. Falls back to
string matching when config.json is unavailable.

Closes #1371

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 10:19:09 -08:00
rltakashige
83af8c63fa Revert "Use custom fork that resolves GPU locks" (#1502)
Reverts exo-explore/exo#1489

Goddammit Claude...
2026-02-17 18:18:54 +00:00
Evan Quiney
eccc6298d1 Revert "Add MetaInstance declarative layer (#1447)"
This reverts commit a962a28afc.
2026-02-17 18:11:47 +00:00
Evan Quiney
c8997217cf Revert "feat: better onboarding UX for new users (#1479)"
This reverts commit 490d2e46ba.
2026-02-17 18:02:32 +00:00
Alex Cheema
490d2e46ba feat: better onboarding UX for new users (#1479)
## Summary

- **Auto-open dashboard** in browser on first launch (uses
`~/.exo/.dashboard_opened` marker)
- **Welcome overlay** with "Choose a Model" CTA button when no model
instance is running
- **Tutorial progress messages** during model download → loading → ready
lifecycle stages
- **Fix conversation sidebar** text contrast — bumped to white text,
added active state background
- **Simplify technical jargon** — sharding/instance type/min nodes
hidden behind collapsible "Advanced Options" toggle; strategy display
hidden behind debug mode
- **Polished DMG installer** with drag-to-Applications layout, custom
branded background, and AppleScript-configured window positioning

## Test plan

- [ ] Launch exo for the first time (delete `~/.exo/.dashboard_opened`
to simulate) — browser should auto-open
- [ ] Verify welcome overlay appears on topology when no model is loaded
- [ ] Launch a model and verify download/loading/ready messages appear
in instance cards
- [ ] Check conversation sidebar text is readable (white on dark, yellow
when active)
- [ ] Verify "Advanced Options" toggle hides/shows sharding controls
- [ ] Build DMG with `packaging/dmg/create-dmg.sh` and verify
drag-to-Applications layout

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 17:52:49 +00:00
rltakashige
facf2d4d03 Use custom fork that resolves GPU locks (#1489)
## Motivation

On Macs, explicit synchronization is required for memory to be updated from the L1 cache. Without it, a spin wait can fail to see the updated timestamp, causing a GPU lock.

## Changes

The dependency is switched to my personal fork, which applies the required synchronization.

## Why It Works

https://github.com/ARM-software/acle/releases

## Test Plan

### Manual Testing
Tested manually that no GPU locks occur (even with multiple simultaneous
instances running) and that the performance differential is negligible
(267 vs 269 tps on Llama 3.2 1B at approximately 10k context).


---------
I have seen a GPU lock, specifically when sending a particularly large
chat completion while the model was loading. However, I have since been
unable to reproduce it, and it may have been something I did wrong. Please
do create an issue and tag me if any GPU locks occur.

---------

Co-authored-by: Jake Hillion <jake@hillion.co.uk>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 17:48:43 +00:00
Alex Cheema
a962a28afc Add MetaInstance declarative layer (#1447)
## Motivation

Users currently manage instances directly, which means if a node
disconnects or connections break, the instance dies and nothing
recreates it. MetaInstance is a declarative primitive: "ensure an
instance matching these parameters always exists." The reconciler
watches for unhealthy or missing backing instances and re-places them
automatically.

## Changes

- **MetaInstance type** (`meta_instance.py`): declarative constraint
with `model_id`, `min_nodes`, optional `node_ids`, and `sharding`
- **Reconciler** (`reconcile.py`): `find_unsatisfied_meta_instances`
checks which MetaInstances lack a healthy backing instance,
`try_place_for_meta_instance` creates one
- **Master loop** (`main.py`): periodically reconciles unsatisfied
MetaInstances; immediate placement on `CreateMetaInstance` command
- **API** (`api.py`): `create_meta_instance` / `delete_meta_instance` /
`GET /meta_instances` endpoints; delete cascades to backing instances
with task cancellation
- **Binding via `meta_instance_id` on Instance** (`instances.py`): no
separate binding event or backing map — the instance carries its parent
MetaInstance ID directly, eliminating race conditions in the reconciler
- **Dashboard**: sidebar shows MetaInstances with their backing instance
status; orphan instances (created directly) still shown separately
- **Tests**: constraint matching, connection health, unsatisfied
detection, exclusive binding, cascade delete with task cancellation

### Recent improvements

- **fix: cancel active tasks on cascade delete** — `DeleteMetaInstance`
now emits `TaskStatusUpdated(Cancelled)` for any Pending/Running tasks
on backing instances before emitting `InstanceDeleted`. Previously,
cascade-deleting backing instances left orphaned task references in
state.
- **Lifecycle logging** — added `logger.info`/`logger.warning` for:
`CreateMetaInstance` (model, min_nodes, sharding), `DeleteMetaInstance`
(with cascade count), reconciler placement success/failure, and retry
decisions with attempt counts in `InstanceHealthReconciler`.
- **GET `/meta_instances` endpoint** — lists all meta-instances without
needing to fetch full state.
- **2 regression tests** — `test_cascade_delete_cancels_active_tasks`
and `test_cascade_delete_skips_completed_tasks` verify the
cascade-delete event sequence.
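A sketch of the cascade-delete ordering those regression tests pin down, using illustrative event tuples rather than the real event types:

```python
def cascade_delete_events(
    instance_id: str, tasks: list[dict]
) -> list[tuple[str, str]]:
    """Emit cancellations for live tasks before the instance-deleted event."""
    events: list[tuple[str, str]] = []
    for task in tasks:
        # Only Pending/Running tasks are cancelled; completed tasks are
        # skipped, matching test_cascade_delete_skips_completed_tasks.
        if task["status"] in ("Pending", "Running"):
            events.append(("TaskStatusUpdated:Cancelled", task["id"]))
    # InstanceDeleted comes last, so no task reference is orphaned in state.
    events.append(("InstanceDeleted", instance_id))
    return events
```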

## Why It Works

Putting `meta_instance_id` on `BaseInstance` makes binding inherent to
instance creation. When the reconciler creates an instance for a
MetaInstance, it tags it via `model_copy`. When the instance is deleted,
the binding disappears with it. This avoids the two bugs that a separate
binding mechanism would introduce:
1. Stale exclusion sets — the reconciler loop can't accidentally bind
two MetaInstances to the same instance
2. Delete ordering race — no window between deleting an instance and its
binding where the reconciler could re-place

## Test Plan

### Manual Testing
<!-- Hardware: (e.g., MacBook Pro M1 Max 32GB, Mac Mini M2 16GB,
connected via Thunderbolt 4) -->
- Created MetaInstance via dashboard, verified instance placed
- Verified delete cascades (deleting MetaInstance removes backing
instance)
- Verified orphan instances still work independently

### Automated Testing
- 30 tests in `test_meta_instance_edge_cases.py`: lifecycle, retry
logic, error handling, concurrent operations, cascade delete with task
cancellation
- 24 tests in `test_reconcile.py`: constraint matching, connection
health (single/multi-node, edge removal, IP changes), unsatisfied
detection, exclusive binding, idempotency
- All 261 tests pass
- basedpyright 0 errors, ruff clean, dashboard builds

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 09:48:19 -08:00
4 changed files with 80 additions and 31 deletions

View File

@@ -26,7 +26,6 @@ from exo.shared.types.openai_responses import (
ResponseOutputText,
ResponsesRequest,
ResponsesResponse,
- ResponsesStreamEvent,
ResponseTextDeltaEvent,
ResponseTextDoneEvent,
ResponseUsage,
@@ -34,11 +33,6 @@ from exo.shared.types.openai_responses import (
from exo.shared.types.text_generation import InputMessage, TextGenerationTaskParams
- def _format_sse(event: ResponsesStreamEvent) -> str:
-     """Format a streaming event as an SSE message."""
-     return f"event: {event.type}\ndata: {event.model_dump_json()}\n\n"
def _extract_content(content: str | list[ResponseContentPart]) -> str:
"""Extract plain text from a content field that may be a string or list of parts."""
if isinstance(content, str):
@@ -213,13 +207,13 @@ async def generate_responses_stream(
created_event = ResponseCreatedEvent(
sequence_number=next(seq), response=initial_response
)
- yield _format_sse(created_event)
+ yield f"event: response.created\ndata: {created_event.model_dump_json()}\n\n"
# response.in_progress
in_progress_event = ResponseInProgressEvent(
sequence_number=next(seq), response=initial_response
)
- yield _format_sse(in_progress_event)
+ yield f"event: response.in_progress\ndata: {in_progress_event.model_dump_json()}\n\n"
# response.output_item.added
initial_item = ResponseMessageItem(
@@ -230,7 +224,7 @@ async def generate_responses_stream(
item_added = ResponseOutputItemAddedEvent(
sequence_number=next(seq), output_index=0, item=initial_item
)
- yield _format_sse(item_added)
+ yield f"event: response.output_item.added\ndata: {item_added.model_dump_json()}\n\n"
# response.content_part.added
initial_part = ResponseOutputText(text="")
@@ -241,7 +235,7 @@ async def generate_responses_stream(
content_index=0,
part=initial_part,
)
- yield _format_sse(part_added)
+ yield f"event: response.content_part.added\ndata: {part_added.model_dump_json()}\n\n"
accumulated_text = ""
function_call_items: list[ResponseFunctionCallItem] = []
@@ -272,7 +266,7 @@ async def generate_responses_stream(
output_index=next_output_index,
item=fc_item,
)
- yield _format_sse(fc_added)
+ yield f"event: response.output_item.added\ndata: {fc_added.model_dump_json()}\n\n"
# response.function_call_arguments.delta
args_delta = ResponseFunctionCallArgumentsDeltaEvent(
@@ -281,7 +275,7 @@ async def generate_responses_stream(
output_index=next_output_index,
delta=tool.arguments,
)
- yield _format_sse(args_delta)
+ yield f"event: response.function_call_arguments.delta\ndata: {args_delta.model_dump_json()}\n\n"
# response.function_call_arguments.done
args_done = ResponseFunctionCallArgumentsDoneEvent(
@@ -291,7 +285,7 @@ async def generate_responses_stream(
name=tool.name,
arguments=tool.arguments,
)
- yield _format_sse(args_done)
+ yield f"event: response.function_call_arguments.done\ndata: {args_done.model_dump_json()}\n\n"
# response.output_item.done
fc_done_item = ResponseFunctionCallItem(
@@ -306,7 +300,7 @@ async def generate_responses_stream(
output_index=next_output_index,
item=fc_done_item,
)
- yield _format_sse(fc_item_done)
+ yield f"event: response.output_item.done\ndata: {fc_item_done.model_dump_json()}\n\n"
function_call_items.append(fc_done_item)
next_output_index += 1
@@ -322,7 +316,7 @@ async def generate_responses_stream(
content_index=0,
delta=chunk.text,
)
- yield _format_sse(delta_event)
+ yield f"event: response.output_text.delta\ndata: {delta_event.model_dump_json()}\n\n"
# response.output_text.done
text_done = ResponseTextDoneEvent(
@@ -332,7 +326,7 @@ async def generate_responses_stream(
content_index=0,
text=accumulated_text,
)
- yield _format_sse(text_done)
+ yield f"event: response.output_text.done\ndata: {text_done.model_dump_json()}\n\n"
# response.content_part.done
final_part = ResponseOutputText(text=accumulated_text)
@@ -343,7 +337,7 @@ async def generate_responses_stream(
content_index=0,
part=final_part,
)
- yield _format_sse(part_done)
+ yield f"event: response.content_part.done\ndata: {part_done.model_dump_json()}\n\n"
# response.output_item.done
final_message_item = ResponseMessageItem(
@@ -354,7 +348,7 @@ async def generate_responses_stream(
item_done = ResponseOutputItemDoneEvent(
sequence_number=next(seq), output_index=0, item=final_message_item
)
- yield _format_sse(item_done)
+ yield f"event: response.output_item.done\ndata: {item_done.model_dump_json()}\n\n"
# Create usage from usage data if available
usage = None
@@ -379,4 +373,4 @@ async def generate_responses_stream(
completed_event = ResponseCompletedEvent(
sequence_number=next(seq), response=final_response
)
- yield _format_sse(completed_event)
+ yield f"event: response.completed\ndata: {completed_event.model_dump_json()}\n\n"
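For reference, each `yield` in this file emits one frame of the standard Server-Sent Events wire format: an `event:` line naming the event type, a `data:` line carrying the JSON payload, and a blank-line terminator. A self-contained sketch, with a hypothetical stand-in for the pydantic event models:

```python
import json


class FakeEvent:
    """Hypothetical stand-in for the stream-event models in openai_responses."""

    type = "response.created"

    def model_dump_json(self) -> str:
        return json.dumps({"type": self.type, "sequence_number": 0})


def format_sse(event) -> str:
    # One SSE frame: event-name line, data line, blank-line terminator.
    return f"event: {event.type}\ndata: {event.model_dump_json()}\n\n"
```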

View File

@@ -165,6 +165,7 @@ def is_custom_card(model_id: ModelId) -> bool:
class ConfigData(BaseModel):
model_config = {"extra": "ignore"} # Allow unknown fields
+ model_type: str | None = None
architectures: list[str] | None = None
hidden_size: Annotated[int, Field(ge=0)] | None = None
layer_count: int = Field(
@@ -200,6 +201,7 @@ class ConfigData(BaseModel):
return data
for field in [
+ "model_type",
"architectures",
"hidden_size",
"num_hidden_layers",

View File

@@ -269,19 +269,52 @@ def get_tokenizer(model_path: Path, shard_metadata: ShardMetadata) -> TokenizerW
return load_tokenizer_for_model_id(shard_metadata.model_card.model_id, model_path)
def get_eos_token_ids_for_model(model_id: ModelId) -> list[int] | None:
"""
Get the EOS token IDs for a model based on its ID.
def _read_model_type_from_config(model_path: Path) -> str | None:
"""Read the model_type field from config.json at the given model path.
Some models require explicit EOS token configuration that isn't in their
tokenizer config. This function returns the known EOS token IDs for such models.
Returns None if config.json doesn't exist or doesn't contain model_type.
"""
config_path = model_path / "config.json"
if not config_path.exists():
return None
try:
with open(config_path) as f:
config: dict[str, Any] = json.load(f) # pyright: ignore[reportAny]
model_type: Any = config.get("model_type")
if model_type is None:
text_config: Any = config.get("text_config")
if isinstance(text_config, dict):
model_type = text_config.get("model_type") # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType]
return model_type if isinstance(model_type, str) else None
except (json.JSONDecodeError, OSError):
return None
def get_eos_token_ids_for_model(
model_id: ModelId, model_type: str | None = None
) -> list[int] | None:
"""Get the EOS token IDs for a model based on its architecture type.
Uses model_type from config.json when available, falls back to model_id
string matching for backward compatibility.
Args:
model_id: The HuggingFace model ID
model_type: The model_type field from config.json (e.g., "kimi", "glm4")
Returns:
List of EOS token IDs, or None if the model uses standard tokenizer config
"""
if model_type is not None:
if model_type == "kimi":
return [163586]
elif model_type == "glm4_moe_lite":
# 154820: <|endoftext|>, 154827: <|user|>, 154829: <|observation|>
return [154820, 154827, 154829]
elif model_type.startswith("glm"):
return [151336, 151329, 151338]
# Fallback: string matching on model_id
model_id_lower = model_id.lower()
if "kimi-k2" in model_id_lower:
return [163586]
@@ -296,11 +329,10 @@ def get_eos_token_ids_for_model(model_id: ModelId) -> list[int] | None:
def load_tokenizer_for_model_id(
model_id: ModelId, model_path: Path
) -> TokenizerWrapper:
"""
Load tokenizer for a model given its ID and local path.
"""Load tokenizer for a model given its ID and local path.
This is the core tokenizer loading logic, handling special cases for different
model families (Kimi, GLM, etc.) and transformers 5.x compatibility.
Uses model_type from config.json for architecture detection when available,
falling back to model_id string matching for backward compatibility.
Args:
model_id: The HuggingFace model ID (e.g., "moonshotai/Kimi-K2-Instruct")
@@ -309,11 +341,21 @@ def load_tokenizer_for_model_id(
Returns:
TokenizerWrapper instance configured for the model
"""
+ model_type = _read_model_type_from_config(model_path)
model_id_lower = model_id.lower()
- eos_token_ids = get_eos_token_ids_for_model(model_id)
+ eos_token_ids = get_eos_token_ids_for_model(model_id, model_type=model_type)
+ is_kimi = (
+     model_type == "kimi" if model_type is not None else "kimi-k2" in model_id_lower
+ )
+ is_gemma3 = (
+     model_type == "gemma3"
+     if model_type is not None
+     else "gemma-3" in model_id_lower
+ )
# Kimi uses a custom TikTokenTokenizer that transformers 5.x can't load via AutoTokenizer
- if "kimi-k2" in model_id_lower:
+ if is_kimi:
import importlib.util
import types
@@ -367,7 +409,7 @@ def load_tokenizer_for_model_id(
eos_token_ids=eos_token_ids,
)
- if "gemma-3" in model_id_lower:
+ if is_gemma3:
gemma_3_eos_id = 1
gemma_3_end_of_turn_id = 106
if tokenizer.eos_token_ids is not None:

View File

@@ -24,6 +24,7 @@ from exo.worker.engines.mlx.utils_mlx import (
# Files needed for tokenizer functionality
TOKENIZER_FILE_PATTERNS = [
+ "config.json",
"tokenizer.json",
"tokenizer_config.json",
"special_tokens_map.json",
@@ -338,6 +339,9 @@ async def test_kimi_tokenizer_specifically():
# Verify EOS token is set
assert eos_token_ids == [163586], "Kimi EOS token should be [163586]"
+ # Verify architecture-based detection gives same result
+ assert get_eos_token_ids_for_model(model_id, model_type="kimi") == [163586]
# Test GLM tokenizer since it also has special handling
@pytest.mark.asyncio
@@ -378,3 +382,10 @@ async def test_glm_tokenizer_specifically():
151329,
151338,
], "GLM EOS tokens should be correct"
+ # Verify architecture-based detection gives same result
+ assert get_eos_token_ids_for_model(model_id, model_type="glm4") == [
+     151336,
+     151329,
+     151338,
+ ]