Compare commits

1 Commit

Author SHA1 Message Date
Alex Cheema
c5a4350939 fix instance type mismatch in /instance/previews endpoint
Derive instance_meta from the actual instance type returned by
place_instance() instead of using the requested instance_meta from
the loop variable. place_instance() overrides single-node placements
to MlxRing, but the preview response was still reporting the original
requested type (e.g., MlxJaccl), causing a mismatch.

Closes #1426

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 10:09:17 -08:00
7 changed files with 18 additions and 54 deletions
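The commit message above describes a requested-versus-actual type mismatch. A minimal self-contained sketch of that pattern, using simplified stand-in types rather than exo's real Instance/placement API:

from dataclasses import dataclass
from enum import Enum

class InstanceMeta(Enum):
    MlxRing = "MlxRing"
    MlxJaccl = "MlxJaccl"

@dataclass
class MlxRingInstance:
    pass

@dataclass
class MlxJacclInstance:
    pass

def place_instance_sketch(requested: InstanceMeta, n_nodes: int):
    # Stand-in for place_instance(): single-node placements are
    # forced to MlxRing regardless of the requested backend.
    if requested is InstanceMeta.MlxJaccl and n_nodes > 1:
        return MlxJacclInstance()
    return MlxRingInstance()

instance = place_instance_sketch(InstanceMeta.MlxJaccl, n_nodes=1)
# The bug: the preview reported the requested meta (MlxJaccl).
# The fix: derive the reported meta from the instance actually placed.
actual_instance_meta = (
    InstanceMeta.MlxJaccl
    if isinstance(instance, MlxJacclInstance)
    else InstanceMeta.MlxRing
)
assert actual_instance_meta is InstanceMeta.MlxRing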

View File

@@ -1,12 +0,0 @@
-model_id = "mlx-community/GLM-5-4bit"
-n_layers = 78
-hidden_size = 6144
-supports_tensor = true
-tasks = ["TextGeneration"]
-family = "glm"
-quantization = "4bit"
-base_model = "GLM 5"
-capabilities = ["text", "thinking"]
-
-[storage_size]
-in_bytes = 418621403136

View File

@@ -1,12 +0,0 @@
-model_id = "mlx-community/GLM-5-8bit-MXFP8"
-n_layers = 78
-hidden_size = 6144
-supports_tensor = true
-tasks = ["TextGeneration"]
-family = "glm"
-quantization = "8bit"
-base_model = "GLM 5"
-capabilities = ["text", "thinking"]
-
-[storage_size]
-in_bytes = 767273926656

View File

@@ -1,12 +0,0 @@
-model_id = "mlx-community/GLM-5-MXFP4-Q8"
-n_layers = 78
-hidden_size = 6144
-supports_tensor = true
-tasks = ["TextGeneration"]
-family = "glm"
-quantization = "MXFP4-Q8"
-base_model = "GLM 5"
-capabilities = ["text", "thinking"]
-
-[storage_size]
-in_bytes = 405480321024

View File

@@ -1,12 +0,0 @@
-model_id = "mlx-community/GLM-5"
-n_layers = 78
-hidden_size = 6144
-supports_tensor = true
-tasks = ["TextGeneration"]
-family = "glm"
-quantization = "bf16"
-base_model = "GLM 5"
-capabilities = ["text", "thinking"]
-
-[storage_size]
-in_bytes = 1487822475264
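The model cards deleted above feed the memory accounting in the API change below (total_bytes comes from storage_size.in_bytes). A small sketch, standard library only, of reading one such card and deriving a per-node memory delta; the even split across nodes is an assumption for illustration, not necessarily exo's policy:

import tomllib

card = tomllib.loads("""
model_id = "mlx-community/GLM-5-4bit"

[storage_size]
in_bytes = 418621403136
""")

total_bytes = card["storage_size"]["in_bytes"]
nodes = ["node-a", "node-b"]  # hypothetical node ids
memory_delta_by_node = {n: total_bytes // len(nodes) for n in nodes}
print(memory_delta_by_node)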

View File

@@ -149,7 +149,12 @@ from exo.shared.types.openai_responses import (
     ResponsesResponse,
 )
 from exo.shared.types.state import State
-from exo.shared.types.worker.instances import Instance, InstanceId, InstanceMeta
+from exo.shared.types.worker.instances import (
+    Instance,
+    InstanceId,
+    InstanceMeta,
+    MlxJacclInstance,
+)
 from exo.shared.types.worker.shards import Sharding
 from exo.utils.banner import print_startup_banner
 from exo.utils.channels import Receiver, Sender, channel
@@ -491,6 +496,14 @@ class API:
             shard_assignments = instance.shard_assignments
             placement_node_ids = list(shard_assignments.node_to_runner.keys())
+            # Derive instance_meta from the actual instance type, since
+            # place_instance() may override it (e.g., single-node → MlxRing)
+            actual_instance_meta = (
+                InstanceMeta.MlxJaccl
+                if isinstance(instance, MlxJacclInstance)
+                else InstanceMeta.MlxRing
+            )
             memory_delta_by_node: dict[str, int] = {}
             if placement_node_ids:
                 total_bytes = model_card.storage_size.in_bytes
@@ -503,14 +516,14 @@
             if (
                 model_card.model_id,
                 sharding,
-                instance_meta,
+                actual_instance_meta,
                 len(placement_node_ids),
             ) not in seen:
                 previews.append(
                     PlacementPreview(
                         model_id=model_card.model_id,
                         sharding=sharding,
-                        instance_meta=instance_meta,
+                        instance_meta=actual_instance_meta,
                         instance=instance,
                         memory_delta_by_node=memory_delta_by_node or None,
                         error=None,
@@ -520,7 +533,7 @@
                 (
                     model_card.model_id,
                     sharding,
-                    instance_meta,
+                    actual_instance_meta,
                     len(placement_node_ids),
                 )
             )
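The seen-set keying in the hunks above is the other half of the fix: because the dedup key now uses the coerced actual_instance_meta, a requested MlxJaccl placement that place_instance() downgraded to MlxRing collapses into the same preview as an explicit MlxRing request. A hedged sketch of that pattern with illustrative key values (the sharding and key shape here are placeholders, not exo's):

seen: set[tuple[str, str, str, int]] = set()
previews: list[tuple[str, str, str, int]] = []

# Two single-node requests: one asked for MlxJaccl (coerced to MlxRing),
# one asked for MlxRing directly. Keyed on the actual meta, they dedupe.
for requested_meta, actual_meta in [("MlxJaccl", "MlxRing"), ("MlxRing", "MlxRing")]:
    key = ("mlx-community/GLM-5-4bit", "pipeline", actual_meta, 1)
    if key not in seen:
        previews.append(key)
        seen.add(key)

assert len(previews) == 1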

View File

@@ -182,7 +182,6 @@ class ConfigData(BaseModel):
     def supports_tensor(self) -> bool:
         return self.architectures in [
             ["Glm4MoeLiteForCausalLM"],
-            ["GlmMoeDsaForCausalLM"],
             ["DeepseekV32ForCausalLM"],
             ["DeepseekV3ForCausalLM"],
             ["Qwen3NextForCausalLM"],

View File

@@ -285,7 +285,7 @@ def get_eos_token_ids_for_model(model_id: ModelId) -> list[int] | None:
     model_id_lower = model_id.lower()
     if "kimi-k2" in model_id_lower:
         return [163586]
-    elif "glm-5" in model_id_lower or "glm-4.7" in model_id_lower:
+    elif "glm-4.7-flash" in model_id_lower:
         # 154820: <|endoftext|>, 154827: <|user|>, 154829: <|observation|>
         return [154820, 154827, 154829]
     elif "glm" in model_id_lower: