Compare commits

...

1 Commit

Author: Alex Cheema
SHA1: 21b594b176
Date: 2026-02-17 10:20:34 -08:00

add GLM-5 model support
- Add model cards for GLM-5 variants: MXFP4-Q8, 4bit, 8bit-MXFP8, bf16
- Add GlmMoeDsaForCausalLM to the supports_tensor whitelist
- Fix EOS tokens: GLM-5 and GLM-4.7 share the same EOS tokens
  (154820, 154827, 154829); consolidate the match condition

Note: full tensor parallel support for GLM-5 requires additional
auto_parallel changes (CacheList compatibility, NullIndexer for DSA)
and upstream MLX fixes for MXFP quantized matmul.

Closes #1468

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

6 changed files with 50 additions and 1 deletion


@@ -0,0 +1,12 @@
model_id = "mlx-community/GLM-5-4bit"
n_layers = 78
hidden_size = 6144
supports_tensor = true
tasks = ["TextGeneration"]
family = "glm"
quantization = "4bit"
base_model = "GLM 5"
capabilities = ["text", "thinking"]
[storage_size]
in_bytes = 418621403136
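
For reference, a minimal sketch of reading one of these cards with Python's standard tomllib parser; the local file path below is hypothetical, and this is not necessarily how exo itself loads model cards:

import tomllib  # standard-library TOML parser (Python 3.11+)

# Hypothetical path; where these cards live in the repo is not shown in this diff.
with open("GLM-5-4bit.toml", "rb") as f:
    card = tomllib.load(f)

print(card["model_id"])                        # mlx-community/GLM-5-4bit
print(card["supports_tensor"])                 # True
print(card["storage_size"]["in_bytes"] / 1e9)  # ~418.6, i.e. roughly 419 GB on disk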


@@ -0,0 +1,12 @@
model_id = "mlx-community/GLM-5-8bit-MXFP8"
n_layers = 78
hidden_size = 6144
supports_tensor = true
tasks = ["TextGeneration"]
family = "glm"
quantization = "8bit"
base_model = "GLM 5"
capabilities = ["text", "thinking"]
[storage_size]
in_bytes = 767273926656


@@ -0,0 +1,12 @@
model_id = "mlx-community/GLM-5-MXFP4-Q8"
n_layers = 78
hidden_size = 6144
supports_tensor = true
tasks = ["TextGeneration"]
family = "glm"
quantization = "MXFP4-Q8"
base_model = "GLM 5"
capabilities = ["text", "thinking"]
[storage_size]
in_bytes = 405480321024


@@ -0,0 +1,12 @@
model_id = "mlx-community/GLM-5"
n_layers = 78
hidden_size = 6144
supports_tensor = true
tasks = ["TextGeneration"]
family = "glm"
quantization = "bf16"
base_model = "GLM 5"
capabilities = ["text", "thinking"]
[storage_size]
in_bytes = 1487822475264


@@ -182,6 +182,7 @@ class ConfigData(BaseModel):
     def supports_tensor(self) -> bool:
         return self.architectures in [
             ["Glm4MoeLiteForCausalLM"],
+            ["GlmMoeDsaForCausalLM"],
             ["DeepseekV32ForCausalLM"],
             ["DeepseekV3ForCausalLM"],
             ["Qwen3NextForCausalLM"],


@@ -285,7 +285,7 @@ def get_eos_token_ids_for_model(model_id: ModelId) -> list[int] | None:
     model_id_lower = model_id.lower()
     if "kimi-k2" in model_id_lower:
         return [163586]
-    elif "glm-4.7-flash" in model_id_lower:
+    elif "glm-5" in model_id_lower or "glm-4.7" in model_id_lower:
         # 154820: <|endoftext|>, 154827: <|user|>, 154829: <|observation|>
         return [154820, 154827, 154829]
     elif "glm" in model_id_lower: