Compare commits

1 commit

Author: Alex Cheema
SHA1: 21b594b176

add GLM-5 model support
- Add model cards for GLM-5 variants: MXFP4-Q8, 4bit, 8bit-MXFP8, bf16
- Add GlmMoeDsaForCausalLM to supports_tensor whitelist
- Fix EOS tokens: GLM-5 and GLM-4.7 share the same EOS tokens
  (154820, 154827, 154829), so consolidate the match condition

Note: full tensor parallel support for GLM-5 requires additional
auto_parallel changes (CacheList compatibility, NullIndexer for DSA)
and upstream MLX fixes for MXFP quantized matmul.

Closes #1468

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Date: 2026-02-17 10:20:34 -08:00
7 changed files with 54 additions and 18 deletions

@@ -0,0 +1,12 @@
model_id = "mlx-community/GLM-5-4bit"
n_layers = 78
hidden_size = 6144
supports_tensor = true
tasks = ["TextGeneration"]
family = "glm"
quantization = "4bit"
base_model = "GLM 5"
capabilities = ["text", "thinking"]
[storage_size]
in_bytes = 418621403136

@@ -0,0 +1,12 @@
model_id = "mlx-community/GLM-5-8bit-MXFP8"
n_layers = 78
hidden_size = 6144
supports_tensor = true
tasks = ["TextGeneration"]
family = "glm"
quantization = "8bit"
base_model = "GLM 5"
capabilities = ["text", "thinking"]
[storage_size]
in_bytes = 767273926656

@@ -0,0 +1,12 @@
model_id = "mlx-community/GLM-5-MXFP4-Q8"
n_layers = 78
hidden_size = 6144
supports_tensor = true
tasks = ["TextGeneration"]
family = "glm"
quantization = "MXFP4-Q8"
base_model = "GLM 5"
capabilities = ["text", "thinking"]
[storage_size]
in_bytes = 405480321024

@@ -0,0 +1,12 @@
model_id = "mlx-community/GLM-5"
n_layers = 78
hidden_size = 6144
supports_tensor = true
tasks = ["TextGeneration"]
family = "glm"
quantization = "bf16"
base_model = "GLM 5"
capabilities = ["text", "thinking"]
[storage_size]
in_bytes = 1487822475264
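
The four cards above share one schema; only model_id, quantization, and storage_size.in_bytes differ. As a rough illustration of what those fields give a placement layer, here is a minimal sketch that reads such a card with Python's standard tomllib and converts the size to GiB; the file path, the node count, and the even split are illustrative assumptions, not exo's actual loader or placement logic (see the placement-preview diff further down).

import tomllib  # stdlib, Python 3.11+

# Hypothetical local path to one of the cards added in this commit.
with open("GLM-5-4bit.toml", "rb") as f:
    card = tomllib.load(f)

total_bytes = card["storage_size"]["in_bytes"]  # 418_621_403_136 for the 4bit card
print(card["model_id"], card["quantization"])
print(f"~{total_bytes / 2**30:.0f} GiB on disk")

# Naive even split across nodes, purely for a sense of scale; the real
# per-node memory delta is derived from shard assignments, not an even division.
n_nodes = 4
print(f"~{total_bytes / (n_nodes * 2**30):.0f} GiB per node if split evenly")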

@@ -149,12 +149,7 @@ from exo.shared.types.openai_responses import (
     ResponsesResponse,
 )
 from exo.shared.types.state import State
-from exo.shared.types.worker.instances import (
-    Instance,
-    InstanceId,
-    InstanceMeta,
-    MlxJacclInstance,
-)
+from exo.shared.types.worker.instances import Instance, InstanceId, InstanceMeta
 from exo.shared.types.worker.shards import Sharding
 from exo.utils.banner import print_startup_banner
 from exo.utils.channels import Receiver, Sender, channel
@@ -496,14 +491,6 @@ class API:
         shard_assignments = instance.shard_assignments
         placement_node_ids = list(shard_assignments.node_to_runner.keys())
-        # Derive instance_meta from the actual instance type, since
-        # place_instance() may override it (e.g., single-node → MlxRing)
-        actual_instance_meta = (
-            InstanceMeta.MlxJaccl
-            if isinstance(instance, MlxJacclInstance)
-            else InstanceMeta.MlxRing
-        )
         memory_delta_by_node: dict[str, int] = {}
         if placement_node_ids:
             total_bytes = model_card.storage_size.in_bytes
@@ -516,14 +503,14 @@
             if (
                 model_card.model_id,
                 sharding,
-                actual_instance_meta,
+                instance_meta,
                 len(placement_node_ids),
             ) not in seen:
                 previews.append(
                     PlacementPreview(
                         model_id=model_card.model_id,
                         sharding=sharding,
-                        instance_meta=actual_instance_meta,
+                        instance_meta=instance_meta,
                         instance=instance,
                         memory_delta_by_node=memory_delta_by_node or None,
                         error=None,
@@ -533,7 +520,7 @@
                     (
                         model_card.model_id,
                         sharding,
-                        actual_instance_meta,
+                        instance_meta,
                         len(placement_node_ids),
                     )
                 )
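
After this change the preview dedup key uses instance_meta as-is rather than re-deriving it from the instance type. The surrounding dedup pattern, sketched with plain strings and ints standing in for exo's ModelId, Sharding, and InstanceMeta types (the candidate tuples below are made up for illustration):

seen: set[tuple[str, str, str, int]] = set()
previews: list[dict] = []

# Illustrative candidates only; values are not taken from a real cluster.
candidates = [
    ("mlx-community/GLM-5-4bit", "pipeline", "MlxRing", 2),
    ("mlx-community/GLM-5-4bit", "pipeline", "MlxRing", 2),  # duplicate key
    ("mlx-community/GLM-5-4bit", "tensor", "MlxJaccl", 2),
]

for model_id, sharding, instance_meta, n_nodes in candidates:
    key = (model_id, sharding, instance_meta, n_nodes)
    if key not in seen:
        previews.append(
            {"model_id": model_id, "sharding": sharding,
             "instance_meta": instance_meta, "nodes": n_nodes}
        )
        seen.add(key)

assert len(previews) == 2  # the duplicate (model, sharding, meta, n) key is skipped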

@@ -182,6 +182,7 @@ class ConfigData(BaseModel):
     def supports_tensor(self) -> bool:
         return self.architectures in [
             ["Glm4MoeLiteForCausalLM"],
+            ["GlmMoeDsaForCausalLM"],
             ["DeepseekV32ForCausalLM"],
             ["DeepseekV3ForCausalLM"],
             ["Qwen3NextForCausalLM"],

@@ -285,7 +285,7 @@ def get_eos_token_ids_for_model(model_id: ModelId) -> list[int] | None:
     model_id_lower = model_id.lower()
     if "kimi-k2" in model_id_lower:
         return [163586]
-    elif "glm-4.7-flash" in model_id_lower:
+    elif "glm-5" in model_id_lower or "glm-4.7" in model_id_lower:
         # 154820: <|endoftext|>, 154827: <|user|>, 154829: <|observation|>
         return [154820, 154827, 154829]
     elif "glm" in model_id_lower: