Compare commits

..

1 Commits

Author SHA1 Message Date
Alex Cheema
c4d24a24e4 Handle MessageTooLarge error in router to prevent node crash
When a message exceeds gossipsub's 1 MiB limit, the resulting
RuntimeError("MessageTooLarge") was unhandled in _networking_publish,
causing the entire TaskGroup to fail and crash the node. Now catch
this error and log it, dropping the oversized message gracefully.

Fixes #1296

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 10:34:22 -08:00
4 changed files with 19 additions and 66 deletions

View File

@@ -211,6 +211,14 @@ class Router:
pass
except AllQueuesFullError:
logger.warning(f"All peer queues full, dropping message on {topic}")
except RuntimeError as e:
if "MessageTooLarge" in str(e):
logger.error(
f"Message too large for gossipsub on topic {topic} "
f"({len(data)} bytes), dropping message"
)
else:
raise
def get_node_id_keypair(

View File

@@ -165,7 +165,6 @@ def is_custom_card(model_id: ModelId) -> bool:
class ConfigData(BaseModel):
model_config = {"extra": "ignore"} # Allow unknown fields
model_type: str | None = None
architectures: list[str] | None = None
hidden_size: Annotated[int, Field(ge=0)] | None = None
layer_count: int = Field(
@@ -201,7 +200,6 @@ class ConfigData(BaseModel):
return data
for field in [
"model_type",
"architectures",
"hidden_size",
"num_hidden_layers",

View File

@@ -269,52 +269,19 @@ def get_tokenizer(model_path: Path, shard_metadata: ShardMetadata) -> TokenizerW
return load_tokenizer_for_model_id(shard_metadata.model_card.model_id, model_path)
def _read_model_type_from_config(model_path: Path) -> str | None:
"""Read the model_type field from config.json at the given model path.
Returns None if config.json doesn't exist or doesn't contain model_type.
def get_eos_token_ids_for_model(model_id: ModelId) -> list[int] | None:
"""
config_path = model_path / "config.json"
if not config_path.exists():
return None
try:
with open(config_path) as f:
config: dict[str, Any] = json.load(f) # pyright: ignore[reportAny]
model_type: Any = config.get("model_type")
if model_type is None:
text_config: Any = config.get("text_config")
if isinstance(text_config, dict):
model_type = text_config.get("model_type") # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType]
return model_type if isinstance(model_type, str) else None
except (json.JSONDecodeError, OSError):
return None
Get the EOS token IDs for a model based on its ID.
def get_eos_token_ids_for_model(
model_id: ModelId, model_type: str | None = None
) -> list[int] | None:
"""Get the EOS token IDs for a model based on its architecture type.
Uses model_type from config.json when available, falls back to model_id
string matching for backward compatibility.
Some models require explicit EOS token configuration that isn't in their
tokenizer config. This function returns the known EOS token IDs for such models.
Args:
model_id: The HuggingFace model ID
model_type: The model_type field from config.json (e.g., "kimi", "glm4")
Returns:
List of EOS token IDs, or None if the model uses standard tokenizer config
"""
if model_type is not None:
if model_type == "kimi":
return [163586]
elif model_type == "glm4_moe_lite":
# 154820: <|endoftext|>, 154827: <|user|>, 154829: <|observation|>
return [154820, 154827, 154829]
elif model_type.startswith("glm"):
return [151336, 151329, 151338]
# Fallback: string matching on model_id
model_id_lower = model_id.lower()
if "kimi-k2" in model_id_lower:
return [163586]
@@ -329,10 +296,11 @@ def get_eos_token_ids_for_model(
def load_tokenizer_for_model_id(
model_id: ModelId, model_path: Path
) -> TokenizerWrapper:
"""Load tokenizer for a model given its ID and local path.
"""
Load tokenizer for a model given its ID and local path.
Uses model_type from config.json for architecture detection when available,
falling back to model_id string matching for backward compatibility.
This is the core tokenizer loading logic, handling special cases for different
model families (Kimi, GLM, etc.) and transformers 5.x compatibility.
Args:
model_id: The HuggingFace model ID (e.g., "moonshotai/Kimi-K2-Instruct")
@@ -341,21 +309,11 @@ def load_tokenizer_for_model_id(
Returns:
TokenizerWrapper instance configured for the model
"""
model_type = _read_model_type_from_config(model_path)
model_id_lower = model_id.lower()
eos_token_ids = get_eos_token_ids_for_model(model_id, model_type=model_type)
is_kimi = (
model_type == "kimi" if model_type is not None else "kimi-k2" in model_id_lower
)
is_gemma3 = (
model_type == "gemma3"
if model_type is not None
else "gemma-3" in model_id_lower
)
eos_token_ids = get_eos_token_ids_for_model(model_id)
# Kimi uses a custom TikTokenTokenizer that transformers 5.x can't load via AutoTokenizer
if is_kimi:
if "kimi-k2" in model_id_lower:
import importlib.util
import types
@@ -409,7 +367,7 @@ def load_tokenizer_for_model_id(
eos_token_ids=eos_token_ids,
)
if is_gemma3:
if "gemma-3" in model_id_lower:
gemma_3_eos_id = 1
gemma_3_end_of_turn_id = 106
if tokenizer.eos_token_ids is not None:

View File

@@ -24,7 +24,6 @@ from exo.worker.engines.mlx.utils_mlx import (
# Files needed for tokenizer functionality
TOKENIZER_FILE_PATTERNS = [
"config.json",
"tokenizer.json",
"tokenizer_config.json",
"special_tokens_map.json",
@@ -339,9 +338,6 @@ async def test_kimi_tokenizer_specifically():
# Verify EOS token is set
assert eos_token_ids == [163586], "Kimi EOS token should be [163586]"
# Verify architecture-based detection gives same result
assert get_eos_token_ids_for_model(model_id, model_type="kimi") == [163586]
# Test GLM tokenizer since it also has special handling
@pytest.mark.asyncio
@@ -382,10 +378,3 @@ async def test_glm_tokenizer_specifically():
151329,
151338,
], "GLM EOS tokens should be correct"
# Verify architecture-based detection gives same result
assert get_eos_token_ids_for_model(model_id, model_type="glm4") == [
151336,
151329,
151338,
]