mirror of https://github.com/exo-explore/exo.git
synced 2026-01-28 15:52:56 -05:00

Compare commits: v1.0.66...disable-md (6 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 40c74355f7 | |
| | f1a2d054ec | |
| | b3c8f85fc8 | |
| | a562114ba5 | |
| | 991d278119 | |
| | c55cbf6739 | |
@@ -5,7 +5,7 @@
   <img alt="exo logo" src="/docs/imgs/exo-logo-transparent.png" width="50%" height="50%">
 </picture>

-exo: Run your own AI cluster at home with everyday devices. Maintained by [exo labs](https://x.com/exolabs).
+exo: Run frontier AI locally. Maintained by [exo labs](https://x.com/exolabs).

 <p align="center">
   <a href="https://discord.gg/TJ4P57arEm" target="_blank" rel="noopener noreferrer"><img src="https://img.shields.io/badge/Discord-Join%20Server-5865F2?logo=discord&logoColor=white" alt="Discord"></a>
justfile (2 lines changed)
@@ -1,7 +1,7 @@
 export NIX_CONFIG := "extra-experimental-features = nix-command flakes"

 fmt:
-    nix fmt
+    treefmt || nix fmt

 lint:
     uv run ruff check --fix
@@ -17,9 +17,9 @@ dependencies = [
     "loguru>=0.7.3",
     "exo_pyo3_bindings", # rust bindings
     "anyio==4.11.0",
-    "mlx==0.30.3; sys_platform == 'darwin'",
-    "mlx[cpu]==0.30.3; sys_platform == 'linux'",
-    "mlx-lm==0.30.5",
+    "mlx==0.30.4; sys_platform == 'darwin'",
+    "mlx[cpu]==0.30.4; sys_platform == 'linux'",
+    "mlx-lm",
     "tiktoken>=0.12.0", # required for kimi k2 tokenizer
     "hypercorn>=0.18.0",
     "openai-harmony>=0.0.8",

@@ -63,6 +63,7 @@ members = [

 [tool.uv.sources]
 exo_pyo3_bindings = { workspace = true }
+mlx-lm = { git = "https://github.com/ml-explore/mlx-lm", branch = "main" }
 # Uncomment to use local mlx/mlx-lm development versions:
 # mlx = { path = "/Users/Shared/mlx", editable=true }
 # mlx-lm = { path = "/Users/Shared/mlx-lm", editable=true }
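A quick sanity check after resyncing the environment, sketched below: mlx is now pinned to 0.30.4 while mlx-lm is unpinned and pulled from the git main branch via the new `[tool.uv.sources]` entry, so only its presence is asserted. Illustrative only; the distribution names are assumed to match the package names above.

```python
# Illustrative check of the dependency bump in the hunk above.
from importlib.metadata import version

assert version("mlx") == "0.30.4"          # bumped from 0.30.3 in this change
print("mlx", version("mlx"), "| mlx-lm", version("mlx-lm"))  # mlx-lm tracks git main
```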
@@ -90,7 +90,6 @@ class Node:
         worker = Worker(
             node_id,
             session_id,
-            connection_message_receiver=router.receiver(topics.CONNECTION_MESSAGES),
             global_event_receiver=router.receiver(topics.GLOBAL_EVENTS),
             local_event_sender=router.sender(topics.LOCAL_EVENTS),
             command_sender=router.sender(topics.COMMANDS),

@@ -227,9 +226,6 @@ class Node:
         self.worker = Worker(
             self.node_id,
             result.session_id,
-            connection_message_receiver=self.router.receiver(
-                topics.CONNECTION_MESSAGES
-            ),
             global_event_receiver=self.router.receiver(
                 topics.GLOBAL_EVENTS
             ),
@@ -121,6 +121,14 @@ MODEL_CARDS: dict[str, ModelCard] = {
         supports_tensor=True,
         tasks=[ModelTask.TextGeneration],
     ),
+    "kimi-k2.5": ModelCard(
+        model_id=ModelId("mlx-community/Kimi-K2.5"),
+        storage_size=Memory.from_gb(617),
+        n_layers=61,
+        hidden_size=7168,
+        supports_tensor=True,
+        tasks=[ModelTask.TextGeneration],
+    ),
     # llama-3.1
     "llama-3.1-8b": ModelCard(
         model_id=ModelId("mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"),
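For scale, the new card declares roughly 617 GB of weights across 61 layers, so it only fits when sharded over several nodes. A back-of-the-envelope split in plain Python (a sketch only, not exo's actual shard planner):

```python
# Rough per-node memory for the Kimi-K2.5 card above (617 GB total).
def per_node_gb(total_gb: float, n_nodes: int) -> float:
    return total_gb / n_nodes

for n in (2, 4, 8):
    print(f"{n} nodes -> ~{per_node_gb(617, n):.0f} GB of weights each")
```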
@@ -23,6 +23,7 @@ from mlx_lm.models.glm4_moe_lite import Glm4MoeLiteDecoderLayer, Glm4MoeLiteMLP
 from mlx_lm.models.glm4_moe_lite import Model as GLM4MoeLiteModel
 from mlx_lm.models.gpt_oss import GptOssMoeModel
 from mlx_lm.models.gpt_oss import Model as GptOssModel
+from mlx_lm.models.kimi_k25 import Model as KimiK25Model
 from mlx_lm.models.llama import Model as LlamaModel
 from mlx_lm.models.minimax import Model as MiniMaxModel
 from mlx_lm.models.ministral3 import Model as Ministral3Model

@@ -344,7 +345,7 @@ def tensor_auto_parallel(
             all_to_sharded_linear_in_place,
             sharded_to_all_linear_in_place,
         )
-    elif isinstance(model, (DeepseekV3Model, DeepseekV32Model)):
+    elif isinstance(model, (DeepseekV3Model, DeepseekV32Model, KimiK25Model)):
         tensor_parallel_sharding_strategy = DeepSeekShardingStrategy(
             group,
             all_to_sharded_linear,

@@ -453,7 +454,7 @@ def _set_layers(model: nn.Module, layers: list[_LayerCallable]) -> None:

     # Update DeepSeek V3 specific parameters when layers are shrunk
     if isinstance(
-        model, (DeepseekV3Model, DeepseekV32Model, Glm4MoeModel)
+        model, (DeepseekV3Model, DeepseekV32Model, Glm4MoeModel, KimiK25Model)
     ) and hasattr(inner_model_instance, "num_layers"):
         logger.info(
             f"Setting num_layers to {len(layers)} for model {model.model.__class__.__name__}"

@@ -622,6 +623,7 @@ class MiniMaxShardingStrategy(TensorParallelShardingStrategy):
         on_timeout: TimeoutCallback | None,
     ) -> nn.Module:
         model = cast(MiniMaxModel, model)
+        rank = self.group.rank()
         for layer in model.layers:
             eval_with_timeout(
                 layer.parameters(), timeout_seconds / len(model.layers), on_timeout

@@ -631,6 +633,16 @@ class MiniMaxShardingStrategy(TensorParallelShardingStrategy):
             layer.self_attn.k_proj = self.all_to_sharded_linear(layer.self_attn.k_proj)
             layer.self_attn.v_proj = self.all_to_sharded_linear(layer.self_attn.v_proj)
             layer.self_attn.o_proj = self.sharded_to_all_linear(layer.self_attn.o_proj)

+            # Shard qk_norm weights if present (must match sharded head count)
+            if getattr(layer.self_attn, "use_qk_norm", False):
+                layer.self_attn.q_norm.weight = layer.self_attn.q_norm.weight.split(  # type: ignore
+                    self.N, axis=-1
+                )[rank]
+                layer.self_attn.k_norm.weight = layer.self_attn.k_norm.weight.split(  # type: ignore
+                    self.N, axis=-1
+                )[rank]
+
             layer.self_attn.num_attention_heads //= self.N
             layer.self_attn.num_key_value_heads //= self.N
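The new qk_norm branch splits the per-head norm scales so they stay aligned with the sharded q/k projections on each rank. A standalone sketch of that split using MLX (the group size, rank, and head geometry below are made-up values, not taken from the diff):

```python
import mlx.core as mx

# Sketch of the qk_norm sharding above: each tensor-parallel rank keeps only the
# slice of the norm weight matching the attention heads it owns after sharding.
N = 2                      # tensor-parallel group size (assumed)
rank = 0                   # this rank's index (assumed)
n_heads, head_dim = 8, 64  # made-up head geometry
q_norm_weight = mx.ones((n_heads * head_dim,))

local = mx.split(q_norm_weight, N, axis=-1)[rank]
print(local.shape)  # (256,) -> scales for the 4 heads this rank keeps
```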
@@ -165,12 +165,11 @@ def mlx_distributed_init(
     jaccl_coordinator = jaccl_coordinators[bound_instance.bound_node_id]

-    # TODO: update once upstream fixes
     logger.info(
-        f"rank {rank} MLX_JACCL_DEVICES: {coordination_file} with devices: {jaccl_devices_json}"
+        f"rank {rank} MLX_IBV_DEVICES: {coordination_file} with devices: {jaccl_devices_json}"
     )
     logger.info(f"rank {rank} MLX_JACCL_COORDINATOR: {jaccl_coordinator}")
-    os.environ["MLX_JACCL_DEVICES"] = coordination_file
+    os.environ["MLX_IBV_DEVICES"] = coordination_file
     os.environ["MLX_RANK"] = str(rank)
     os.environ["MLX_JACCL_COORDINATOR"] = jaccl_coordinator
     group = mx.distributed.init(backend="jaccl", strict=True)
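Condensed, each rank now sets this environment before initializing MLX's jaccl backend; the device-file path and coordinator address below are placeholders, not values from the repo, and the real code derives them from the coordination file and the bound instance.

```python
import os
import mlx.core as mx

# Placeholder values for illustration. Note the rename:
# MLX_IBV_DEVICES replaces the old MLX_JACCL_DEVICES variable.
os.environ["MLX_IBV_DEVICES"] = "/tmp/devices.json"
os.environ["MLX_RANK"] = "0"
os.environ["MLX_JACCL_COORDINATOR"] = "192.168.1.10:52415"
group = mx.distributed.init(backend="jaccl", strict=True)
print(group.rank(), group.size())
```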
@@ -259,10 +258,10 @@ def shard_and_load(
     logger.info(f"Group size: {group.size()}, group rank: {group.rank()}")

-    # Estimate timeout based on model size
-    base_timeout = float(os.environ.get("EXO_MODEL_LOAD_TIMEOUT", "60"))
+    # Estimate timeout based on model size (5x default for large queued workloads)
+    base_timeout = float(os.environ.get("EXO_MODEL_LOAD_TIMEOUT", "300"))
     model_size_gb = get_weights_size(shard_metadata).in_bytes / (1024**3)
-    timeout_seconds = base_timeout + model_size_gb / 5
+    timeout_seconds = base_timeout + model_size_gb
     logger.info(
         f"Evaluating model parameters with timeout of {timeout_seconds:.0f}s "
         f"(model size: {model_size_gb:.1f}GB)"
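Worked through, the new estimate gives large models much more headroom: a 300 s base (still overridable via EXO_MODEL_LOAD_TIMEOUT) plus one second per GB of weights, versus the old 60 s base plus 0.2 s per GB. A minimal sketch with the values from this hunk:

```python
import os

# New estimate from the hunk above vs. the old one, for a large model.
base_timeout = float(os.environ.get("EXO_MODEL_LOAD_TIMEOUT", "300"))
model_size_gb = 617.0  # e.g. the Kimi-K2.5 weights added in this compare
timeout_seconds = base_timeout + model_size_gb       # new formula
old_timeout = 60.0 + model_size_gb / 5               # previous formula
print(f"new: {timeout_seconds:.0f}s  old: {old_timeout:.0f}s")  # new: 917s  old: 183s
```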
@@ -339,8 +338,35 @@ def load_tokenizer_for_model_id(
     # Kimi uses a custom TikTokenTokenizer that transformers 5.x can't load via AutoTokenizer
     if "kimi-k2" in model_id_lower:
+        import importlib.util
+        import types
+
         sys.path.insert(0, str(model_path))
-        from tokenization_kimi import TikTokenTokenizer  # type: ignore[import-not-found]  # noqa: I001
+
+        # Load tool_declaration_ts first (tokenization_kimi imports it with relative import)
+        tool_decl_path = model_path / "tool_declaration_ts.py"
+        if tool_decl_path.exists():
+            spec = importlib.util.spec_from_file_location(
+                "tool_declaration_ts", tool_decl_path
+            )
+            if spec and spec.loader:
+                tool_decl_module = importlib.util.module_from_spec(spec)
+                sys.modules["tool_declaration_ts"] = tool_decl_module
+                spec.loader.exec_module(tool_decl_module)
+
+            # Load tokenization_kimi with patched source (convert relative to absolute import)
+            tok_path = model_path / "tokenization_kimi.py"
+            source = tok_path.read_text()
+            source = source.replace("from .tool_declaration_ts", "from tool_declaration_ts")
+            spec = importlib.util.spec_from_file_location("tokenization_kimi", tok_path)
+            if spec:
+                tok_module = types.ModuleType("tokenization_kimi")
+                tok_module.__file__ = str(tok_path)
+                sys.modules["tokenization_kimi"] = tok_module
+                exec(compile(source, tok_path, "exec"), tok_module.__dict__)  # noqa: S102
+                TikTokenTokenizer = tok_module.TikTokenTokenizer  # type: ignore[attr-defined]  # noqa: N806
+        else:
+            from tokenization_kimi import TikTokenTokenizer  # type: ignore[import-not-found]  # noqa: I001

         hf_tokenizer: Any = TikTokenTokenizer.from_pretrained(model_path)  # pyright: ignore[reportUnknownVariableType,reportUnknownMemberType]
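The loader above is the standard importlib recipe for executing a single .py file under a chosen module name, registered in sys.modules before execution so that absolute imports of it resolve. A self-contained sketch of that pattern (the helper name is ours, not exo's):

```python
import importlib.util
import sys
import types
from pathlib import Path


def load_module_from_path(name: str, path: Path) -> types.ModuleType:
    # Build a spec for the file, register the module under `name` before executing it
    # so imports performed during exec (and later absolute imports) can find it.
    spec = importlib.util.spec_from_file_location(name, path)
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot load {path}")
    module = importlib.util.module_from_spec(spec)
    sys.modules[name] = module
    spec.loader.exec_module(module)
    return module

# e.g. load_module_from_path("tool_declaration_ts", model_path / "tool_declaration_ts.py")
```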
@@ -7,7 +7,6 @@ from anyio import CancelScope, create_task_group, fail_after
 from anyio.abc import TaskGroup
 from loguru import logger

-from exo.routing.connection_message import ConnectionMessage, ConnectionMessageType
 from exo.shared.apply import apply
 from exo.shared.models.model_cards import ModelId
 from exo.shared.types.api import ImageEditsInternalParams

@@ -57,7 +56,6 @@ class Worker:
         node_id: NodeId,
         session_id: SessionId,
         *,
-        connection_message_receiver: Receiver[ConnectionMessage],
         global_event_receiver: Receiver[ForwarderEvent],
         local_event_sender: Sender[ForwarderEvent],
         # This is for requesting updates. It doesn't need to be a general command sender right now,

@@ -74,7 +72,6 @@ class Worker:
         self.event_index_counter = event_index_counter
         self.command_sender = command_sender
         self.download_command_sender = download_command_sender
-        self.connection_message_receiver = connection_message_receiver
         self.event_buffer = OrderedBuffer[Event]()
         self.out_for_delivery: dict[EventId, ForwarderEvent] = {}

@@ -105,7 +102,6 @@ class Worker:
             tg.start_soon(info_gatherer.run)
             tg.start_soon(self._forward_info, info_recv)
             tg.start_soon(self.plan_step)
-            tg.start_soon(self._connection_message_event_writer)
             tg.start_soon(self._resend_out_for_delivery)
             tg.start_soon(self._event_applier)
             tg.start_soon(self._forward_events)

@@ -279,41 +275,6 @@ class Worker:
         instance = self.state.instances[task.instance_id]
         return instance.shard_assignments.node_to_runner[self.node_id]

-    async def _connection_message_event_writer(self):
-        with self.connection_message_receiver as connection_messages:
-            async for msg in connection_messages:
-                await self.event_sender.send(
-                    self._convert_connection_message_to_event(msg)
-                )
-
-    def _convert_connection_message_to_event(self, msg: ConnectionMessage):
-        match msg.connection_type:
-            case ConnectionMessageType.Connected:
-                return TopologyEdgeCreated(
-                    conn=Connection(
-                        source=self.node_id,
-                        sink=msg.node_id,
-                        edge=SocketConnection(
-                            sink_multiaddr=Multiaddr(
-                                address=f"/ip4/{msg.remote_ipv4}/tcp/{msg.remote_tcp_port}"
-                            ),
-                        ),
-                    ),
-                )
-
-            case ConnectionMessageType.Disconnected:
-                return TopologyEdgeDeleted(
-                    conn=Connection(
-                        source=self.node_id,
-                        sink=msg.node_id,
-                        edge=SocketConnection(
-                            sink_multiaddr=Multiaddr(
-                                address=f"/ip4/{msg.remote_ipv4}/tcp/{msg.remote_tcp_port}"
-                            ),
-                        ),
-                    ),
-                )
-
     async def _nack_request(self, since_idx: int) -> None:
         # We request all events after (and including) the missing index.
         # This function is started whenever we receive an event that is out of sequence.