Add support for Step 3.5 flash! (#1460 )

## Motivation Working version of #1366 ## Changes Add Step 3.5 Flash ## Test Plan ### Manual Testing Works! ### Automated Testing Running two processes tensor/pipeline sharded gives same logits as single process.
2026-02-13 07:32:30 -05:00 · 2026-02-13 12:10:18 +00:00
17 changed files with 255 additions and 563 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,15 +0,0 @@
-.venv/
-.direnv/
-target/
-.git/
-.idea/
-.pytest_cache/
-.ruff_cache/
-dashboard/node_modules/
-dashboard/.svelte-kit/
-dashboard/build/
-dist/
-*.pdb
-**/__pycache__
-**/.DS_Store
-.mlx_typings/
--- a/.github/workflows/e2e.yml
+++ b/.github/workflows/e2e.yml
@@ -1,29 +0,0 @@
-name: e2e-tests
-
-on:
-  push:
-  pull_request:
-    branches:
-      - staging
-      - main
-
-jobs:
-  e2e:
-    runs-on: ubuntu-latest
-    timeout-minutes: 30
-    steps:
-      - name: Free up disk space
-        run: |
-          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc \
-            /opt/hostedtoolcache /usr/local/share/boost /usr/share/swift \
-            /opt/microsoft /opt/az
-          docker system prune -af
-          df -h /
-
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          lfs: false
-
-      - name: Run E2E tests
-        run: python3 e2e/run_all.py
--- a/.mlx_typings/mlx_lm/models/step3p5.pyi
+++ b/.mlx_typings/mlx_lm/models/step3p5.pyi
@@ -0,0 +1,151 @@
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+import mlx.core as mx
+import mlx.nn as nn
+
+from .base import BaseModelArgs
+from .switch_layers import SwitchGLU
+
+@dataclass
+class ModelArgs(BaseModelArgs):
+    model_type: str
+    hidden_size: int
+    num_hidden_layers: int
+    vocab_size: int
+    num_attention_heads: int
+    num_attention_groups: int
+    head_dim: int
+    intermediate_size: int
+    rms_norm_eps: float
+    rope_theta: float
+    rope_scaling: Optional[Dict[str, Any]]
+    max_position_embeddings: int
+    sliding_window: int
+    layer_types: Optional[List[str]]
+    yarn_only_types: Optional[List[str]]
+    partial_rotary_factors: Optional[List[float]]
+    attention_other_setting: Optional[Dict[str, Any]]
+    use_head_wise_attn_gate: bool
+    moe_num_experts: int
+    moe_top_k: int
+    moe_intermediate_size: int
+    share_expert_dim: int
+    moe_layers_enum: Optional[str]
+    moe_router_scaling_factor: float
+    norm_expert_weight: bool
+    swiglu_limits: Optional[List[float]]
+    swiglu_limits_shared: Optional[List[float]]
+    tie_word_embeddings: bool
+
+class Step3p5MLP(nn.Module):
+    hidden_size: int
+    intermediate_size: int
+    gate_proj: nn.Linear
+    up_proj: nn.Linear
+    down_proj: nn.Linear
+    limit: Optional[float]
+
+    def __init__(
+        self, args: ModelArgs, intermediate_size: int, swiglu_limit: float = 0
+    ) -> None: ...
+    def __call__(self, x: mx.array) -> mx.array: ...
+
+class Step3p5MoEGate(nn.Module):
+    top_k: int
+    n_routed_experts: int
+    routed_scaling_factor: float
+    norm_topk_prob: bool
+    gate: nn.Linear
+    router_bias: mx.array
+
+    def __init__(self, args: ModelArgs) -> None: ...
+    def __call__(self, x: mx.array) -> tuple[mx.array, mx.array]: ...
+
+class Step3p5MoE(nn.Module):
+    gate: Step3p5MoEGate
+    switch_mlp: SwitchGLU
+    share_expert: Step3p5MLP
+    sharding_group: Optional[mx.distributed.Group]
+
+    def __init__(self, args: ModelArgs, layer_idx: int) -> None: ...
+    def __call__(self, x: mx.array) -> mx.array: ...
+
+class Step3p5Attention(nn.Module):
+    is_sliding: bool
+    num_heads: int
+    num_kv_heads: int
+    head_dim: int
+    scale: float
+    q_proj: nn.Linear
+    k_proj: nn.Linear
+    v_proj: nn.Linear
+    o_proj: nn.Linear
+    q_norm: nn.Module
+    k_norm: nn.Module
+    use_head_wise_attn_gate: bool
+    g_proj: nn.Linear
+    rope: nn.Module
+
+    def __init__(self, args: ModelArgs, layer_idx: int) -> None: ...
+    def __call__(
+        self,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+        cache: Optional[Any] = None,
+    ) -> mx.array: ...
+
+class Step3p5DecoderLayer(nn.Module):
+    self_attn: Step3p5Attention
+    is_sliding: bool
+    is_moe_layer: bool
+    mlp: Step3p5MLP | Step3p5MoE
+    input_layernorm: nn.Module
+    post_attention_layernorm: nn.Module
+
+    def __init__(self, args: ModelArgs, layer_idx: int) -> None: ...
+    def __call__(
+        self,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+        cache: Optional[Any] = None,
+    ) -> mx.array: ...
+
+class Step3p5Model(nn.Module):
+    args: ModelArgs
+    vocab_size: int
+    num_layers: int
+    embed_tokens: nn.Embedding
+    layers: list[Step3p5DecoderLayer]
+    norm: nn.Module
+    _swa_idx: Optional[int]
+    _full_idx: Optional[int]
+
+    def __init__(self, args: ModelArgs) -> None: ...
+    def __call__(
+        self,
+        x: mx.array,
+        cache: Optional[List[Any]] = None,
+    ) -> mx.array: ...
+
+class Model(nn.Module):
+    args: ModelArgs
+    model_type: str
+    model: Step3p5Model
+    lm_head: nn.Linear
+
+    def __init__(self, args: ModelArgs) -> None: ...
+    def __call__(
+        self,
+        inputs: mx.array,
+        cache: Optional[List[Any]] = None,
+    ) -> mx.array: ...
+    def sanitize(self, weights: dict[str, Any]) -> dict[str, Any]: ...
+    def shard(self, group: Optional[mx.distributed.Group] = None) -> None: ...
+    @property
+    def layers(self) -> list[Step3p5DecoderLayer]: ...
+    def make_cache(self) -> list[Any]: ...
+    @property
+    def cast_predicate(self) -> Any: ...
+    @property
+    def quant_predicate(self) -> Any: ...
--- a/e2e/Dockerfile
+++ b/e2e/Dockerfile
@@ -1,53 +0,0 @@
-# Stage 1: Build the dashboard
-FROM node:22-slim AS dashboard
-WORKDIR /app/dashboard
-COPY dashboard/package.json dashboard/package-lock.json ./
-RUN npm ci
-COPY dashboard/ .
-RUN npm run build
-
-# Stage 2: Build and run exo
-FROM python:3.13-slim
-
-# Install system dependencies
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    pkg-config \
-    libssl-dev \
-    curl \
-    protobuf-compiler \
-    iptables \
-    && rm -rf /var/lib/apt/lists/*
-
-# Install Rust nightly
-RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain nightly
-ENV PATH="/root/.cargo/bin:${PATH}"
-
-# Install uv
-COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
-
-WORKDIR /app
-
-# Copy dependency files first for better layer caching
-COPY pyproject.toml Cargo.toml uv.lock README.md ./
-COPY rust/ ./rust/
-COPY bench/pyproject.toml ./bench/pyproject.toml
-
-# Copy source and resources
-COPY src/ ./src/
-COPY resources/ ./resources/
-
-# Copy built dashboard from stage 1
-COPY --from=dashboard /app/dashboard/build ./dashboard/build/
-
-# Install Python deps and build Rust bindings, then clean up build artifacts
-# to keep the layer small (Rust target/ and cargo registry can be 1-2 GB)
-RUN uv sync && rm -rf /app/rust/target /root/.cargo/registry /root/.cargo/git
-
-# Wrap g++ with -fpermissive to fix MLX CPU JIT compilation with GCC 14
-# (GCC 14 treats _Float128/_Float32/_Float64 as built-in types, conflicting with MLX-generated code)
-RUN mv /usr/bin/g++ /usr/bin/g++.real && \
-    printf '#!/bin/sh\nexec /usr/bin/g++.real -fpermissive "$@"\n' > /usr/bin/g++ && \
-    chmod +x /usr/bin/g++
-
-CMD [".venv/bin/exo", "-v"]
--- a/e2e/conftest.py
+++ b/e2e/conftest.py
@@ -1,182 +0,0 @@
-"""Shared E2E test infrastructure for exo cluster tests."""
-
-import asyncio
-import json
-import os
-import sys
-from pathlib import Path
-from urllib.error import URLError
-from urllib.request import Request, urlopen
-
-E2E_DIR = Path(__file__).parent.resolve()
-TIMEOUT = int(os.environ.get("E2E_TIMEOUT", "120"))
-
-
-class Cluster:
-    """Async wrapper around a docker compose exo cluster."""
-
-    def __init__(self, name: str, overrides: list[str] | None = None):
-        self.name = name
-        self.project = f"e2e-{name}"
-        compose_files = [str(E2E_DIR / "docker-compose.yml")]
-        for path in overrides or []:
-            compose_files.append(str(E2E_DIR / path))
-        self._compose_base = [
-            "docker",
-            "compose",
-            "-p",
-            self.project,
-            *[arg for f in compose_files for arg in ("-f", f)],
-        ]
-
-    async def __aenter__(self):
-        return self
-
-    async def __aexit__(self, *exc):
-        await self.stop()
-
-    async def _run(self, *args: str, check: bool = True) -> str:
-        proc = await asyncio.create_subprocess_exec(
-            *self._compose_base,
-            *args,
-            stdout=asyncio.subprocess.PIPE,
-            stderr=asyncio.subprocess.STDOUT,
-        )
-        stdout, _ = await proc.communicate()
-        output = stdout.decode()
-        if check and proc.returncode != 0:
-            print(output, file=sys.stderr)
-            raise RuntimeError(
-                f"docker compose {' '.join(args)} failed (rc={proc.returncode})"
-            )
-        return output
-
-    async def build(self):
-        print("  Building images...")
-        await self._run("build", "--quiet")
-
-    async def start(self):
-        print("  Starting cluster...")
-        await self._run("up", "-d")
-
-    async def stop(self):
-        print("  Cleaning up...")
-        await self._run("down", "--timeout", "5", check=False)
-
-    async def logs(self) -> str:
-        return await self._run("logs", check=False)
-
-    async def exec(
-        self, service: str, *cmd: str, check: bool = True
-    ) -> tuple[int, str]:
-        """Run a command inside a running container. Returns (returncode, output)."""
-        proc = await asyncio.create_subprocess_exec(
-            *self._compose_base,
-            "exec",
-            "-T",
-            service,
-            *cmd,
-            stdout=asyncio.subprocess.PIPE,
-            stderr=asyncio.subprocess.STDOUT,
-        )
-        stdout, _ = await proc.communicate()
-        output = stdout.decode()
-        if check and proc.returncode != 0:
-            raise RuntimeError(
-                f"exec {' '.join(cmd)} in {service} failed (rc={proc.returncode})"
-            )
-        return proc.returncode, output
-
-    async def wait_for(self, description: str, check_fn, timeout: int = TIMEOUT):
-        """Poll check_fn every 2s until it returns True or timeout expires."""
-        print(f"  Waiting for {description}...")
-        deadline = asyncio.get_event_loop().time() + timeout
-        while asyncio.get_event_loop().time() < deadline:
-            if await check_fn():
-                print(f"  {description}")
-                return
-            await asyncio.sleep(2)
-        output = await self.logs()
-        print(f"--- cluster logs ---\n{output}\n---", file=sys.stderr)
-        raise TimeoutError(f"Timed out waiting for {description}")
-
-    async def assert_healthy(self):
-        """Verify the cluster formed correctly: nodes started, discovered each other, elected a master, API responds."""
-
-        async def both_nodes_started():
-            log = await self.logs()
-            return log.count("Starting node") >= 2
-
-        async def nodes_discovered():
-            log = await self.logs()
-            return log.count("ConnectionMessageType.Connected") >= 2
-
-        async def master_elected():
-            log = await self.logs()
-            return "demoting self" in log
-
-        async def api_responding():
-            try:
-                with urlopen("http://localhost:52415/v1/models", timeout=3) as resp:
-                    return resp.status == 200
-            except (URLError, OSError):
-                return False
-
-        await self.wait_for("Both nodes started", both_nodes_started)
-        await self.wait_for("Nodes discovered each other", nodes_discovered)
-        await self.wait_for("Master election resolved", master_elected)
-        await self.wait_for("API responding", api_responding)
-
-    async def _api(
-        self, method: str, path: str, body: dict | None = None, timeout: int = 30
-    ) -> dict:
-        """Make an API request to the cluster. Returns parsed JSON."""
-        url = f"http://localhost:52415{path}"
-        data = json.dumps(body).encode() if body else None
-        req = Request(
-            url, data=data, headers={"Content-Type": "application/json"}, method=method
-        )
-        loop = asyncio.get_event_loop()
-        resp_bytes = await loop.run_in_executor(
-            None, lambda: urlopen(req, timeout=timeout).read()
-        )
-        return json.loads(resp_bytes)
-
-    async def place_model(self, model: str, timeout: int = 600):
-        """Place a model instance on the cluster (triggers download) and wait until it's ready."""
-        await self._api("POST", "/place_instance", {"model_id": model})
-
-        async def model_ready():
-            try:
-                resp = await self._api("GET", "/v1/models")
-                return any(m.get("id") == model for m in resp.get("data", []))
-            except Exception:
-                return False
-
-        await self.wait_for(f"Model {model} ready", model_ready, timeout=timeout)
-
-    async def chat(
-        self, model: str, messages: list[dict], timeout: int = 600, **kwargs
-    ) -> dict:
-        """Send a chat completion request. Retries until model is downloaded and inference completes."""
-        body = json.dumps({"model": model, "messages": messages, **kwargs}).encode()
-        deadline = asyncio.get_event_loop().time() + timeout
-        last_error = None
-
-        while asyncio.get_event_loop().time() < deadline:
-            try:
-                req = Request(
-                    "http://localhost:52415/v1/chat/completions",
-                    data=body,
-                    headers={"Content-Type": "application/json"},
-                )
-                loop = asyncio.get_event_loop()
-                resp_bytes = await loop.run_in_executor(
-                    None, lambda r=req: urlopen(r, timeout=300).read()
-                )
-                return json.loads(resp_bytes)
-            except Exception as e:
-                last_error = e
-                await asyncio.sleep(5)
-
-        raise TimeoutError(f"Chat request failed after {timeout}s: {last_error}")
--- a/e2e/docker-compose.yml
+++ b/e2e/docker-compose.yml
@@ -1,18 +0,0 @@
-services:
-  exo-node-1:
-    build:
-      context: ..
-      dockerfile: e2e/Dockerfile
-    environment:
-      - EXO_LIBP2P_NAMESPACE=docker-e2e
-    command: [".venv/bin/exo", "-v"]
-    ports:
-      - "52415:52415"
-
-  exo-node-2:
-    build:
-      context: ..
-      dockerfile: e2e/Dockerfile
-    environment:
-      - EXO_LIBP2P_NAMESPACE=docker-e2e
-    command: [".venv/bin/exo", "-v"]
--- a/e2e/run_all.py
+++ b/e2e/run_all.py
@@ -1,75 +0,0 @@
-#!/usr/bin/env python3
-"""Discovers and runs all E2E tests in e2e/test_*.py.
-
-Tests with '# slow' on the first line of their docstring are skipped
-unless --slow is passed or E2E_SLOW=1 is set.
-"""
-
-import os
-import subprocess
-import sys
-from pathlib import Path
-
-E2E_DIR = Path(__file__).parent.resolve()
-
-
-def is_slow(test_file: Path) -> bool:
-    """Check if the test file is marked as slow (has '# slow' in first 3 lines)."""
-    with open(test_file) as f:
-        for line in f:
-            if line.strip().startswith("#"):
-                continue
-            if line.strip().startswith('"""') or line.strip().startswith("'''"):
-                # Read into the docstring
-                for doc_line in f:
-                    if "slow" in doc_line.lower() and doc_line.strip().startswith(
-                        "slow"
-                    ):
-                        return True
-                    if '"""' in doc_line or "'''" in doc_line:
-                        break
-            break
-    return False
-
-
-def main():
-    run_slow = "--slow" in sys.argv or os.environ.get("E2E_SLOW") == "1"
-    test_files = sorted(E2E_DIR.glob("test_*.py"))
-    if not test_files:
-        print("No test files found")
-        sys.exit(1)
-
-    passed = 0
-    failed = 0
-    skipped = 0
-    failures = []
-
-    for test_file in test_files:
-        name = test_file.stem
-        if is_slow(test_file) and not run_slow:
-            print(f"=== {name} === SKIPPED (slow, use --slow to run)")
-            skipped += 1
-            continue
-
-        print(f"=== {name} ===")
-        result = subprocess.run([sys.executable, str(test_file)])
-        if result.returncode == 0:
-            passed += 1
-        else:
-            failed += 1
-            failures.append(name)
-        print()
-
-    total = passed + failed + skipped
-    print("================================")
-    print(
-        f"{passed}/{total} tests passed" + (f", {skipped} skipped" if skipped else "")
-    )
-
-    if failed:
-        print(f"Failed: {' '.join(failures)}")
-        sys.exit(1)
-
-
-if __name__ == "__main__":
-    main()
--- a/e2e/snapshots/inference.json
+++ b/e2e/snapshots/inference.json
@@ -1,8 +0,0 @@
-{
-  "model": "mlx-community/Qwen3-0.6B-4bit",
-  "seed": 42,
-  "temperature": 0,
-  "prompt": "What is 2+2? Reply with just the number.",
-  "max_tokens": 32,
-  "content": "<think>\nOkay, so I need to figure out what 2+2 is. Let me think. Well, if you add 2 and 2 together"
-}
--- a/e2e/test_cluster_formation.py
+++ b/e2e/test_cluster_formation.py
@@ -1,22 +0,0 @@
-"""Test: Basic cluster formation.
-
-Verifies two nodes discover each other, elect a master, and the API responds.
-"""
-
-import asyncio
-import sys
-
-sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent))
-from conftest import Cluster
-
-
-async def main():
-    async with Cluster("cluster_formation") as cluster:
-        await cluster.build()
-        await cluster.start()
-        await cluster.assert_healthy()
-        print("PASSED: cluster_formation")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
--- a/e2e/test_inference_snapshot.py
+++ b/e2e/test_inference_snapshot.py
@@ -1,82 +0,0 @@
-"""Test: Deterministic inference output (snapshot test).
-slow
-
-Sends a chat completion request with a fixed seed and temperature=0,
-then verifies the output matches a known-good snapshot. This ensures
-inference produces consistent results across runs.
-
-Requires a machine that can run MLX inference at reasonable speed (Apple Silicon).
-Run with: python3 e2e/run_all.py --slow  or  E2E_SLOW=1 python3 e2e/run_all.py
-"""
-
-import asyncio
-import json
-import sys
-from pathlib import Path
-
-sys.path.insert(0, str(Path(__file__).parent))
-from conftest import Cluster
-
-MODEL = "mlx-community/Qwen3-0.6B-4bit"
-SEED = 42
-PROMPT = "What is 2+2? Reply with just the number."
-MAX_TOKENS = 32
-SNAPSHOT_FILE = Path(__file__).parent / "snapshots" / "inference.json"
-
-
-async def main():
-    async with Cluster("inference_snapshot") as cluster:
-        await cluster.build()
-        await cluster.start()
-        await cluster.assert_healthy()
-
-        # Launch the model instance (triggers download + placement)
-        print(f"  Launching model {MODEL}...")
-        await cluster.place_model(MODEL)
-
-        print(f"  Sending chat completion (seed={SEED}, temperature=0)...")
-        resp = await cluster.chat(
-            model=MODEL,
-            messages=[{"role": "user", "content": PROMPT}],
-            seed=SEED,
-            temperature=0,
-            max_tokens=MAX_TOKENS,
-        )
-
-        content = resp["choices"][0]["message"]["content"]
-        print(f"  Response: {content!r}")
-
-        # Load or create snapshot
-        if SNAPSHOT_FILE.exists():
-            snapshot = json.loads(SNAPSHOT_FILE.read_text())
-            expected = snapshot["content"]
-            assert content == expected, (
-                f"Snapshot mismatch!\n"
-                f"  Expected: {expected!r}\n"
-                f"  Got:      {content!r}\n"
-                f"  Delete {SNAPSHOT_FILE} to regenerate."
-            )
-            print("  Output matches snapshot")
-        else:
-            SNAPSHOT_FILE.parent.mkdir(parents=True, exist_ok=True)
-            SNAPSHOT_FILE.write_text(
-                json.dumps(
-                    {
-                        "model": MODEL,
-                        "seed": SEED,
-                        "temperature": 0,
-                        "prompt": PROMPT,
-                        "max_tokens": MAX_TOKENS,
-                        "content": content,
-                    },
-                    indent=2,
-                )
-                + "\n"
-            )
-            print(f"  Snapshot created: {SNAPSHOT_FILE}")
-
-        print("PASSED: inference_snapshot")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
--- a/e2e/test_no_internet.py
+++ b/e2e/test_no_internet.py
@@ -1,47 +0,0 @@
-"""Test: Cluster works without internet access.
-
-Verifies exo functions correctly when containers can talk to each other
-but cannot reach the internet. Uses iptables to block all outbound traffic
-except private subnets and multicast (for mDNS discovery).
-"""
-
-import asyncio
-import sys
-
-sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent))
-from conftest import Cluster
-
-
-async def main():
-    async with Cluster(
-        "no_internet",
-        overrides=["tests/no_internet/docker-compose.override.yml"],
-    ) as cluster:
-        await cluster.build()
-        await cluster.start()
-        await cluster.assert_healthy()
-
-        # Verify internet is actually blocked from inside the containers
-        for node in ["exo-node-1", "exo-node-2"]:
-            rc, _ = await cluster.exec(
-                node,
-                "curl",
-                "-sf",
-                "--max-time",
-                "3",
-                "https://huggingface.co",
-                check=False,
-            )
-            assert rc != 0, f"{node} should not be able to reach the internet"
-            print(f"  {node}: internet correctly blocked")
-
-        # Verify exo detected no internet connectivity
-        log = await cluster.logs()
-        assert "Internet connectivity: False" in log, "exo should detect no internet"
-        print("  exo correctly detected no internet connectivity")
-
-        print("PASSED: no_internet")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
--- a/e2e/tests/no_internet/docker-compose.override.yml
+++ b/e2e/tests/no_internet/docker-compose.override.yml
@@ -1,32 +0,0 @@
-# Block all outbound internet traffic using iptables while preserving:
-#   - Multicast (224.0.0.0/4) for mDNS peer discovery
-#   - Private subnets (10/8, 172.16/12, 192.168/16) for inter-container communication
-#   - Loopback (127/8)
-# Requires NET_ADMIN capability for iptables.
-services:
-  exo-node-1:
-    cap_add:
-      - NET_ADMIN
-    entrypoint: ["/bin/sh", "-c"]
-    command:
-      - |
-        iptables -A OUTPUT -d 127.0.0.0/8 -j ACCEPT
-        iptables -A OUTPUT -d 10.0.0.0/8 -j ACCEPT
-        iptables -A OUTPUT -d 172.16.0.0/12 -j ACCEPT
-        iptables -A OUTPUT -d 192.168.0.0/16 -j ACCEPT
-        iptables -A OUTPUT -d 224.0.0.0/4 -j ACCEPT
-        iptables -A OUTPUT -j REJECT
-        exec .venv/bin/exo -v
-  exo-node-2:
-    cap_add:
-      - NET_ADMIN
-    entrypoint: ["/bin/sh", "-c"]
-    command:
-      - |
-        iptables -A OUTPUT -d 127.0.0.0/8 -j ACCEPT
-        iptables -A OUTPUT -d 10.0.0.0/8 -j ACCEPT
-        iptables -A OUTPUT -d 172.16.0.0/12 -j ACCEPT
-        iptables -A OUTPUT -d 192.168.0.0/16 -j ACCEPT
-        iptables -A OUTPUT -d 224.0.0.0/4 -j ACCEPT
-        iptables -A OUTPUT -j REJECT
-        exec .venv/bin/exo -v
--- a/resources/inference_model_cards/mlx-community--Step-3.5-Flash-4bit.toml
+++ b/resources/inference_model_cards/mlx-community--Step-3.5-Flash-4bit.toml
@@ -0,0 +1,12 @@
+model_id = "mlx-community/Step-3.5-Flash-4bit"
+n_layers = 45
+hidden_size = 4096
+supports_tensor = true
+tasks = ["TextGeneration"]
+family = "step"
+quantization = "4bit"
+base_model = "Step 3.5 Flash"
+capabilities = ["text", "thinking"]
+
+[storage_size]
+in_bytes = 114572190076
--- a/resources/inference_model_cards/mlx-community--Step-3.5-Flash-6bit.toml
+++ b/resources/inference_model_cards/mlx-community--Step-3.5-Flash-6bit.toml
@@ -0,0 +1,12 @@
+model_id = "mlx-community/Step-3.5-Flash-6bit"
+n_layers = 45
+hidden_size = 4096
+supports_tensor = true
+tasks = ["TextGeneration"]
+family = "step"
+quantization = "6bit"
+base_model = "Step 3.5 Flash"
+capabilities = ["text", "thinking"]
+
+[storage_size]
+in_bytes = 159039627774
--- a/resources/inference_model_cards/mlx-community--Step-3.5-Flash-8Bit.toml
+++ b/resources/inference_model_cards/mlx-community--Step-3.5-Flash-8Bit.toml
@@ -0,0 +1,12 @@
+model_id = "mlx-community/Step-3.5-Flash-8Bit"
+n_layers = 45
+hidden_size = 4096
+supports_tensor = true
+tasks = ["TextGeneration"]
+family = "step"
+quantization = "8bit"
+base_model = "Step 3.5 Flash"
+capabilities = ["text", "thinking"]
+
+[storage_size]
+in_bytes = 209082699847
--- a/src/exo/shared/models/model_cards.py
+++ b/src/exo/shared/models/model_cards.py
@@ -189,6 +189,7 @@ class ConfigData(BaseModel):
            ["MiniMaxM2ForCausalLM"],
            ["LlamaForCausalLM"],
            ["GptOssForCausalLM"],
+            ["Step3p5ForCausalLM"],
        ]

    @model_validator(mode="before")
--- a/src/exo/worker/engines/mlx/auto_parallel.py
+++ b/src/exo/worker/engines/mlx/auto_parallel.py
@@ -35,6 +35,9 @@ from mlx_lm.models.qwen3_moe import Model as Qwen3MoeModel
 from mlx_lm.models.qwen3_moe import Qwen3MoeSparseMoeBlock
 from mlx_lm.models.qwen3_next import Model as Qwen3NextModel
 from mlx_lm.models.qwen3_next import Qwen3NextDecoderLayer, Qwen3NextSparseMoeBlock
+from mlx_lm.models.step3p5 import Model as Step35Model
+from mlx_lm.models.step3p5 import Step3p5MLP as Step35MLP
+from mlx_lm.models.step3p5 import Step3p5Model as Step35InnerModel
 from transformers.models.qwen3.modeling_qwen3 import Qwen3DecoderLayer

 from exo.shared.logging import logger
@@ -264,6 +267,19 @@ def pipeline_auto_parallel(
            )
        )

+    if isinstance(inner_model_instance, Step35InnerModel):
+        inner_model_instance.num_layers = len(layers)
+        sliding_layers = [
+            i for i, layer in enumerate(layers) if getattr(layer, "is_sliding", False)
+        ]
+        full_layers = [
+            i
+            for i, layer in enumerate(layers)
+            if not getattr(layer, "is_sliding", True)
+        ]
+        inner_model_instance._swa_idx = 0 if not sliding_layers else sliding_layers[0]
+        inner_model_instance._full_idx = 0 if not full_layers else full_layers[0]
+
    _set_layers(model, layers)

    assert isinstance(layers, list), (
@@ -427,6 +443,14 @@ def tensor_auto_parallel(
            all_to_sharded_linear_in_place,
            sharded_to_all_linear_in_place,
        )
+    elif isinstance(model, Step35Model):
+        tensor_parallel_sharding_strategy = Step35ShardingStrategy(
+            group,
+            all_to_sharded_linear,
+            sharded_to_all_linear,
+            all_to_sharded_linear_in_place,
+            sharded_to_all_linear_in_place,
+        )
    else:
        raise ValueError(f"Unsupported model type: {type(model)}")

@@ -981,3 +1005,46 @@ class GptOssShardingStrategy(TensorParallelShardingStrategy):
            layer.mlp.sharding_group = self.group  # pyright: ignore[reportAttributeAccessIssue]
            mx.eval(layer)
        return model
+
+
+class Step35ShardingStrategy(TensorParallelShardingStrategy):
+    def shard_model(
+        self,
+        model: nn.Module,
+        timeout_seconds: float,
+        on_timeout: TimeoutCallback | None,
+    ) -> nn.Module:
+        model = cast(Step35Model, model)
+
+        for layer in model.layers:
+            eval_with_timeout(
+                layer.parameters(), timeout_seconds / len(model.layers), on_timeout
+            )
+            layer.self_attn.q_proj = self.all_to_sharded_linear(layer.self_attn.q_proj)
+            layer.self_attn.k_proj = self.all_to_sharded_linear(layer.self_attn.k_proj)
+            layer.self_attn.v_proj = self.all_to_sharded_linear(layer.self_attn.v_proj)
+            layer.self_attn.o_proj = self.sharded_to_all_linear(layer.self_attn.o_proj)
+
+            layer.self_attn.num_heads //= self.N
+            layer.self_attn.num_kv_heads //= self.N
+
+            if getattr(layer.self_attn, "use_head_wise_attn_gate", False):
+                layer.self_attn.g_proj = self.all_to_sharded_linear(
+                    layer.self_attn.g_proj
+                )
+
+            if isinstance(layer.mlp, Step35MLP):
+                layer.mlp.gate_proj = self.all_to_sharded_linear(layer.mlp.gate_proj)
+                layer.mlp.up_proj = self.all_to_sharded_linear(layer.mlp.up_proj)
+                layer.mlp.down_proj = self.sharded_to_all_linear(layer.mlp.down_proj)
+            else:
+                layer.mlp.sharding_group = self.group
+                self.all_to_sharded_linear_in_place(layer.mlp.share_expert.gate_proj)
+                self.all_to_sharded_linear_in_place(layer.mlp.share_expert.up_proj)
+                self.sharded_to_all_linear_in_place(layer.mlp.share_expert.down_proj)
+                self.all_to_sharded_linear_in_place(layer.mlp.switch_mlp.gate_proj)
+                self.all_to_sharded_linear_in_place(layer.mlp.switch_mlp.up_proj)
+                self.sharded_to_all_linear_in_place(layer.mlp.switch_mlp.down_proj)
+
+            mx.eval(layer)
+        return model