tmp changes - DONT PUSH

.
Pass CI
2026-02-23 17:58:36 -05:00 · 2026-02-23 21:12:00 +00:00 · 2026-02-23 20:55:54 +00:00 · 2026-02-23 20:41:31 +00:00 · 2026-02-23 20:35:30 +00:00 · 2026-02-23 20:07:10 +00:00
7 changed files with 146 additions and 12 deletions
--- a/src/exo/main.py
+++ b/src/exo/main.py
@@ -252,7 +252,7 @@ def main():
    target = min(max(soft, 65535), hard)
    resource.setrlimit(resource.RLIMIT_NOFILE, (target, hard))

-    mp.set_start_method("spawn")
+    mp.set_start_method("spawn", force=True)
    # TODO: Refactor the current verbosity system
    logger_setup(EXO_LOG, args.verbosity)
    logger.info("Starting EXO")
--- a/src/exo/shared/models/model_cards.py
+++ b/src/exo/shared/models/model_cards.py
@@ -90,6 +90,7 @@ class ModelCard(CamelCaseModel):
    base_model: str = ""
    capabilities: list[str] = []
    uses_cfg: bool = False
+    trust_remote_code: bool = True

    @field_validator("tasks", mode="before")
    @classmethod
@@ -137,6 +138,7 @@ class ModelCard(CamelCaseModel):
            hidden_size=config_data.hidden_size or 0,
            supports_tensor=config_data.supports_tensor,
            tasks=[ModelTask.TextGeneration],
+            trust_remote_code=False,
        )
        await mc.save_to_custom_dir()
        _card_cache[model_id] = mc
--- a/src/exo/worker/engines/mlx/utils_mlx.py
+++ b/src/exo/worker/engines/mlx/utils_mlx.py
@@ -23,9 +23,7 @@ from mlx_lm.models.deepseek_v3 import DeepseekV3Model
 from mlx_lm.tokenizer_utils import TokenizerWrapper

 from exo.shared.models.model_cards import ModelId
-from exo.worker.engines.mlx.constants import (
-    TRUST_REMOTE_CODE,
-)
+from exo.worker.engines.mlx.constants import TRUST_REMOTE_CODE

 try:
    from mlx_lm.tokenizer_utils import load_tokenizer
@@ -293,7 +291,11 @@ def shard_and_load(

 def get_tokenizer(model_path: Path, shard_metadata: ShardMetadata) -> TokenizerWrapper:
    """Load tokenizer for a model shard. Delegates to load_tokenizer_for_model_id."""
-    return load_tokenizer_for_model_id(shard_metadata.model_card.model_id, model_path)
+    return load_tokenizer_for_model_id(
+        shard_metadata.model_card.model_id,
+        model_path,
+        trust_remote_code=shard_metadata.model_card.trust_remote_code,
+    )


 def get_eos_token_ids_for_model(model_id: ModelId) -> list[int] | None:
@@ -325,7 +327,7 @@ def get_eos_token_ids_for_model(model_id: ModelId) -> list[int] | None:


 def load_tokenizer_for_model_id(
-    model_id: ModelId, model_path: Path
+    model_id: ModelId, model_path: Path, *, trust_remote_code: bool = TRUST_REMOTE_CODE
 ) -> TokenizerWrapper:
    """
    Load tokenizer for a model given its ID and local path.
@@ -394,7 +396,7 @@ def load_tokenizer_for_model_id(

    tokenizer = load_tokenizer(
        model_path,
-        tokenizer_config_extra={"trust_remote_code": TRUST_REMOTE_CODE},
+        tokenizer_config_extra={"trust_remote_code": trust_remote_code},
        eos_token_ids=eos_token_ids,
    )

--- a/src/exo/worker/runner/runner_supervisor.py
+++ b/src/exo/worker/runner/runner_supervisor.py
@@ -106,13 +106,18 @@ class RunnerSupervisor:
    def shutdown(self):
        logger.info("Runner supervisor shutting down")
        self._tg.cancel_tasks()
-        self._ev_recv.close()
-        self._task_sender.close()
        if not self._cancel_watch_runner.cancel_called:
            self._cancel_watch_runner.cancel()
+        with contextlib.suppress(ClosedResourceError):
+            self._ev_recv.close()
+        with contextlib.suppress(ClosedResourceError):
+            self._task_sender.close()
+        with contextlib.suppress(ClosedResourceError):
+            self._event_sender.close()
        with contextlib.suppress(ClosedResourceError):
            self._cancel_sender.send(TaskId("CANCEL_CURRENT_TASK"))
-        self._cancel_sender.close()
+        with contextlib.suppress(ClosedResourceError):
+            self._cancel_sender.close()
        self.runner_process.join(5)
        if not self.runner_process.is_alive():
            logger.info("Runner process succesfully terminated")
--- a/tests/get_all_models_on_cluster.py
+++ b/tests/get_all_models_on_cluster.py
@@ -8,7 +8,7 @@ from urllib.request import urlopen

 h = sys.argv[1] if len(sys.argv) > 1 else sys.exit(f"USAGE: {sys.argv[0]} host")
 ts = subprocess.run(
-    ["tailscale", "status"], check=True, text=True, capture_output=True
+    ["/Applications/Tailscale.app/Contents/MacOS/Tailscale", "status"], check=True, text=True, capture_output=True
 ).stdout.splitlines()
 ip = next(
    (sl[0] for line in ts if len(sl := line.split()) >= 2 if sl[1] == h), None
--- a/tests/start_distributed_test.py
+++ b/tests/start_distributed_test.py
@@ -15,7 +15,7 @@ if not (args := sys.argv[1:]):
 kind = args[0] if args[0] in ("jaccl", "ring") else "both"
 hosts = args[1:] if kind != "both" else args
 ts = subprocess.run(
-    ["tailscale", "status"], check=True, text=True, capture_output=True
+    ["/Applications/Tailscale.app/Contents/MacOS/Tailscale", "status"], check=True, text=True, capture_output=True
 ).stdout.splitlines()
 ip = {sl[1]: sl[0] for line in ts if len(sl := line.split()) >= 2}
 ips = [ip[h] for h in hosts]
--- a/tmp/test_trust_remote_code_attack.sh
+++ b/tmp/test_trust_remote_code_attack.sh
@@ -0,0 +1,125 @@
+#!/usr/bin/env bash
+# Test that models added via API get trust_remote_code=false
+# Run this against a running exo instance. Exits 1 if any check reports VULNERABLE.
+# Usage: ./test_trust_remote_code_attack.sh [host:port]
+
+set -uo pipefail
+
+HOST="${1:-localhost:52415}"
+MODEL_ID="KevTheHermit/security-testing"
+CUSTOM_CARDS_DIR="$HOME/.exo/custom_model_cards"
+CARD_FILE="$CUSTOM_CARDS_DIR/KevTheHermit--security-testing.toml"
+STATUS=0 # set to 1 by any VULNERABLE finding; becomes the script's exit code
+
+echo "=== Test: trust_remote_code attack via API ==="
+echo "Target: $HOST"
+echo ""
+
+# Clean up RCE proof from previous runs
+rm -f /tmp/exo-rce-proof.txt
+
+# Step 0: Clean up any stale card from previous runs
+if [ -f "$CARD_FILE" ]; then
+  echo "[0] Removing stale card from previous run ..."
+  curl -s -X DELETE \
+    "http://$HOST/models/custom/$(python3 -c 'import sys, urllib.parse; print(urllib.parse.quote(sys.argv[1], safe=""))' "$MODEL_ID")" >/dev/null
+  rm -f "$CARD_FILE"
+  echo "    Done"
+  echo ""
+fi
+
+# Step 1: Add the malicious model via API
+echo "[1] Adding model via POST /models/add ..."
+ADD_RESPONSE=$(curl -s -w "\n%{http_code}" -X POST "http://$HOST/models/add" \
+  -H "Content-Type: application/json" \
+  -d "{\"model_id\":\"$MODEL_ID\"}")
+HTTP_CODE=$(echo "$ADD_RESPONSE" | tail -1)
+BODY=$(echo "$ADD_RESPONSE" | sed '$d')
+echo "    HTTP $HTTP_CODE"
+
+if [ "$HTTP_CODE" -ge 400 ]; then
+  echo "    Model add failed (HTTP $HTTP_CODE) — that's fine if model doesn't exist on HF."
+  echo "    Response: $BODY"
+  echo ""
+  echo "RESULT: Model was rejected at add time. Attack blocked."
+  exit 0
+fi
+
+# Step 2: Verify the saved TOML has trust_remote_code = false
+echo ""
+echo "[2] Checking saved model card TOML ..."
+if [ ! -f "$CARD_FILE" ]; then
+  echo "    FAIL: Card file not found at $CARD_FILE"
+  exit 1
+fi
+
+if grep -q 'trust_remote_code = false' "$CARD_FILE"; then
+  echo "    SAFE: trust_remote_code = false (fix is active)"
+else
+  echo "    VULNERABLE: trust_remote_code is not false — remote code WILL be trusted"
+  STATUS=1
+fi
+echo "    Contents:"
+cat "$CARD_FILE"
+
+# Step 3: Place the instance
+echo ""
+echo "[3] Attempting POST /place_instance ..."
+PLACE_RESPONSE=$(curl -s -w "\n%{http_code}" -X POST "http://$HOST/place_instance" \
+  -H "Content-Type: application/json" \
+  -d "{\"model_id\":\"$MODEL_ID\"}")
+PLACE_CODE=$(echo "$PLACE_RESPONSE" | tail -1)
+PLACE_BODY=$(echo "$PLACE_RESPONSE" | sed '$d')
+echo "    HTTP $PLACE_CODE"
+echo "    Response: $PLACE_BODY"
+
+# Step 3b: Send a chat completion to actually trigger tokenizer loading
+echo ""
+echo "[3b] Sending chat completion to trigger tokenizer load ..."
+CHAT_RESPONSE=$(curl -s -w "\n%{http_code}" --max-time 30 -X POST "http://$HOST/v1/chat/completions" \
+  -H "Content-Type: application/json" \
+  -d "{\"model\":\"$MODEL_ID\",\"messages\":[{\"role\":\"user\",\"content\":\"hello\"}],\"max_tokens\":1}")
+CHAT_CODE=$(echo "$CHAT_RESPONSE" | tail -1)
+CHAT_BODY=$(echo "$CHAT_RESPONSE" | sed '$d')
+echo "    HTTP $CHAT_CODE"
+echo "    Response: $CHAT_BODY"
+echo ""
+echo "[3c] Checking for RCE proof ..."
+sleep 5
+if [ -f /tmp/exo-rce-proof.txt ]; then
+  echo "    VULNERABLE: Remote code executed!"
+  echo "    Contents:"
+  cat /tmp/exo-rce-proof.txt
+  STATUS=1
+else
+  echo "    SAFE: /tmp/exo-rce-proof.txt does not exist — remote code was NOT executed"
+fi
+
+# Step 4: Clean up — delete instance and custom model
+echo ""
+echo "[4] Cleaning up ..."
+# Find and delete any instance for this model (id passed via argv, not spliced into the Python source)
+INSTANCE_ID=$(curl -s "http://$HOST/state" | python3 -c "
+import sys, json
+for iid, wrapper in json.load(sys.stdin).get('instances', {}).items():
+    for inst in wrapper.values():
+        if inst.get('shardAssignments', {}).get('modelId', '') == sys.argv[1]:
+            print(iid)
+            sys.exit(0)
+" "$MODEL_ID" 2>/dev/null || true)
+
+if [ -n "$INSTANCE_ID" ]; then
+  echo "    Deleting instance $INSTANCE_ID ..."
+  curl -s -X DELETE "http://$HOST/instance/$INSTANCE_ID" >/dev/null
+  echo "    Done"
+else
+  echo "    No instance found to delete"
+fi
+
+echo "    Deleting custom model card ..."
+curl -s -X DELETE \
+  "http://$HOST/models/custom/$(python3 -c 'import sys, urllib.parse; print(urllib.parse.quote(sys.argv[1], safe=""))' "$MODEL_ID")" >/dev/null
+echo "    Done"
+echo ""
+echo "=== DONE ==="
+exit "$STATUS"
Author	SHA1	Message	Date
Ryuichi Leo Takashige	e7ce42afc8	tmp changes - DONT PUSH	2026-02-23 21:12:00 +00:00
Ryuichi Leo Takashige	140d0bf6e6	.	2026-02-23 20:55:54 +00:00
Ryuichi Leo Takashige	912b77bd18	Pass CI	2026-02-23 20:41:31 +00:00
Ryuichi Leo Takashige	0aff25d251	Reproduce remote code attack and fix	2026-02-23 20:35:30 +00:00
Ryuichi Leo Takashige	b48f3c530a	n_heads for qwen3	2026-02-23 20:07:10 +00:00
Alex Cheema	c90a0cec78	fix: suppress closure errors in runnersupervisor and force spawn start method (#1547). Some errors could be thrown during shutdown - we can dismiss these safely. Co-authored by me :)	2026-02-23 18:30:41 +00:00