From 3894cf134edc91ce34b4bbefc6d6e96afedd2597 Mon Sep 17 00:00:00 2001
From: rltakashige <rl.takashige@gmail.com>
Date: Thu, 23 Apr 2026 02:50:39 +0100
Subject: [PATCH] Fix Gemma 4 E2B TP + DeepSeek V32 thinking parsing (#1967)

---
 src/exo/worker/engines/mlx/auto_parallel.py   |   8 +-
 src/exo/worker/engines/mlx/dsml_encoding.py   |  13 ++-
 src/exo/worker/engines/mlx/utils_mlx.py       |   1 -
 .../llm_inference/model_output_parsers.py     |  36 ++----
 .../tests/unittests/test_mlx/conftest.py      |  14 ++-
 .../test_pipeline_prefill_callbacks.py        |   7 +-
 .../unittests/test_runner/test_dsml_e2e.py    |  36 ++++--
 .../test_runner/test_finish_reason_sse.py     | 104 ++++++++++++++++++
 uv.lock                                       |  32 ++++--
 9 files changed, 198 insertions(+), 53 deletions(-)

diff --git a/src/exo/worker/engines/mlx/auto_parallel.py b/src/exo/worker/engines/mlx/auto_parallel.py
index 443979f19..9df66b776 100644
--- a/src/exo/worker/engines/mlx/auto_parallel.py
+++ b/src/exo/worker/engines/mlx/auto_parallel.py
@@ -1370,9 +1370,11 @@ class Gemma4ShardingStrategy(TensorParallelShardingStrategy):
 
             attn = layer.self_attn
             attn.q_proj = self.all_to_sharded_linear(attn.q_proj)
-            attn.k_proj = self.all_to_sharded_linear(attn.k_proj)
-            if not attn.use_k_eq_v:
-                attn.v_proj = self.all_to_sharded_linear(attn.v_proj)
+            has_kv: bool = cast(bool, attn.has_kv)
+            if has_kv:
+                attn.k_proj = self.all_to_sharded_linear(attn.k_proj)
+                if not attn.use_k_eq_v:
+                    attn.v_proj = self.all_to_sharded_linear(attn.v_proj)
             attn.o_proj = self.sharded_to_all_linear(attn.o_proj)
             attn.n_heads //= self.N
             attn.n_kv_heads //= self.N
diff --git a/src/exo/worker/engines/mlx/dsml_encoding.py b/src/exo/worker/engines/mlx/dsml_encoding.py
index 9d1dfdd27..66de78c89 100644
--- a/src/exo/worker/engines/mlx/dsml_encoding.py
+++ b/src/exo/worker/engines/mlx/dsml_encoding.py
@@ -17,6 +17,13 @@ TOOL_CALLS_START = f"<{DSML_TOKEN}function_calls>"
 TOOL_CALLS_END = f"</{DSML_TOKEN}function_calls>"
 _ORPHAN_THINK_END = ASSISTANT_TOKEN + THINKING_END
 _FIXED_THINK_BLOCK = ASSISTANT_TOKEN + THINKING_START + "\n" + THINKING_END
+_FUNCTION_RESULTS_CLOSE = "</function_results>"
+_ORPHAN_TOOL_RESULT_SUFFIX = _FUNCTION_RESULTS_CLOSE + "\n\n" + THINKING_END
+_EMPTY_THINK_BLOCKS = (
+    THINKING_START + "\n\n" + THINKING_END,
+    THINKING_START + "\n" + THINKING_END,
+    THINKING_START + THINKING_END,
+)
 
 
 def encode_messages(
@@ -35,7 +42,11 @@ def encode_messages(
         add_default_bos_token=add_default_bos_token,
         tools=tools,
     )
-    return prompt.replace(_ORPHAN_THINK_END, _FIXED_THINK_BLOCK)
+    prompt = prompt.replace(_ORPHAN_TOOL_RESULT_SUFFIX, _FUNCTION_RESULTS_CLOSE)
+    prompt = prompt.replace(_ORPHAN_THINK_END, _FIXED_THINK_BLOCK)
+    for empty in _EMPTY_THINK_BLOCKS:
+        prompt = prompt.replace(empty, "")
+    return prompt
 
 
 _INVOKE_PATTERN = re.compile(
diff --git a/src/exo/worker/engines/mlx/utils_mlx.py b/src/exo/worker/engines/mlx/utils_mlx.py
index 1a16e26d0..c884c1062 100644
--- a/src/exo/worker/engines/mlx/utils_mlx.py
+++ b/src/exo/worker/engines/mlx/utils_mlx.py
@@ -547,7 +547,6 @@ def render_chat_template(
         )
         if partial_assistant_content:
             prompt += partial_assistant_content
-        logger.info(prompt)
         return prompt
 
     for msg in formatted_messages:
diff --git a/src/exo/worker/runner/llm_inference/model_output_parsers.py b/src/exo/worker/runner/llm_inference/model_output_parsers.py
index 906a26558..944b536b5 100644
--- a/src/exo/worker/runner/llm_inference/model_output_parsers.py
+++ b/src/exo/worker/runner/llm_inference/model_output_parsers.py
@@ -79,6 +79,13 @@ def apply_all_parsers(
         issubclass(model_type, DeepseekV32Model)
         and "deepseek" in model_id.normalize().lower()
     ):
+        if tokenizer.has_thinking:
+            generator = parse_thinking_models(
+                generator,
+                tokenizer.think_start,
+                tokenizer.think_end,
+                starts_in_thinking=detect_thinking_prompt_suffix(prompt, tokenizer),
+            )
         generator = parse_deepseek_v32(generator)
     else:
         if tokenizer.has_thinking:
@@ -210,11 +217,10 @@ def parse_deepseek_v32(
 
     Uses accumulated-text matching (not per-token marker checks) because
     DSML markers like <｜DSML｜function_calls> may span multiple tokens.
-    Also handles <think>...</think> blocks for thinking mode.
+    Thinking tag handling is delegated to parse_thinking_models, which
+    apply_all_parsers runs upstream of this parser.
     """
     from exo.worker.engines.mlx.dsml_encoding import (
-        THINKING_END,
-        THINKING_START,
         TOOL_CALLS_END,
         TOOL_CALLS_START,
         parse_dsml_output,
@@ -222,7 +228,6 @@ def parse_deepseek_v32(
 
     accumulated = ""
     in_tool_call = False
-    thinking = False
     # Tokens buffered while we detect the start of a DSML block
     pending_buffer: list[GenerationResponse] = []
     # Text accumulated during a tool call block
@@ -264,29 +269,6 @@ def parse_deepseek_v32(
                 yield response
             break
 
-        # ── Handle thinking tags ──
-        if not thinking and THINKING_START in response.text:
-            thinking = True
-            # Yield any text before the <think> tag
-            before = response.text[: response.text.index(THINKING_START)]
-            if before:
-                yield response.model_copy(update={"text": before})
-            continue
-
-        if thinking and THINKING_END in response.text:
-            thinking = False
-            # Yield any text after the </think> tag
-            after = response.text[
-                response.text.index(THINKING_END) + len(THINKING_END) :
-            ]
-            if after:
-                yield response.model_copy(update={"text": after, "is_thinking": False})
-            continue
-
-        if thinking:
-            yield response.model_copy(update={"is_thinking": True})
-            continue
-
         # ── Handle tool call accumulation ──
         if in_tool_call:
             tool_call_text += response.text
diff --git a/src/exo/worker/tests/unittests/test_mlx/conftest.py b/src/exo/worker/tests/unittests/test_mlx/conftest.py
index 98bd46946..a4d711393 100644
--- a/src/exo/worker/tests/unittests/test_mlx/conftest.py
+++ b/src/exo/worker/tests/unittests/test_mlx/conftest.py
@@ -96,7 +96,12 @@ def run_gpt_oss_pipeline_device(
             n_layers=24,
         )
 
-        model, tokenizer = shard_and_load(shard_meta, group, on_layer_loaded=None)
+        gen = shard_and_load(shard_meta, group)
+        try:
+            while True:
+                next(gen)
+        except StopIteration as stop:
+            model, tokenizer = stop.value
         model = cast(Model, model)
 
         # Generate a prompt of exact token length
@@ -172,7 +177,12 @@ def run_gpt_oss_tensor_parallel_device(
             n_layers=24,
         )
 
-        model, tokenizer = shard_and_load(shard_meta, group, on_layer_loaded=None)
+        gen = shard_and_load(shard_meta, group)
+        try:
+            while True:
+                next(gen)
+        except StopIteration as stop:
+            model, tokenizer = stop.value
         model = cast(Model, model)
 
         base_text = "The quick brown fox jumps over the lazy dog. "
diff --git a/src/exo/worker/tests/unittests/test_mlx/test_pipeline_prefill_callbacks.py b/src/exo/worker/tests/unittests/test_mlx/test_pipeline_prefill_callbacks.py
index 161ad7d76..1972abee3 100644
--- a/src/exo/worker/tests/unittests/test_mlx/test_pipeline_prefill_callbacks.py
+++ b/src/exo/worker/tests/unittests/test_mlx/test_pipeline_prefill_callbacks.py
@@ -174,7 +174,12 @@ def _run_pipeline_device(
             n_layers=TOTAL_LAYERS,
         )
 
-        model, tokenizer = shard_and_load(shard_meta, group, on_layer_loaded=None)
+        gen = shard_and_load(shard_meta, group)
+        try:
+            while True:
+                next(gen)
+        except StopIteration as stop:
+            model, tokenizer = stop.value
         model = cast(Any, model)
 
         prompt, task = _build_prompt(tokenizer, prompt_tokens)
diff --git a/src/exo/worker/tests/unittests/test_runner/test_dsml_e2e.py b/src/exo/worker/tests/unittests/test_runner/test_dsml_e2e.py
index 74efbef1d..8b6c6ded8 100644
--- a/src/exo/worker/tests/unittests/test_runner/test_dsml_e2e.py
+++ b/src/exo/worker/tests/unittests/test_runner/test_dsml_e2e.py
@@ -20,7 +20,25 @@ from exo.worker.engines.mlx.dsml_encoding import (
     encode_messages,
     parse_dsml_output,
 )
-from exo.worker.runner.llm_inference.model_output_parsers import parse_deepseek_v32
+from exo.worker.runner.llm_inference.model_output_parsers import (
+    parse_deepseek_v32,
+    parse_thinking_models,
+)
+
+
+def _parse_deepseek_with_thinking(
+    source: Generator[GenerationResponse | None],
+    starts_in_thinking: bool = False,
+) -> Generator[GenerationResponse | ToolCallResponse | None]:
+    return parse_deepseek_v32(
+        parse_thinking_models(
+            source,
+            think_start=THINKING_START,
+            think_end=THINKING_END,
+            starts_in_thinking=starts_in_thinking,
+        )
+    )
+
 
 # ── Shared fixtures ──────────────────────────────────────────────
 
@@ -333,9 +351,7 @@ class TestE2EThinkingAndToolCall:
         assert prompt.endswith(THINKING_START)
 
         # Simulate: model outputs <think>, thinks, closes thinking, then tool call.
-        # In the full pipeline, parse_thinking_models handles the case where
-        # <think> is in the prompt. Here we test parse_deepseek_v32 directly,
-        # which detects <think>/<think> markers in the stream.
+        # Use the full production chain (parse_thinking_models → parse_deepseek_v32).
         model_tokens = [
             THINKING_START,
             "The user wants weather",
@@ -353,7 +369,7 @@ class TestE2EThinkingAndToolCall:
             TOOL_CALLS_END,
         ]
 
-        results = list(parse_deepseek_v32(_simulate_tokens(model_tokens)))
+        results = list(_parse_deepseek_with_thinking(_simulate_tokens(model_tokens)))
 
         gen_results = [r for r in results if isinstance(r, GenerationResponse)]
         tool_results = [r for r in results if isinstance(r, ToolCallResponse)]
@@ -387,7 +403,7 @@ class TestE2EThinkingAndToolCall:
         prompt_no_think = encode_messages(
             messages, tools=_WEATHER_TOOLS, thinking_mode="chat"
         )
-        assert prompt_no_think.endswith(THINKING_END)
+        assert not prompt_no_think.endswith(THINKING_START)
 
         # Both should have the same tool definitions
         assert "get_weather" in prompt_think
@@ -597,7 +613,9 @@ class TestE2EFullRoundTrip:
             f"</{DSML_TOKEN}invoke>\n",
             TOOL_CALLS_END,
         ]
-        results_1 = list(parse_deepseek_v32(_simulate_tokens(model_tokens_1)))
+        results_1 = list(
+            _parse_deepseek_with_thinking(_simulate_tokens(model_tokens_1))
+        )
 
         # Verify: thinking tokens + tool call
         gen_1 = [r for r in results_1 if isinstance(r, GenerationResponse)]
@@ -660,7 +678,9 @@ class TestE2EFullRoundTrip:
             THINKING_END,
             "The weather in Hangzhou is currently cloudy with temperatures between 7°C and 13°C.",
         ]
-        results_2 = list(parse_deepseek_v32(_simulate_tokens(model_tokens_2)))
+        results_2 = list(
+            _parse_deepseek_with_thinking(_simulate_tokens(model_tokens_2))
+        )
 
         gen_2 = [r for r in results_2 if isinstance(r, GenerationResponse)]
         tool_2 = [r for r in results_2 if isinstance(r, ToolCallResponse)]
diff --git a/src/exo/worker/tests/unittests/test_runner/test_finish_reason_sse.py b/src/exo/worker/tests/unittests/test_runner/test_finish_reason_sse.py
index 90afe7f4f..6771b3d8f 100644
--- a/src/exo/worker/tests/unittests/test_runner/test_finish_reason_sse.py
+++ b/src/exo/worker/tests/unittests/test_runner/test_finish_reason_sse.py
@@ -380,6 +380,110 @@ class TestGenericToolCallsFinishReason:
 # ── Double parser chain (parse_thinking_models → parse_deepseek_v32) ──
 
 
+class TestDeepSeekV32StartsInThinking:
+    """Regression tests for DeepSeek V3.2, where the chat template appends
+    <think> to the prompt so the model starts already inside a thinking block.
+    """
+
+    def test_reasoning_tagged_when_starts_in_thinking(self):
+        tokens = [
+            _make_response("let me", 0),
+            _make_response(" think", 1),
+            _make_response(THINKING_END, 2),
+            _make_response("\n", 3),
+            _make_response("42", 4, finish_reason="stop"),
+        ]
+        thinking = parse_thinking_models(
+            _queue_source(tokens),
+            think_start=THINKING_START,
+            think_end=THINKING_END,
+            starts_in_thinking=True,
+        )
+        results = _step_until_finish(parse_deepseek_v32(thinking))
+        gens = [
+            r
+            for r in results
+            if isinstance(r, GenerationResponse) and r.finish_reason is None
+        ]
+        texts = [(r.text, r.is_thinking) for r in gens]
+        assert texts == [("let me", True), (" think", True), ("\n", False)]
+        final = [
+            r
+            for r in results
+            if isinstance(r, GenerationResponse) and r.finish_reason is not None
+        ]
+        assert len(final) == 1
+        assert final[0].text == "42"
+        assert final[0].is_thinking is False
+
+    def test_starts_in_thinking_then_tool_call(self):
+        tokens = [
+            _make_response("need weather", 0),
+            _make_response(THINKING_END, 1),
+            _make_response("\n\n", 2),
+            _make_response(TOOL_CALLS_START, 3),
+            _make_response("\n", 4),
+            _make_response(f'<{DSML_TOKEN}invoke name="get_weather">\n', 5),
+            _make_response(
+                f'<{DSML_TOKEN}parameter name="city" string="true">NYC</{DSML_TOKEN}parameter>\n',
+                6,
+            ),
+            _make_response(f"</{DSML_TOKEN}invoke>\n", 7),
+            _make_response(TOOL_CALLS_END, 8, finish_reason="stop"),
+        ]
+        thinking = parse_thinking_models(
+            _queue_source(tokens),
+            think_start=THINKING_START,
+            think_end=THINKING_END,
+            starts_in_thinking=True,
+        )
+        results = _step_until_finish(parse_deepseek_v32(thinking))
+        reasoning_gens = [
+            r
+            for r in results
+            if isinstance(r, GenerationResponse)
+            and r.finish_reason is None
+            and r.is_thinking
+        ]
+        assert [r.text for r in reasoning_gens] == ["need weather"]
+        tool_results = [r for r in results if isinstance(r, ToolCallResponse)]
+        assert len(tool_results) == 1
+        assert tool_results[0].tool_calls[0].name == "get_weather"
+
+    def test_reasoning_tokens_counted_starts_in_thinking(self):
+        usage = Usage(
+            prompt_tokens=10,
+            completion_tokens=5,
+            total_tokens=15,
+            prompt_tokens_details=PromptTokensDetails(cached_tokens=0),
+            completion_tokens_details=CompletionTokensDetails(reasoning_tokens=0),
+        )
+        tokens = [
+            _make_response("reasoning", 0),
+            _make_response(" more", 1),
+            _make_response(THINKING_END, 2),
+            _make_response("\n", 3),
+            GenerationResponse(text="42", token=4, finish_reason="stop", usage=usage),
+        ]
+        thinking = parse_thinking_models(
+            _queue_source(tokens),
+            think_start=THINKING_START,
+            think_end=THINKING_END,
+            starts_in_thinking=True,
+        )
+        results = _step_until_finish(
+            count_reasoning_tokens(parse_deepseek_v32(thinking))
+        )
+        final = [
+            r
+            for r in results
+            if isinstance(r, GenerationResponse) and r.finish_reason is not None
+        ]
+        assert len(final) == 1
+        assert final[0].usage is not None
+        assert final[0].usage.completion_tokens_details.reasoning_tokens == 2
+
+
 class TestBatchGeneratorSingleNext:
     def test_finish_reason_with_buffered_tokens_drain_loop(self):
         from exo.worker.runner.llm_inference.batch_generator import GeneratorQueue
diff --git a/uv.lock b/uv.lock
index ab9c65db0..f0b432b91 100644
--- a/uv.lock
+++ b/uv.lock
@@ -395,7 +395,7 @@ dependencies = [
     { name = "mflux", marker = "sys_platform == 'darwin' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
     { name = "mlx", version = "0.31.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
     { name = "mlx", version = "0.31.2.dev20260422+ec49d18e", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#ec49d18ec4cfba0e0c7a37f20d1cf4d75fe56731" }, marker = "sys_platform == 'darwin' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
-    { name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#62bbd5b2ef8d29d02dd08af303c22518341a46b0" }, marker = "sys_platform == 'darwin' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
+    { name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#c7010341e1f41ac15815feb5dc55134f44e3b044" }, marker = "sys_platform == 'darwin' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
     { name = "mlx-vlm", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
     { name = "msgspec", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
     { name = "openai-harmony", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
@@ -418,19 +418,19 @@ cpu = [
     { name = "mlx", version = "0.31.1", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra == 'extra-3-exo-cpu') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
     { name = "mlx", version = "0.31.2.dev20260422+ec49d18e", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#ec49d18ec4cfba0e0c7a37f20d1cf4d75fe56731" }, marker = "(sys_platform == 'darwin' and extra == 'extra-3-exo-cpu') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
     { name = "mlx-cpu", marker = "sys_platform == 'linux'" },
-    { name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#62bbd5b2ef8d29d02dd08af303c22518341a46b0" }, marker = "sys_platform == 'linux'" },
+    { name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#c7010341e1f41ac15815feb5dc55134f44e3b044" }, marker = "sys_platform == 'linux'" },
 ]
 cuda12 = [
     { name = "mlx", version = "0.31.1", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13')" },
     { name = "mlx", version = "0.31.2.dev20260422+ec49d18e", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#ec49d18ec4cfba0e0c7a37f20d1cf4d75fe56731" }, marker = "(sys_platform == 'darwin' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13')" },
     { name = "mlx-cuda-12", marker = "sys_platform == 'linux'" },
-    { name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#62bbd5b2ef8d29d02dd08af303c22518341a46b0" }, marker = "sys_platform == 'linux'" },
+    { name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#c7010341e1f41ac15815feb5dc55134f44e3b044" }, marker = "sys_platform == 'linux'" },
 ]
 cuda13 = [
     { name = "mlx", version = "0.31.1", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13')" },
     { name = "mlx", version = "0.31.2.dev20260422+ec49d18e", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#ec49d18ec4cfba0e0c7a37f20d1cf4d75fe56731" }, marker = "(sys_platform == 'darwin' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13')" },
     { name = "mlx-cuda-13", marker = "sys_platform == 'linux'" },
-    { name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#62bbd5b2ef8d29d02dd08af303c22518341a46b0" }, marker = "sys_platform == 'linux'" },
+    { name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#c7010341e1f41ac15815feb5dc55134f44e3b044" }, marker = "sys_platform == 'linux'" },
 ]
 
 [package.dev-dependencies]
@@ -1326,7 +1326,7 @@ wheels = [
 [[package]]
 name = "mlx-lm"
 version = "0.31.3"
-source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#62bbd5b2ef8d29d02dd08af303c22518341a46b0" }
+source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#c7010341e1f41ac15815feb5dc55134f44e3b044" }
 resolution-markers = [
     "sys_platform == 'darwin'",
     "sys_platform == 'linux'",
@@ -1353,7 +1353,7 @@ dependencies = [
     { name = "mlx", version = "0.31.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
     { name = "mlx", version = "0.31.2.dev20260422+ec49d18e", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#ec49d18ec4cfba0e0c7a37f20d1cf4d75fe56731" }, marker = "sys_platform == 'darwin' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
     { name = "mlx-lm", version = "0.31.3", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra != 'extra-3-exo-cpu' and extra != 'extra-3-exo-cuda12' and extra != 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
-    { name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#62bbd5b2ef8d29d02dd08af303c22518341a46b0" }, marker = "sys_platform == 'darwin' or (sys_platform == 'linux' and extra == 'extra-3-exo-cpu') or (sys_platform == 'linux' and extra == 'extra-3-exo-cuda12') or (sys_platform == 'linux' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
+    { name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#c7010341e1f41ac15815feb5dc55134f44e3b044" }, marker = "sys_platform == 'darwin' or (sys_platform == 'linux' and extra == 'extra-3-exo-cpu') or (sys_platform == 'linux' and extra == 'extra-3-exo-cuda12') or (sys_platform == 'linux' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
     { name = "numpy", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
     { name = "opencv-python", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
     { name = "pillow", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
@@ -2634,7 +2634,7 @@ wheels = [
 
 [[package]]
 name = "transformers"
-version = "5.3.0"
+version = "5.2.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "huggingface-hub", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
@@ -2645,11 +2645,11 @@ dependencies = [
     { name = "safetensors", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
     { name = "tokenizers", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
     { name = "tqdm", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
-    { name = "typer", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
+    { name = "typer-slim", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/fc/1a/70e830d53ecc96ce69cfa8de38f163712d2b43ac52fbd743f39f56025c31/transformers-5.3.0.tar.gz", hash = "sha256:009555b364029da9e2946d41f1c5de9f15e6b1df46b189b7293f33a161b9c557", size = 8830831, upload-time = "2026-03-04T17:41:46.119Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/bd/7e/8a0c57d562015e5b16c97c1f0b8e0e92ead2c7c20513225dc12c2043ba9f/transformers-5.2.0.tar.gz", hash = "sha256:0088b8b46ccc9eff1a1dca72b5d618a5ee3b1befc3e418c9512b35dea9f9a650", size = 8618176, upload-time = "2026-02-16T18:54:02.867Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/b8/88/ae8320064e32679a5429a2c9ebbc05c2bf32cefb6e076f9b07f6d685a9b4/transformers-5.3.0-py3-none-any.whl", hash = "sha256:50ac8c89c3c7033444fb3f9f53138096b997ebb70d4b5e50a2e810bf12d3d29a", size = 10661827, upload-time = "2026-03-04T17:41:42.722Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/93/79754b0ca486e556c2b95d4f5afc66aaf4b260694f3d6e1b51da2d036691/transformers-5.2.0-py3-none-any.whl", hash = "sha256:9ecaf243dc45bee11a7d93f8caf03746accc0cb069181bbf4ad8566c53e854b4", size = 10403304, upload-time = "2026-02-16T18:53:59.699Z" },
 ]
 
 [[package]]
@@ -2706,6 +2706,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/4a/91/48db081e7a63bb37284f9fbcefda7c44c277b18b0e13fbc36ea2335b71e6/typer-0.24.1-py3-none-any.whl", hash = "sha256:112c1f0ce578bfb4cab9ffdabc68f031416ebcc216536611ba21f04e9aa84c9e", size = 56085, upload-time = "2026-02-21T16:54:41.616Z" },
 ]
 
+[[package]]
+name = "typer-slim"
+version = "0.24.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typer", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a7/a7/e6aecc4b4eb59598829a3b5076a93aff291b4fdaa2ded25efc4e1f4d219c/typer_slim-0.24.0.tar.gz", hash = "sha256:f0ed36127183f52ae6ced2ecb2521789995992c521a46083bfcdbb652d22ad34", size = 4776, upload-time = "2026-02-16T22:08:51.2Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a7/24/5480c20380dfd18cf33d14784096dca45a24eae6102e91d49a718d3b6855/typer_slim-0.24.0-py3-none-any.whl", hash = "sha256:d5d7ee1ee2834d5020c7c616ed5e0d0f29b9a4b1dd283bdebae198ec09778d0e", size = 3394, upload-time = "2026-02-16T22:08:49.92Z" },
+]
+
 [[package]]
 name = "types-aiofiles"
 version = "25.1.0.20251011"