From 3894cf134edc91ce34b4bbefc6d6e96afedd2597 Mon Sep 17 00:00:00 2001 From: rltakashige Date: Thu, 23 Apr 2026 02:50:39 +0100 Subject: [PATCH] Fix Gemma 4 E2B TP + DeepSeek V32 thinking parsing (#1967) --- src/exo/worker/engines/mlx/auto_parallel.py | 8 +- src/exo/worker/engines/mlx/dsml_encoding.py | 13 ++- src/exo/worker/engines/mlx/utils_mlx.py | 1 - .../llm_inference/model_output_parsers.py | 36 ++---- .../tests/unittests/test_mlx/conftest.py | 14 ++- .../test_pipeline_prefill_callbacks.py | 7 +- .../unittests/test_runner/test_dsml_e2e.py | 36 ++++-- .../test_runner/test_finish_reason_sse.py | 104 ++++++++++++++++++ uv.lock | 32 ++++-- 9 files changed, 198 insertions(+), 53 deletions(-) diff --git a/src/exo/worker/engines/mlx/auto_parallel.py b/src/exo/worker/engines/mlx/auto_parallel.py index 443979f19..9df66b776 100644 --- a/src/exo/worker/engines/mlx/auto_parallel.py +++ b/src/exo/worker/engines/mlx/auto_parallel.py @@ -1370,9 +1370,11 @@ class Gemma4ShardingStrategy(TensorParallelShardingStrategy): attn = layer.self_attn attn.q_proj = self.all_to_sharded_linear(attn.q_proj) - attn.k_proj = self.all_to_sharded_linear(attn.k_proj) - if not attn.use_k_eq_v: - attn.v_proj = self.all_to_sharded_linear(attn.v_proj) + has_kv: bool = cast(bool, attn.has_kv) + if has_kv: + attn.k_proj = self.all_to_sharded_linear(attn.k_proj) + if not attn.use_k_eq_v: + attn.v_proj = self.all_to_sharded_linear(attn.v_proj) attn.o_proj = self.sharded_to_all_linear(attn.o_proj) attn.n_heads //= self.N attn.n_kv_heads //= self.N diff --git a/src/exo/worker/engines/mlx/dsml_encoding.py b/src/exo/worker/engines/mlx/dsml_encoding.py index 9d1dfdd27..66de78c89 100644 --- a/src/exo/worker/engines/mlx/dsml_encoding.py +++ b/src/exo/worker/engines/mlx/dsml_encoding.py @@ -17,6 +17,13 @@ TOOL_CALLS_START = f"<{DSML_TOKEN}function_calls>" TOOL_CALLS_END = f"" _ORPHAN_THINK_END = ASSISTANT_TOKEN + THINKING_END _FIXED_THINK_BLOCK = ASSISTANT_TOKEN + THINKING_START + "\n" + THINKING_END +_FUNCTION_RESULTS_CLOSE = "" +_ORPHAN_TOOL_RESULT_SUFFIX = _FUNCTION_RESULTS_CLOSE + "\n\n" + THINKING_END +_EMPTY_THINK_BLOCKS = ( + THINKING_START + "\n\n" + THINKING_END, + THINKING_START + "\n" + THINKING_END, + THINKING_START + THINKING_END, +) def encode_messages( @@ -35,7 +42,11 @@ def encode_messages( add_default_bos_token=add_default_bos_token, tools=tools, ) - return prompt.replace(_ORPHAN_THINK_END, _FIXED_THINK_BLOCK) + prompt = prompt.replace(_ORPHAN_TOOL_RESULT_SUFFIX, _FUNCTION_RESULTS_CLOSE) + prompt = prompt.replace(_ORPHAN_THINK_END, _FIXED_THINK_BLOCK) + for empty in _EMPTY_THINK_BLOCKS: + prompt = prompt.replace(empty, "") + return prompt _INVOKE_PATTERN = re.compile( diff --git a/src/exo/worker/engines/mlx/utils_mlx.py b/src/exo/worker/engines/mlx/utils_mlx.py index 1a16e26d0..c884c1062 100644 --- a/src/exo/worker/engines/mlx/utils_mlx.py +++ b/src/exo/worker/engines/mlx/utils_mlx.py @@ -547,7 +547,6 @@ def render_chat_template( ) if partial_assistant_content: prompt += partial_assistant_content - logger.info(prompt) return prompt for msg in formatted_messages: diff --git a/src/exo/worker/runner/llm_inference/model_output_parsers.py b/src/exo/worker/runner/llm_inference/model_output_parsers.py index 906a26558..944b536b5 100644 --- a/src/exo/worker/runner/llm_inference/model_output_parsers.py +++ b/src/exo/worker/runner/llm_inference/model_output_parsers.py @@ -79,6 +79,13 @@ def apply_all_parsers( issubclass(model_type, DeepseekV32Model) and "deepseek" in model_id.normalize().lower() ): + if tokenizer.has_thinking: + generator = parse_thinking_models( + generator, + tokenizer.think_start, + tokenizer.think_end, + starts_in_thinking=detect_thinking_prompt_suffix(prompt, tokenizer), + ) generator = parse_deepseek_v32(generator) else: if tokenizer.has_thinking: @@ -210,11 +217,10 @@ def parse_deepseek_v32( Uses accumulated-text matching (not per-token marker checks) because DSML markers like <|DSML|function_calls> may span multiple tokens. - Also handles ... blocks for thinking mode. + Thinking tag handling is delegated to parse_thinking_models, which + wraps this parser in apply_all_parsers. """ from exo.worker.engines.mlx.dsml_encoding import ( - THINKING_END, - THINKING_START, TOOL_CALLS_END, TOOL_CALLS_START, parse_dsml_output, @@ -222,7 +228,6 @@ def parse_deepseek_v32( accumulated = "" in_tool_call = False - thinking = False # Tokens buffered while we detect the start of a DSML block pending_buffer: list[GenerationResponse] = [] # Text accumulated during a tool call block @@ -264,29 +269,6 @@ def parse_deepseek_v32( yield response break - # ── Handle thinking tags ── - if not thinking and THINKING_START in response.text: - thinking = True - # Yield any text before the tag - before = response.text[: response.text.index(THINKING_START)] - if before: - yield response.model_copy(update={"text": before}) - continue - - if thinking and THINKING_END in response.text: - thinking = False - # Yield any text after the tag - after = response.text[ - response.text.index(THINKING_END) + len(THINKING_END) : - ] - if after: - yield response.model_copy(update={"text": after, "is_thinking": False}) - continue - - if thinking: - yield response.model_copy(update={"is_thinking": True}) - continue - # ── Handle tool call accumulation ── if in_tool_call: tool_call_text += response.text diff --git a/src/exo/worker/tests/unittests/test_mlx/conftest.py b/src/exo/worker/tests/unittests/test_mlx/conftest.py index 98bd46946..a4d711393 100644 --- a/src/exo/worker/tests/unittests/test_mlx/conftest.py +++ b/src/exo/worker/tests/unittests/test_mlx/conftest.py @@ -96,7 +96,12 @@ def run_gpt_oss_pipeline_device( n_layers=24, ) - model, tokenizer = shard_and_load(shard_meta, group, on_layer_loaded=None) + gen = shard_and_load(shard_meta, group) + try: + while True: + next(gen) + except StopIteration as stop: + model, tokenizer = stop.value model = cast(Model, model) # Generate a prompt of exact token length @@ -172,7 +177,12 @@ def run_gpt_oss_tensor_parallel_device( n_layers=24, ) - model, tokenizer = shard_and_load(shard_meta, group, on_layer_loaded=None) + gen = shard_and_load(shard_meta, group) + try: + while True: + next(gen) + except StopIteration as stop: + model, tokenizer = stop.value model = cast(Model, model) base_text = "The quick brown fox jumps over the lazy dog. " diff --git a/src/exo/worker/tests/unittests/test_mlx/test_pipeline_prefill_callbacks.py b/src/exo/worker/tests/unittests/test_mlx/test_pipeline_prefill_callbacks.py index 161ad7d76..1972abee3 100644 --- a/src/exo/worker/tests/unittests/test_mlx/test_pipeline_prefill_callbacks.py +++ b/src/exo/worker/tests/unittests/test_mlx/test_pipeline_prefill_callbacks.py @@ -174,7 +174,12 @@ def _run_pipeline_device( n_layers=TOTAL_LAYERS, ) - model, tokenizer = shard_and_load(shard_meta, group, on_layer_loaded=None) + gen = shard_and_load(shard_meta, group) + try: + while True: + next(gen) + except StopIteration as stop: + model, tokenizer = stop.value model = cast(Any, model) prompt, task = _build_prompt(tokenizer, prompt_tokens) diff --git a/src/exo/worker/tests/unittests/test_runner/test_dsml_e2e.py b/src/exo/worker/tests/unittests/test_runner/test_dsml_e2e.py index 74efbef1d..8b6c6ded8 100644 --- a/src/exo/worker/tests/unittests/test_runner/test_dsml_e2e.py +++ b/src/exo/worker/tests/unittests/test_runner/test_dsml_e2e.py @@ -20,7 +20,25 @@ from exo.worker.engines.mlx.dsml_encoding import ( encode_messages, parse_dsml_output, ) -from exo.worker.runner.llm_inference.model_output_parsers import parse_deepseek_v32 +from exo.worker.runner.llm_inference.model_output_parsers import ( + parse_deepseek_v32, + parse_thinking_models, +) + + +def _parse_deepseek_with_thinking( + source: Generator[GenerationResponse | None], + starts_in_thinking: bool = False, +) -> Generator[GenerationResponse | ToolCallResponse | None]: + return parse_deepseek_v32( + parse_thinking_models( + source, + think_start=THINKING_START, + think_end=THINKING_END, + starts_in_thinking=starts_in_thinking, + ) + ) + # ── Shared fixtures ────────────────────────────────────────────── @@ -333,9 +351,7 @@ class TestE2EThinkingAndToolCall: assert prompt.endswith(THINKING_START) # Simulate: model outputs , thinks, closes thinking, then tool call. - # In the full pipeline, parse_thinking_models handles the case where - # is in the prompt. Here we test parse_deepseek_v32 directly, - # which detects / markers in the stream. + # Use the full production chain (parse_thinking_models → parse_deepseek_v32). model_tokens = [ THINKING_START, "The user wants weather", @@ -353,7 +369,7 @@ class TestE2EThinkingAndToolCall: TOOL_CALLS_END, ] - results = list(parse_deepseek_v32(_simulate_tokens(model_tokens))) + results = list(_parse_deepseek_with_thinking(_simulate_tokens(model_tokens))) gen_results = [r for r in results if isinstance(r, GenerationResponse)] tool_results = [r for r in results if isinstance(r, ToolCallResponse)] @@ -387,7 +403,7 @@ class TestE2EThinkingAndToolCall: prompt_no_think = encode_messages( messages, tools=_WEATHER_TOOLS, thinking_mode="chat" ) - assert prompt_no_think.endswith(THINKING_END) + assert not prompt_no_think.endswith(THINKING_START) # Both should have the same tool definitions assert "get_weather" in prompt_think @@ -597,7 +613,9 @@ class TestE2EFullRoundTrip: f"\n", TOOL_CALLS_END, ] - results_1 = list(parse_deepseek_v32(_simulate_tokens(model_tokens_1))) + results_1 = list( + _parse_deepseek_with_thinking(_simulate_tokens(model_tokens_1)) + ) # Verify: thinking tokens + tool call gen_1 = [r for r in results_1 if isinstance(r, GenerationResponse)] @@ -660,7 +678,9 @@ class TestE2EFullRoundTrip: THINKING_END, "The weather in Hangzhou is currently cloudy with temperatures between 7°C and 13°C.", ] - results_2 = list(parse_deepseek_v32(_simulate_tokens(model_tokens_2))) + results_2 = list( + _parse_deepseek_with_thinking(_simulate_tokens(model_tokens_2)) + ) gen_2 = [r for r in results_2 if isinstance(r, GenerationResponse)] tool_2 = [r for r in results_2 if isinstance(r, ToolCallResponse)] diff --git a/src/exo/worker/tests/unittests/test_runner/test_finish_reason_sse.py b/src/exo/worker/tests/unittests/test_runner/test_finish_reason_sse.py index 90afe7f4f..6771b3d8f 100644 --- a/src/exo/worker/tests/unittests/test_runner/test_finish_reason_sse.py +++ b/src/exo/worker/tests/unittests/test_runner/test_finish_reason_sse.py @@ -380,6 +380,110 @@ class TestGenericToolCallsFinishReason: # ── Double parser chain (parse_thinking_models → parse_deepseek_v32) ── +class TestDeepSeekV32StartsInThinking: + """Regression tests for deepseek v3.2 where the chat template appends + to the prompt so the model starts already inside a thinking block. + """ + + def test_reasoning_tagged_when_starts_in_thinking(self): + tokens = [ + _make_response("let me", 0), + _make_response(" think", 1), + _make_response(THINKING_END, 2), + _make_response("\n", 3), + _make_response("42", 4, finish_reason="stop"), + ] + thinking = parse_thinking_models( + _queue_source(tokens), + think_start=THINKING_START, + think_end=THINKING_END, + starts_in_thinking=True, + ) + results = _step_until_finish(parse_deepseek_v32(thinking)) + gens = [ + r + for r in results + if isinstance(r, GenerationResponse) and r.finish_reason is None + ] + texts = [(r.text, r.is_thinking) for r in gens] + assert texts == [("let me", True), (" think", True), ("\n", False)] + final = [ + r + for r in results + if isinstance(r, GenerationResponse) and r.finish_reason is not None + ] + assert len(final) == 1 + assert final[0].text == "42" + assert final[0].is_thinking is False + + def test_starts_in_thinking_then_tool_call(self): + tokens = [ + _make_response("need weather", 0), + _make_response(THINKING_END, 1), + _make_response("\n\n", 2), + _make_response(TOOL_CALLS_START, 3), + _make_response("\n", 4), + _make_response(f'<{DSML_TOKEN}invoke name="get_weather">\n', 5), + _make_response( + f'<{DSML_TOKEN}parameter name="city" string="true">NYC\n', + 6, + ), + _make_response(f"\n", 7), + _make_response(TOOL_CALLS_END, 8, finish_reason="stop"), + ] + thinking = parse_thinking_models( + _queue_source(tokens), + think_start=THINKING_START, + think_end=THINKING_END, + starts_in_thinking=True, + ) + results = _step_until_finish(parse_deepseek_v32(thinking)) + reasoning_gens = [ + r + for r in results + if isinstance(r, GenerationResponse) + and r.finish_reason is None + and r.is_thinking + ] + assert [r.text for r in reasoning_gens] == ["need weather"] + tool_results = [r for r in results if isinstance(r, ToolCallResponse)] + assert len(tool_results) == 1 + assert tool_results[0].tool_calls[0].name == "get_weather" + + def test_reasoning_tokens_counted_starts_in_thinking(self): + usage = Usage( + prompt_tokens=10, + completion_tokens=5, + total_tokens=15, + prompt_tokens_details=PromptTokensDetails(cached_tokens=0), + completion_tokens_details=CompletionTokensDetails(reasoning_tokens=0), + ) + tokens = [ + _make_response("reasoning", 0), + _make_response(" more", 1), + _make_response(THINKING_END, 2), + _make_response("\n", 3), + GenerationResponse(text="42", token=4, finish_reason="stop", usage=usage), + ] + thinking = parse_thinking_models( + _queue_source(tokens), + think_start=THINKING_START, + think_end=THINKING_END, + starts_in_thinking=True, + ) + results = _step_until_finish( + count_reasoning_tokens(parse_deepseek_v32(thinking)) + ) + final = [ + r + for r in results + if isinstance(r, GenerationResponse) and r.finish_reason is not None + ] + assert len(final) == 1 + assert final[0].usage is not None + assert final[0].usage.completion_tokens_details.reasoning_tokens == 2 + + class TestBatchGeneratorSingleNext: def test_finish_reason_with_buffered_tokens_drain_loop(self): from exo.worker.runner.llm_inference.batch_generator import GeneratorQueue diff --git a/uv.lock b/uv.lock index ab9c65db0..f0b432b91 100644 --- a/uv.lock +++ b/uv.lock @@ -395,7 +395,7 @@ dependencies = [ { name = "mflux", marker = "sys_platform == 'darwin' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, { name = "mlx", version = "0.31.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, { name = "mlx", version = "0.31.2.dev20260422+ec49d18e", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#ec49d18ec4cfba0e0c7a37f20d1cf4d75fe56731" }, marker = "sys_platform == 'darwin' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, - { name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#62bbd5b2ef8d29d02dd08af303c22518341a46b0" }, marker = "sys_platform == 'darwin' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, + { name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#c7010341e1f41ac15815feb5dc55134f44e3b044" }, marker = "sys_platform == 'darwin' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, { name = "mlx-vlm", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, { name = "msgspec", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, { name = "openai-harmony", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, @@ -418,19 +418,19 @@ cpu = [ { name = "mlx", version = "0.31.1", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra == 'extra-3-exo-cpu') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, { name = "mlx", version = "0.31.2.dev20260422+ec49d18e", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#ec49d18ec4cfba0e0c7a37f20d1cf4d75fe56731" }, marker = "(sys_platform == 'darwin' and extra == 'extra-3-exo-cpu') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, { name = "mlx-cpu", marker = "sys_platform == 'linux'" }, - { name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#62bbd5b2ef8d29d02dd08af303c22518341a46b0" }, marker = "sys_platform == 'linux'" }, + { name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#c7010341e1f41ac15815feb5dc55134f44e3b044" }, marker = "sys_platform == 'linux'" }, ] cuda12 = [ { name = "mlx", version = "0.31.1", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13')" }, { name = "mlx", version = "0.31.2.dev20260422+ec49d18e", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#ec49d18ec4cfba0e0c7a37f20d1cf4d75fe56731" }, marker = "(sys_platform == 'darwin' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13')" }, { name = "mlx-cuda-12", marker = "sys_platform == 'linux'" }, - { name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#62bbd5b2ef8d29d02dd08af303c22518341a46b0" }, marker = "sys_platform == 'linux'" }, + { name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#c7010341e1f41ac15815feb5dc55134f44e3b044" }, marker = "sys_platform == 'linux'" }, ] cuda13 = [ { name = "mlx", version = "0.31.1", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13')" }, { name = "mlx", version = "0.31.2.dev20260422+ec49d18e", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#ec49d18ec4cfba0e0c7a37f20d1cf4d75fe56731" }, marker = "(sys_platform == 'darwin' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13')" }, { name = "mlx-cuda-13", marker = "sys_platform == 'linux'" }, - { name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#62bbd5b2ef8d29d02dd08af303c22518341a46b0" }, marker = "sys_platform == 'linux'" }, + { name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#c7010341e1f41ac15815feb5dc55134f44e3b044" }, marker = "sys_platform == 'linux'" }, ] [package.dev-dependencies] @@ -1326,7 +1326,7 @@ wheels = [ [[package]] name = "mlx-lm" version = "0.31.3" -source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#62bbd5b2ef8d29d02dd08af303c22518341a46b0" } +source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#c7010341e1f41ac15815feb5dc55134f44e3b044" } resolution-markers = [ "sys_platform == 'darwin'", "sys_platform == 'linux'", @@ -1353,7 +1353,7 @@ dependencies = [ { name = "mlx", version = "0.31.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, { name = "mlx", version = "0.31.2.dev20260422+ec49d18e", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#ec49d18ec4cfba0e0c7a37f20d1cf4d75fe56731" }, marker = "sys_platform == 'darwin' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, { name = "mlx-lm", version = "0.31.3", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra != 'extra-3-exo-cpu' and extra != 'extra-3-exo-cuda12' and extra != 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, - { name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#62bbd5b2ef8d29d02dd08af303c22518341a46b0" }, marker = "sys_platform == 'darwin' or (sys_platform == 'linux' and extra == 'extra-3-exo-cpu') or (sys_platform == 'linux' and extra == 'extra-3-exo-cuda12') or (sys_platform == 'linux' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, + { name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#c7010341e1f41ac15815feb5dc55134f44e3b044" }, marker = "sys_platform == 'darwin' or (sys_platform == 'linux' and extra == 'extra-3-exo-cpu') or (sys_platform == 'linux' and extra == 'extra-3-exo-cuda12') or (sys_platform == 'linux' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, { name = "numpy", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, { name = "opencv-python", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, { name = "pillow", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, @@ -2634,7 +2634,7 @@ wheels = [ [[package]] name = "transformers" -version = "5.3.0" +version = "5.2.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "huggingface-hub", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, @@ -2645,11 +2645,11 @@ dependencies = [ { name = "safetensors", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, { name = "tokenizers", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, { name = "tqdm", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, - { name = "typer", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, + { name = "typer-slim", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/fc/1a/70e830d53ecc96ce69cfa8de38f163712d2b43ac52fbd743f39f56025c31/transformers-5.3.0.tar.gz", hash = "sha256:009555b364029da9e2946d41f1c5de9f15e6b1df46b189b7293f33a161b9c557", size = 8830831, upload-time = "2026-03-04T17:41:46.119Z" } +sdist = { url = "https://files.pythonhosted.org/packages/bd/7e/8a0c57d562015e5b16c97c1f0b8e0e92ead2c7c20513225dc12c2043ba9f/transformers-5.2.0.tar.gz", hash = "sha256:0088b8b46ccc9eff1a1dca72b5d618a5ee3b1befc3e418c9512b35dea9f9a650", size = 8618176, upload-time = "2026-02-16T18:54:02.867Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b8/88/ae8320064e32679a5429a2c9ebbc05c2bf32cefb6e076f9b07f6d685a9b4/transformers-5.3.0-py3-none-any.whl", hash = "sha256:50ac8c89c3c7033444fb3f9f53138096b997ebb70d4b5e50a2e810bf12d3d29a", size = 10661827, upload-time = "2026-03-04T17:41:42.722Z" }, + { url = "https://files.pythonhosted.org/packages/4e/93/79754b0ca486e556c2b95d4f5afc66aaf4b260694f3d6e1b51da2d036691/transformers-5.2.0-py3-none-any.whl", hash = "sha256:9ecaf243dc45bee11a7d93f8caf03746accc0cb069181bbf4ad8566c53e854b4", size = 10403304, upload-time = "2026-02-16T18:53:59.699Z" }, ] [[package]] @@ -2706,6 +2706,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4a/91/48db081e7a63bb37284f9fbcefda7c44c277b18b0e13fbc36ea2335b71e6/typer-0.24.1-py3-none-any.whl", hash = "sha256:112c1f0ce578bfb4cab9ffdabc68f031416ebcc216536611ba21f04e9aa84c9e", size = 56085, upload-time = "2026-02-21T16:54:41.616Z" }, ] +[[package]] +name = "typer-slim" +version = "0.24.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typer", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a7/a7/e6aecc4b4eb59598829a3b5076a93aff291b4fdaa2ded25efc4e1f4d219c/typer_slim-0.24.0.tar.gz", hash = "sha256:f0ed36127183f52ae6ced2ecb2521789995992c521a46083bfcdbb652d22ad34", size = 4776, upload-time = "2026-02-16T22:08:51.2Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/24/5480c20380dfd18cf33d14784096dca45a24eae6102e91d49a718d3b6855/typer_slim-0.24.0-py3-none-any.whl", hash = "sha256:d5d7ee1ee2834d5020c7c616ed5e0d0f29b9a4b1dd283bdebae198ec09778d0e", size = 3394, upload-time = "2026-02-16T22:08:49.92Z" }, +] + [[package]] name = "types-aiofiles" version = "25.1.0.20251011"