Fix Gemma 4 E2B TP + DeepSeek V32 thinking parsing (#1967)

This commit is contained in:
rltakashige
2026-04-23 02:50:39 +01:00
committed by GitHub
parent 8993ccaf09
commit 3894cf134e
9 changed files with 198 additions and 53 deletions

View File

@@ -1370,9 +1370,11 @@ class Gemma4ShardingStrategy(TensorParallelShardingStrategy):
attn = layer.self_attn
attn.q_proj = self.all_to_sharded_linear(attn.q_proj)
attn.k_proj = self.all_to_sharded_linear(attn.k_proj)
if not attn.use_k_eq_v:
attn.v_proj = self.all_to_sharded_linear(attn.v_proj)
has_kv: bool = cast(bool, attn.has_kv)
if has_kv:
attn.k_proj = self.all_to_sharded_linear(attn.k_proj)
if not attn.use_k_eq_v:
attn.v_proj = self.all_to_sharded_linear(attn.v_proj)
attn.o_proj = self.sharded_to_all_linear(attn.o_proj)
attn.n_heads //= self.N
attn.n_kv_heads //= self.N

View File

@@ -17,6 +17,13 @@ TOOL_CALLS_START = f"<{DSML_TOKEN}function_calls>"
TOOL_CALLS_END = f"</{DSML_TOKEN}function_calls>"
_ORPHAN_THINK_END = ASSISTANT_TOKEN + THINKING_END
_FIXED_THINK_BLOCK = ASSISTANT_TOKEN + THINKING_START + "\n" + THINKING_END
_FUNCTION_RESULTS_CLOSE = "</function_results>"
_ORPHAN_TOOL_RESULT_SUFFIX = _FUNCTION_RESULTS_CLOSE + "\n\n" + THINKING_END
_EMPTY_THINK_BLOCKS = (
THINKING_START + "\n\n" + THINKING_END,
THINKING_START + "\n" + THINKING_END,
THINKING_START + THINKING_END,
)
def encode_messages(
@@ -35,7 +42,11 @@ def encode_messages(
add_default_bos_token=add_default_bos_token,
tools=tools,
)
return prompt.replace(_ORPHAN_THINK_END, _FIXED_THINK_BLOCK)
prompt = prompt.replace(_ORPHAN_TOOL_RESULT_SUFFIX, _FUNCTION_RESULTS_CLOSE)
prompt = prompt.replace(_ORPHAN_THINK_END, _FIXED_THINK_BLOCK)
for empty in _EMPTY_THINK_BLOCKS:
prompt = prompt.replace(empty, "")
return prompt
_INVOKE_PATTERN = re.compile(

View File

@@ -547,7 +547,6 @@ def render_chat_template(
)
if partial_assistant_content:
prompt += partial_assistant_content
logger.info(prompt)
return prompt
for msg in formatted_messages:

View File

@@ -79,6 +79,13 @@ def apply_all_parsers(
issubclass(model_type, DeepseekV32Model)
and "deepseek" in model_id.normalize().lower()
):
if tokenizer.has_thinking:
generator = parse_thinking_models(
generator,
tokenizer.think_start,
tokenizer.think_end,
starts_in_thinking=detect_thinking_prompt_suffix(prompt, tokenizer),
)
generator = parse_deepseek_v32(generator)
else:
if tokenizer.has_thinking:
@@ -210,11 +217,10 @@ def parse_deepseek_v32(
Uses accumulated-text matching (not per-token marker checks) because
DSML markers like <DSMLfunction_calls> may span multiple tokens.
Also handles <think>...</think> blocks for thinking mode.
Thinking tag handling is delegated to parse_thinking_models, which
wraps this parser in apply_all_parsers.
"""
from exo.worker.engines.mlx.dsml_encoding import (
THINKING_END,
THINKING_START,
TOOL_CALLS_END,
TOOL_CALLS_START,
parse_dsml_output,
@@ -222,7 +228,6 @@ def parse_deepseek_v32(
accumulated = ""
in_tool_call = False
thinking = False
# Tokens buffered while we detect the start of a DSML block
pending_buffer: list[GenerationResponse] = []
# Text accumulated during a tool call block
@@ -264,29 +269,6 @@ def parse_deepseek_v32(
yield response
break
# ── Handle thinking tags ──
if not thinking and THINKING_START in response.text:
thinking = True
# Yield any text before the <think> tag
before = response.text[: response.text.index(THINKING_START)]
if before:
yield response.model_copy(update={"text": before})
continue
if thinking and THINKING_END in response.text:
thinking = False
# Yield any text after the </think> tag
after = response.text[
response.text.index(THINKING_END) + len(THINKING_END) :
]
if after:
yield response.model_copy(update={"text": after, "is_thinking": False})
continue
if thinking:
yield response.model_copy(update={"is_thinking": True})
continue
# ── Handle tool call accumulation ──
if in_tool_call:
tool_call_text += response.text

View File

@@ -96,7 +96,12 @@ def run_gpt_oss_pipeline_device(
n_layers=24,
)
model, tokenizer = shard_and_load(shard_meta, group, on_layer_loaded=None)
gen = shard_and_load(shard_meta, group)
try:
while True:
next(gen)
except StopIteration as stop:
model, tokenizer = stop.value
model = cast(Model, model)
# Generate a prompt of exact token length
@@ -172,7 +177,12 @@ def run_gpt_oss_tensor_parallel_device(
n_layers=24,
)
model, tokenizer = shard_and_load(shard_meta, group, on_layer_loaded=None)
gen = shard_and_load(shard_meta, group)
try:
while True:
next(gen)
except StopIteration as stop:
model, tokenizer = stop.value
model = cast(Model, model)
base_text = "The quick brown fox jumps over the lazy dog. "

View File

@@ -174,7 +174,12 @@ def _run_pipeline_device(
n_layers=TOTAL_LAYERS,
)
model, tokenizer = shard_and_load(shard_meta, group, on_layer_loaded=None)
gen = shard_and_load(shard_meta, group)
try:
while True:
next(gen)
except StopIteration as stop:
model, tokenizer = stop.value
model = cast(Any, model)
prompt, task = _build_prompt(tokenizer, prompt_tokens)

View File

@@ -20,7 +20,25 @@ from exo.worker.engines.mlx.dsml_encoding import (
encode_messages,
parse_dsml_output,
)
from exo.worker.runner.llm_inference.model_output_parsers import parse_deepseek_v32
from exo.worker.runner.llm_inference.model_output_parsers import (
parse_deepseek_v32,
parse_thinking_models,
)
def _parse_deepseek_with_thinking(
    source: Generator[GenerationResponse | None],
    starts_in_thinking: bool = False,
) -> Generator[GenerationResponse | ToolCallResponse | None]:
    """Chain the thinking parser in front of the DeepSeek V3.2 parser.

    Args:
        source: Upstream stream of generation responses (``None`` entries
            are passed straight to ``parse_thinking_models``).
        starts_in_thinking: Forwarded to ``parse_thinking_models``; set it
            when the stream should be treated as already inside a thinking
            block.

    Returns:
        The generator produced by ``parse_deepseek_v32`` when it consumes
        the output of ``parse_thinking_models``.
    """
    # Stage 1: thinking-tag handling, using the DSML thinking markers.
    thinking_stream = parse_thinking_models(
        source,
        think_start=THINKING_START,
        think_end=THINKING_END,
        starts_in_thinking=starts_in_thinking,
    )
    # Stage 2: DSML tool-call parsing over the already-tagged stream.
    return parse_deepseek_v32(thinking_stream)
# ── Shared fixtures ──────────────────────────────────────────────
@@ -333,9 +351,7 @@ class TestE2EThinkingAndToolCall:
assert prompt.endswith(THINKING_START)
# Simulate: model outputs <think>, thinks, closes thinking, then tool call.
# In the full pipeline, parse_thinking_models handles the case where
# <think> is in the prompt. Here we test parse_deepseek_v32 directly,
# which detects <think>/<think> markers in the stream.
# Use the full production chain (parse_thinking_models → parse_deepseek_v32).
model_tokens = [
THINKING_START,
"The user wants weather",
@@ -353,7 +369,7 @@ class TestE2EThinkingAndToolCall:
TOOL_CALLS_END,
]
results = list(parse_deepseek_v32(_simulate_tokens(model_tokens)))
results = list(_parse_deepseek_with_thinking(_simulate_tokens(model_tokens)))
gen_results = [r for r in results if isinstance(r, GenerationResponse)]
tool_results = [r for r in results if isinstance(r, ToolCallResponse)]
@@ -387,7 +403,7 @@ class TestE2EThinkingAndToolCall:
prompt_no_think = encode_messages(
messages, tools=_WEATHER_TOOLS, thinking_mode="chat"
)
assert prompt_no_think.endswith(THINKING_END)
assert not prompt_no_think.endswith(THINKING_START)
# Both should have the same tool definitions
assert "get_weather" in prompt_think
@@ -597,7 +613,9 @@ class TestE2EFullRoundTrip:
f"</{DSML_TOKEN}invoke>\n",
TOOL_CALLS_END,
]
results_1 = list(parse_deepseek_v32(_simulate_tokens(model_tokens_1)))
results_1 = list(
_parse_deepseek_with_thinking(_simulate_tokens(model_tokens_1))
)
# Verify: thinking tokens + tool call
gen_1 = [r for r in results_1 if isinstance(r, GenerationResponse)]
@@ -660,7 +678,9 @@ class TestE2EFullRoundTrip:
THINKING_END,
"The weather in Hangzhou is currently cloudy with temperatures between 7°C and 13°C.",
]
results_2 = list(parse_deepseek_v32(_simulate_tokens(model_tokens_2)))
results_2 = list(
_parse_deepseek_with_thinking(_simulate_tokens(model_tokens_2))
)
gen_2 = [r for r in results_2 if isinstance(r, GenerationResponse)]
tool_2 = [r for r in results_2 if isinstance(r, ToolCallResponse)]

View File

@@ -380,6 +380,110 @@ class TestGenericToolCallsFinishReason:
# ── Double parser chain (parse_thinking_models → parse_deepseek_v32) ──
class TestDeepSeekV32StartsInThinking:
    """Regression tests for DeepSeek V3.2 streams that begin inside a thinking block.

    The chat template appends <think> to the prompt, so the generated stream
    carries no opening tag and the parser chain is started with
    ``starts_in_thinking=True``.
    """

    @staticmethod
    def _chain(stream):
        """Feed *stream* through parse_thinking_models, then parse_deepseek_v32."""
        thinking_stream = parse_thinking_models(
            _queue_source(stream),
            think_start=THINKING_START,
            think_end=THINKING_END,
            starts_in_thinking=True,
        )
        return parse_deepseek_v32(thinking_stream)

    def test_reasoning_tagged_when_starts_in_thinking(self):
        """Chunks before the closing tag are flagged as thinking; later ones are not."""
        pieces = ["let me", " think", THINKING_END, "\n"]
        stream = [_make_response(text, idx) for idx, text in enumerate(pieces)]
        stream.append(_make_response("42", 4, finish_reason="stop"))

        outputs = _step_until_finish(self._chain(stream))
        generations = [r for r in outputs if isinstance(r, GenerationResponse)]

        # Non-final chunks: the closing tag itself is not expected in the output.
        streamed = [
            (r.text, r.is_thinking) for r in generations if r.finish_reason is None
        ]
        assert streamed == [("let me", True), (" think", True), ("\n", False)]

        # Exactly one final chunk, carrying the answer outside of thinking.
        finished = [r for r in generations if r.finish_reason is not None]
        assert len(finished) == 1
        last = finished[0]
        assert last.text == "42"
        assert last.is_thinking is False

    def test_starts_in_thinking_then_tool_call(self):
        """A DSML tool call after the closing tag is still emitted as a tool call."""
        pieces = [
            "need weather",
            THINKING_END,
            "\n\n",
            TOOL_CALLS_START,
            "\n",
            f'<{DSML_TOKEN}invoke name="get_weather">\n',
            f'<{DSML_TOKEN}parameter name="city" string="true">NYC</{DSML_TOKEN}parameter>\n',
            f"</{DSML_TOKEN}invoke>\n",
        ]
        stream = [_make_response(text, idx) for idx, text in enumerate(pieces)]
        stream.append(_make_response(TOOL_CALLS_END, 8, finish_reason="stop"))

        outputs = _step_until_finish(self._chain(stream))

        # Only the text ahead of the closing tag should be marked as reasoning.
        thinking_texts = [
            r.text
            for r in outputs
            if isinstance(r, GenerationResponse)
            and r.finish_reason is None
            and r.is_thinking
        ]
        assert thinking_texts == ["need weather"]

        calls = [r for r in outputs if isinstance(r, ToolCallResponse)]
        assert len(calls) == 1
        assert calls[0].tool_calls[0].name == "get_weather"

    def test_reasoning_tokens_counted_starts_in_thinking(self):
        """count_reasoning_tokens reports the chunks emitted while thinking."""
        usage = Usage(
            prompt_tokens=10,
            completion_tokens=5,
            total_tokens=15,
            prompt_tokens_details=PromptTokensDetails(cached_tokens=0),
            completion_tokens_details=CompletionTokensDetails(reasoning_tokens=0),
        )
        pieces = ["reasoning", " more", THINKING_END, "\n"]
        stream = [_make_response(text, idx) for idx, text in enumerate(pieces)]
        # The final chunk carries the usage object that the counter updates.
        stream.append(
            GenerationResponse(text="42", token=4, finish_reason="stop", usage=usage)
        )

        outputs = _step_until_finish(count_reasoning_tokens(self._chain(stream)))

        finished = [
            r
            for r in outputs
            if isinstance(r, GenerationResponse) and r.finish_reason is not None
        ]
        assert len(finished) == 1
        final_usage = finished[0].usage
        assert final_usage is not None
        # Two chunks ("reasoning", " more") precede the closing tag.
        assert final_usage.completion_tokens_details.reasoning_tokens == 2
class TestBatchGeneratorSingleNext:
def test_finish_reason_with_buffered_tokens_drain_loop(self):
from exo.worker.runner.llm_inference.batch_generator import GeneratorQueue

32
uv.lock generated
View File

@@ -395,7 +395,7 @@ dependencies = [
{ name = "mflux", marker = "sys_platform == 'darwin' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
{ name = "mlx", version = "0.31.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
{ name = "mlx", version = "0.31.2.dev20260422+ec49d18e", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#ec49d18ec4cfba0e0c7a37f20d1cf4d75fe56731" }, marker = "sys_platform == 'darwin' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
{ name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#62bbd5b2ef8d29d02dd08af303c22518341a46b0" }, marker = "sys_platform == 'darwin' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
{ name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#c7010341e1f41ac15815feb5dc55134f44e3b044" }, marker = "sys_platform == 'darwin' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
{ name = "mlx-vlm", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
{ name = "msgspec", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
{ name = "openai-harmony", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
@@ -418,19 +418,19 @@ cpu = [
{ name = "mlx", version = "0.31.1", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra == 'extra-3-exo-cpu') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
{ name = "mlx", version = "0.31.2.dev20260422+ec49d18e", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#ec49d18ec4cfba0e0c7a37f20d1cf4d75fe56731" }, marker = "(sys_platform == 'darwin' and extra == 'extra-3-exo-cpu') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
{ name = "mlx-cpu", marker = "sys_platform == 'linux'" },
{ name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#62bbd5b2ef8d29d02dd08af303c22518341a46b0" }, marker = "sys_platform == 'linux'" },
{ name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#c7010341e1f41ac15815feb5dc55134f44e3b044" }, marker = "sys_platform == 'linux'" },
]
cuda12 = [
{ name = "mlx", version = "0.31.1", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13')" },
{ name = "mlx", version = "0.31.2.dev20260422+ec49d18e", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#ec49d18ec4cfba0e0c7a37f20d1cf4d75fe56731" }, marker = "(sys_platform == 'darwin' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13')" },
{ name = "mlx-cuda-12", marker = "sys_platform == 'linux'" },
{ name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#62bbd5b2ef8d29d02dd08af303c22518341a46b0" }, marker = "sys_platform == 'linux'" },
{ name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#c7010341e1f41ac15815feb5dc55134f44e3b044" }, marker = "sys_platform == 'linux'" },
]
cuda13 = [
{ name = "mlx", version = "0.31.1", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13')" },
{ name = "mlx", version = "0.31.2.dev20260422+ec49d18e", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#ec49d18ec4cfba0e0c7a37f20d1cf4d75fe56731" }, marker = "(sys_platform == 'darwin' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13')" },
{ name = "mlx-cuda-13", marker = "sys_platform == 'linux'" },
{ name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#62bbd5b2ef8d29d02dd08af303c22518341a46b0" }, marker = "sys_platform == 'linux'" },
{ name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#c7010341e1f41ac15815feb5dc55134f44e3b044" }, marker = "sys_platform == 'linux'" },
]
[package.dev-dependencies]
@@ -1326,7 +1326,7 @@ wheels = [
[[package]]
name = "mlx-lm"
version = "0.31.3"
source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#62bbd5b2ef8d29d02dd08af303c22518341a46b0" }
source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#c7010341e1f41ac15815feb5dc55134f44e3b044" }
resolution-markers = [
"sys_platform == 'darwin'",
"sys_platform == 'linux'",
@@ -1353,7 +1353,7 @@ dependencies = [
{ name = "mlx", version = "0.31.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
{ name = "mlx", version = "0.31.2.dev20260422+ec49d18e", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#ec49d18ec4cfba0e0c7a37f20d1cf4d75fe56731" }, marker = "sys_platform == 'darwin' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
{ name = "mlx-lm", version = "0.31.3", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra != 'extra-3-exo-cpu' and extra != 'extra-3-exo-cuda12' and extra != 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
{ name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#62bbd5b2ef8d29d02dd08af303c22518341a46b0" }, marker = "sys_platform == 'darwin' or (sys_platform == 'linux' and extra == 'extra-3-exo-cpu') or (sys_platform == 'linux' and extra == 'extra-3-exo-cuda12') or (sys_platform == 'linux' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
{ name = "mlx-lm", version = "0.31.3", source = { git = "https://github.com/rltakashige/mlx-lm?branch=leo%2Ffix-arrayscache-leak#c7010341e1f41ac15815feb5dc55134f44e3b044" }, marker = "sys_platform == 'darwin' or (sys_platform == 'linux' and extra == 'extra-3-exo-cpu') or (sys_platform == 'linux' and extra == 'extra-3-exo-cuda12') or (sys_platform == 'linux' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
{ name = "numpy", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
{ name = "opencv-python", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
{ name = "pillow", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
@@ -2634,7 +2634,7 @@ wheels = [
[[package]]
name = "transformers"
version = "5.3.0"
version = "5.2.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "huggingface-hub", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
@@ -2645,11 +2645,11 @@ dependencies = [
{ name = "safetensors", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
{ name = "tokenizers", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
{ name = "tqdm", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
{ name = "typer", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
{ name = "typer-slim", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
]
sdist = { url = "https://files.pythonhosted.org/packages/fc/1a/70e830d53ecc96ce69cfa8de38f163712d2b43ac52fbd743f39f56025c31/transformers-5.3.0.tar.gz", hash = "sha256:009555b364029da9e2946d41f1c5de9f15e6b1df46b189b7293f33a161b9c557", size = 8830831, upload-time = "2026-03-04T17:41:46.119Z" }
sdist = { url = "https://files.pythonhosted.org/packages/bd/7e/8a0c57d562015e5b16c97c1f0b8e0e92ead2c7c20513225dc12c2043ba9f/transformers-5.2.0.tar.gz", hash = "sha256:0088b8b46ccc9eff1a1dca72b5d618a5ee3b1befc3e418c9512b35dea9f9a650", size = 8618176, upload-time = "2026-02-16T18:54:02.867Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/b8/88/ae8320064e32679a5429a2c9ebbc05c2bf32cefb6e076f9b07f6d685a9b4/transformers-5.3.0-py3-none-any.whl", hash = "sha256:50ac8c89c3c7033444fb3f9f53138096b997ebb70d4b5e50a2e810bf12d3d29a", size = 10661827, upload-time = "2026-03-04T17:41:42.722Z" },
{ url = "https://files.pythonhosted.org/packages/4e/93/79754b0ca486e556c2b95d4f5afc66aaf4b260694f3d6e1b51da2d036691/transformers-5.2.0-py3-none-any.whl", hash = "sha256:9ecaf243dc45bee11a7d93f8caf03746accc0cb069181bbf4ad8566c53e854b4", size = 10403304, upload-time = "2026-02-16T18:53:59.699Z" },
]
[[package]]
@@ -2706,6 +2706,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/4a/91/48db081e7a63bb37284f9fbcefda7c44c277b18b0e13fbc36ea2335b71e6/typer-0.24.1-py3-none-any.whl", hash = "sha256:112c1f0ce578bfb4cab9ffdabc68f031416ebcc216536611ba21f04e9aa84c9e", size = 56085, upload-time = "2026-02-21T16:54:41.616Z" },
]
[[package]]
name = "typer-slim"
version = "0.24.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "typer", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda12') or (extra == 'extra-3-exo-cpu' and extra == 'extra-3-exo-cuda13') or (extra == 'extra-3-exo-cuda12' and extra == 'extra-3-exo-cuda13')" },
]
sdist = { url = "https://files.pythonhosted.org/packages/a7/a7/e6aecc4b4eb59598829a3b5076a93aff291b4fdaa2ded25efc4e1f4d219c/typer_slim-0.24.0.tar.gz", hash = "sha256:f0ed36127183f52ae6ced2ecb2521789995992c521a46083bfcdbb652d22ad34", size = 4776, upload-time = "2026-02-16T22:08:51.2Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a7/24/5480c20380dfd18cf33d14784096dca45a24eae6102e91d49a718d3b6855/typer_slim-0.24.0-py3-none-any.whl", hash = "sha256:d5d7ee1ee2834d5020c7c616ed5e0d0f29b9a4b1dd283bdebae198ec09778d0e", size = 3394, upload-time = "2026-02-16T22:08:49.92Z" },
]
[[package]]
name = "types-aiofiles"
version = "25.1.0.20251011"