This commit is contained in:
Ryuichi Leo Takashige
2026-02-02 21:33:16 +00:00
parent edb2015607
commit 8aeeb46d2f

View File

@@ -101,15 +101,11 @@ def _should_use_serial_processing(
"""
Determine if a ChatCompletion task requires serial processing.
GPT-OSS models have mixed cache types (KVCache + RotatingKVCache) that
don't work reliably with BatchGenerator's batched prefill.
Currently always returns False - batch mode handles all cases.
Post-processing (GPT-OSS, thinking models, tool calls) can be applied
per-request to the individual streams from the batch generator.
"""
from mlx_lm.models.gpt_oss import Model as GptOssModel
# GPT-OSS models don't work reliably with batched generation
if isinstance(model, GptOssModel):
return True
# All tasks can use batch mode - post-processing is per-request
return False