diff --git a/src/exo/worker/runner/runner.py b/src/exo/worker/runner/runner.py
index 3a5619f4..1c278994 100644
--- a/src/exo/worker/runner/runner.py
+++ b/src/exo/worker/runner/runner.py
@@ -101,15 +101,11 @@ def _should_use_serial_processing(
     """
     Determine if a ChatCompletion task requires serial processing.
 
-    GPT-OSS models have mixed cache types (KVCache + RotatingKVCache) that
-    don't work reliably with BatchGenerator's batched prefill.
+    Currently always returns False - batch mode handles all cases.
+    Post-processing (GPT-OSS, thinking models, tool calls) can be applied
+    per-request to the individual streams from the batch generator.
     """
-    from mlx_lm.models.gpt_oss import Model as GptOssModel
-
-    # GPT-OSS models don't work reliably with batched generation
-    if isinstance(model, GptOssModel):
-        return True
-
+    # All tasks can use batch mode - post-processing is per-request
    return False
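
A minimal sketch of the per-request post-processing idea the new docstring describes: batched prefill/decode stays shared, while each request's stream gets its own filter. All names here (`route_batched_stream`, `postprocess_for`) are hypothetical illustrations, not the actual exo or mlx_lm API.

```python
from typing import Callable, Iterable, Iterator


def postprocess_for(request_id: str) -> Callable[[str], str]:
    # Placeholder per-request filter: the real worker would select a
    # GPT-OSS channel parser, thinking-tag stripper, or tool-call
    # extractor here based on the request's model and options.
    def identity(token: str) -> str:
        return token
    return identity


def route_batched_stream(
    batched: Iterable[tuple[str, str]],  # (request_id, token) pairs
) -> Iterator[tuple[str, str]]:
    # Demultiplex the shared batch stream and apply each request's own
    # post-processing, so model-specific handling is per-request even
    # though generation is batched.
    filters: dict[str, Callable[[str], str]] = {}
    for request_id, token in batched:
        f = filters.setdefault(request_id, postprocess_for(request_id))
        yield request_id, f(token)


# Usage: interleaved tokens from two requests come back tagged and filtered.
mixed = [("a", "Hel"), ("b", "<think>"), ("a", "lo"), ("b", "ok")]
for rid, tok in route_batched_stream(mixed):
    print(rid, tok)
```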