From 8aeeb46d2f40e4f26600d7bb9fa4355cb7d8d4ff Mon Sep 17 00:00:00 2001 From: Ryuichi Leo Takashige Date: Mon, 2 Feb 2026 21:33:16 +0000 Subject: [PATCH] failures --- src/exo/worker/runner/runner.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/exo/worker/runner/runner.py b/src/exo/worker/runner/runner.py index 3a5619f4..1c278994 100644 --- a/src/exo/worker/runner/runner.py +++ b/src/exo/worker/runner/runner.py @@ -101,15 +101,11 @@ def _should_use_serial_processing( """ Determine if a ChatCompletion task requires serial processing. - GPT-OSS models have mixed cache types (KVCache + RotatingKVCache) that - don't work reliably with BatchGenerator's batched prefill. + Currently always returns False - batch mode handles all cases. + Post-processing (GPT-OSS, thinking models, tool calls) can be applied + per-request to the individual streams from the batch generator. """ - from mlx_lm.models.gpt_oss import Model as GptOssModel - - # GPT-OSS models don't work reliably with batched generation - if isinstance(model, GptOssModel): - return True - + # All tasks can use batch mode - post-processing is per-request return False