failures

2026-02-04 19:22:39 -05:00 · 2026-02-02 21:33:16 +00:00
parent edb2015607
commit 8aeeb46d2f
1 changed files with 4 additions and 8 deletions
--- a/src/exo/worker/runner/runner.py
+++ b/src/exo/worker/runner/runner.py
@@ -101,15 +101,11 @@ def _should_use_serial_processing(
    """
    Determine if a ChatCompletion task requires serial processing.

-    GPT-OSS models have mixed cache types (KVCache + RotatingKVCache) that
-    don't work reliably with BatchGenerator's batched prefill.
+    Currently always returns False: batch mode handles all cases.
+    Post-processing for GPT-OSS, thinking models, and tool calls can be
+    applied per-request to the individual streams from the batch generator.
    """
-    from mlx_lm.models.gpt_oss import Model as GptOssModel
-
-    # GPT-OSS models don't work reliably with batched generation
-    if isinstance(model, GptOssModel):
-        return True
-
+    # All tasks can use batch mode - post-processing is per-request
    return False