mirror of
https://github.com/exo-explore/exo.git
synced 2026-02-04 19:22:39 -05:00
failures
This commit is contained in:
@@ -101,15 +101,11 @@ def _should_use_serial_processing(
|
||||
"""
|
||||
Determine if a ChatCompletion task requires serial processing.
|
||||
|
||||
- GPT-OSS models have mixed cache types (KVCache + RotatingKVCache) that
- don't work reliably with BatchGenerator's batched prefill.
+ Currently always returns False - batch mode handles all cases.
+ Post-processing (GPT-OSS, thinking models, tool calls) can be applied
+ per-request to the individual streams from the batch generator.
|
||||
"""
|
||||
- from mlx_lm.models.gpt_oss import Model as GptOssModel
-
- # GPT-OSS models don't work reliably with batched generation
- if isinstance(model, GptOssModel):
-     return True
-
+ # All tasks can use batch mode - post-processing is per-request
|
||||
return False
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user