mirror of
https://github.com/exo-explore/exo.git
synced 2026-05-19 04:05:23 -04:00
## Motivation
Streaming /v1/chat/completions responses emitted `null` for `tool_calls`,
`function_call`, `name`, and `tool_call_id` in every delta chunk. The OpenAI
streaming spec marks these fields as non-nullable — they must either carry a
real value or be absent entirely. Spec-correct clients doing
`delta.get("tool_calls", [])` receive `None` (the key is present, so the
default is not used) and crash with `'NoneType' object is not iterable`.
Root cause: the streaming serialisation path called `model_dump_json()`
without `exclude_none=True`, while the request-parsing path already used it
correctly. Four call sites in chat_completions.py (three in
generate_chat_stream, one in collect_chat_response) and two in responses.py
were affected.
## Testing
Before — every delta carries explicit nulls:
$ curl -sN -X POST http://localhost:52415/v1/chat/completions \
-H 'Content-Type: application/json' \
-d '{"model":"mlx-community/Qwen3.5-2B-MLX-8bit","messages":[{"role":"user","content":"hi"}],"max_tokens":3,"stream":true}' \
| grep "^data: "
data:
{"id":"7c4dae10-...","choices":[{"index":0,"delta":{"role":"assistant","c
ontent":null,"reasoning_content":"Okay","name":null,"tool_calls":null,"tool_cal
l_id":null,"function_call":null},"logprobs":null,"finish_reason":null,"usage":n
ull}],"usage":null,"service_tier":null}
data:
{"id":"7c4dae10-...","choices":[{"index":0,"delta":{"role":"assistant","c
ontent":null,"reasoning_content":",","name":null,"tool_calls":null,"tool_call_i
d":null,"function_call":null},"logprobs":null,"finish_reason":null,"usage":null
}],"usage":null,"service_tier":null}
data:
{"id":"7c4dae10-...","choices":[{"index":0,"delta":{"role":"assistant","c
ontent":"
the","reasoning_content":null,"name":null,"tool_calls":null,"tool_cal
l_id":null,"function_call":null},"logprobs":null,"finish_reason":"length","usag
e":{"prompt_tokens":11,...}}],"usage":null,"service_tier":null}
data: [DONE]
After — only populated fields are emitted:
data:
{"id":"demo","object":"chat.completion","created":...,"model":"mlx-commun
ity/Qwen3.5-2B-MLX-8bit","choices":[{"index":0,"delta":{"role":"assistant","rea
soning_content":"Okay"}}]}
data:
{"id":"demo","object":"chat.completion","created":...,"model":"mlx-commun
ity/Qwen3.5-2B-MLX-8bit","choices":[{"index":0,"delta":{"role":"assistant","rea
soning_content":","}}]}
data:
{"id":"demo","object":"chat.completion","created":...,"model":"mlx-commun
ity/Qwen3.5-2B-MLX-8bit","choices":[{"index":0,"delta":{"role":"assistant","con
tent":"
the"},"finish_reason":"length"}],"usage":{"prompt_tokens":11,"completio
n_tokens":3,"total_tokens":14,...}}
data: [DONE]
This commit is contained in:
@@ -238,7 +238,7 @@ async def generate_chat_stream(
|
||||
code=500,
|
||||
)
|
||||
)
|
||||
yield f"data: {error_response.model_dump_json()}\n\n"
|
||||
yield f"data: {error_response.model_dump_json(exclude_none=True)}\n\n"
|
||||
yield "data: [DONE]\n\n"
|
||||
return
|
||||
|
||||
@@ -269,7 +269,7 @@ async def generate_chat_stream(
|
||||
],
|
||||
usage=last_usage,
|
||||
)
|
||||
yield f"data: {tool_response.model_dump_json()}\n\n"
|
||||
yield f"data: {tool_response.model_dump_json(exclude_none=True)}\n\n"
|
||||
if chunk.stats is not None:
|
||||
yield f": generation_stats {chunk.stats.model_dump_json()}\n\n"
|
||||
yield "data: [DONE]\n\n"
|
||||
@@ -283,7 +283,7 @@ async def generate_chat_stream(
|
||||
chunk_response = chunk_response.model_copy(
|
||||
update={"usage": last_usage}
|
||||
)
|
||||
yield f"data: {chunk_response.model_dump_json()}\n\n"
|
||||
yield f"data: {chunk_response.model_dump_json(exclude_none=True)}\n\n"
|
||||
|
||||
if chunk.finish_reason is not None:
|
||||
if chunk.stats is not None:
|
||||
@@ -379,5 +379,5 @@ async def collect_chat_response(
|
||||
)
|
||||
],
|
||||
usage=last_usage,
|
||||
).model_dump_json()
|
||||
).model_dump_json(exclude_none=True)
|
||||
return
|
||||
|
||||
@@ -101,7 +101,7 @@ def _build_response_usage(usage: Usage) -> ResponseUsage:
|
||||
|
||||
def _format_sse(event: ResponsesStreamEvent) -> str:
|
||||
"""Format a streaming event as an SSE message."""
|
||||
return f"event: {event.type}\ndata: {event.model_dump_json()}\n\n"
|
||||
return f"event: {event.type}\ndata: {event.model_dump_json(exclude_none=True)}\n\n"
|
||||
|
||||
|
||||
def _extract_content(content: str | list[ResponseContentPart]) -> str:
|
||||
@@ -471,7 +471,7 @@ async def collect_responses_response(
|
||||
output=output,
|
||||
output_text=accumulated_text,
|
||||
usage=usage,
|
||||
).model_dump_json()
|
||||
).model_dump_json(exclude_none=True)
|
||||
return
|
||||
|
||||
|
||||
|
||||
195
src/exo/api/tests/test_chat_completions_stream.py
Normal file
195
src/exo/api/tests/test_chat_completions_stream.py
Normal file
@@ -0,0 +1,195 @@
|
||||
# pyright: reportAny=false
|
||||
"""Tests asserting OpenAI-spec wire shape for /v1/chat/completions deltas."""
|
||||
|
||||
import json
|
||||
from collections.abc import AsyncGenerator
|
||||
from typing import Any
|
||||
|
||||
from exo.api.adapters.chat_completions import (
|
||||
collect_chat_response,
|
||||
generate_chat_stream,
|
||||
)
|
||||
from exo.api.types import (
|
||||
CompletionTokensDetails,
|
||||
PromptTokensDetails,
|
||||
ToolCallItem,
|
||||
Usage,
|
||||
)
|
||||
from exo.shared.types.chunks import (
|
||||
ErrorChunk,
|
||||
PrefillProgressChunk,
|
||||
TokenChunk,
|
||||
ToolCallChunk,
|
||||
)
|
||||
from exo.shared.types.common import CommandId, ModelId
|
||||
|
||||
# Model identifier passed as ``model=`` to every chunk built in these tests.
_TEST_MODEL = ModelId("test-model")
|
||||
# Delta keys that _assert_delta_spec_compliant tolerates being null; a null
# under any other key makes that helper raise.
_NULLABLE_DELTA_FIELDS = {"content", "refusal"}
|
||||
|
||||
|
||||
def _make_usage(prompt_tokens: int = 1, completion_tokens: int = 1) -> Usage:
    """Build a ``Usage`` whose total is the sum of the two token counts.

    Both token-detail sub-objects are created with no arguments.
    """
    prompt_details = PromptTokensDetails()
    completion_details = CompletionTokensDetails()
    combined = prompt_tokens + completion_tokens
    return Usage(
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        total_tokens=combined,
        prompt_tokens_details=prompt_details,
        completion_tokens_details=completion_details,
    )
|
||||
|
||||
|
||||
async def _stream(
    chunks: list[PrefillProgressChunk | ErrorChunk | ToolCallChunk | TokenChunk],
) -> AsyncGenerator[
    PrefillProgressChunk | ErrorChunk | ToolCallChunk | TokenChunk, None
]:
    """Replay ``chunks`` in list order as an async generator."""
    for item in chunks:
        yield item
|
||||
|
||||
|
||||
def _parse_data_events(lines: list[str]) -> list[dict[str, Any]]:
|
||||
events: list[dict[str, Any]] = []
|
||||
for line in lines:
|
||||
for sub in line.split("\n"):
|
||||
if sub.startswith("data: ") and not sub.endswith("[DONE]"):
|
||||
events.append(json.loads(sub[len("data: ") :]))
|
||||
return events
|
||||
|
||||
|
||||
def _assert_delta_spec_compliant(delta: dict[str, Any]) -> None:
    """Fail if ``delta`` holds a null under a key that may not be null.

    Keys listed in ``_NULLABLE_DELTA_FIELDS`` may be null; the first other
    key whose value is ``None`` triggers the error.

    Raises:
        AssertionError: naming the offending key and showing the full delta.
    """
    for key in delta:
        if delta[key] is not None or key in _NULLABLE_DELTA_FIELDS:
            continue
        raise AssertionError(
            f"delta.{key} is null but spec requires it to be absent or a value; "
            f"full delta={delta!r}"
        )
|
||||
|
||||
|
||||
class TestTokenStreamDeltaShape:
    """Wire-shape checks for stream deltas produced from ``TokenChunk`` input."""

    async def test_token_chunk_delta_has_no_disallowed_nulls(self):
        """Two text tokens give two data events with no disallowed null keys."""
        opening = TokenChunk(
            model=_TEST_MODEL,
            token_id=1,
            text="Hello",
            usage=None,
        )
        closing = TokenChunk(
            model=_TEST_MODEL,
            token_id=2,
            text=" world",
            usage=_make_usage(),
            finish_reason="stop",
        )
        chunks: list[PrefillProgressChunk | ErrorChunk | ToolCallChunk | TokenChunk] = [
            opening,
            closing,
        ]
        lines: list[str] = [
            emitted
            async for emitted in generate_chat_stream(
                CommandId("test-cmd-token"), _stream(chunks)
            )
        ]

        events = _parse_data_events(lines)
        assert len(events) == 2
        for parsed in events:
            delta = parsed["choices"][0]["delta"]
            _assert_delta_spec_compliant(delta)
            # tool_calls may be omitted, but when present it must be a list.
            assert isinstance(delta.get("tool_calls", []), list)
            assert "function_call" not in delta
            assert "name" not in delta
            assert "tool_call_id" not in delta

    async def test_thinking_chunk_delta_has_no_disallowed_nulls(self):
        """A thinking token lands in ``reasoning_content`` and omits ``content``."""
        thinking = TokenChunk(
            model=_TEST_MODEL,
            token_id=1,
            text="Hmm",
            usage=None,
            is_thinking=True,
        )
        chunks: list[PrefillProgressChunk | ErrorChunk | ToolCallChunk | TokenChunk] = [
            thinking,
        ]
        lines: list[str] = [
            emitted
            async for emitted in generate_chat_stream(
                CommandId("test-cmd-thinking"), _stream(chunks)
            )
        ]

        events = _parse_data_events(lines)
        assert len(events) == 1
        (only_event,) = events
        delta = only_event["choices"][0]["delta"]
        _assert_delta_spec_compliant(delta)
        assert delta.get("reasoning_content") == "Hmm"
        assert "content" not in delta
|
||||
|
||||
|
||||
class TestToolCallStreamDeltaShape:
    """Wire-shape checks for stream deltas produced from ``ToolCallChunk`` input."""

    async def test_tool_call_chunk_delta_has_array_tool_calls(self):
        """A tool-call chunk gives one event whose ``tool_calls`` is a list."""
        weather_call = ToolCallItem(id="call_1", name="get_weather", arguments="{}")
        chunks: list[PrefillProgressChunk | ErrorChunk | ToolCallChunk | TokenChunk] = [
            ToolCallChunk(
                model=_TEST_MODEL,
                tool_calls=[weather_call],
                usage=_make_usage(),
            ),
        ]
        lines: list[str] = [
            emitted
            async for emitted in generate_chat_stream(
                CommandId("test-cmd-tool"), _stream(chunks)
            )
        ]

        events = _parse_data_events(lines)
        assert len(events) == 1
        delta = events[0]["choices"][0]["delta"]
        _assert_delta_spec_compliant(delta)
        tool_calls = delta["tool_calls"]
        assert isinstance(tool_calls, list)
        assert tool_calls[0]["function"]["name"] == "get_weather"
|
||||
|
||||
|
||||
class TestErrorStreamShape:
    """Wire-shape checks for the event emitted when the stream has an ``ErrorChunk``."""

    async def test_error_chunk_response_has_no_nulls(self):
        """An error chunk gives one event whose ``error`` object has no null values."""
        chunks: list[PrefillProgressChunk | ErrorChunk | ToolCallChunk | TokenChunk] = [
            ErrorChunk(model=_TEST_MODEL, error_message="boom"),
        ]
        lines: list[str] = [
            emitted
            async for emitted in generate_chat_stream(
                CommandId("test-cmd-err"), _stream(chunks)
            )
        ]

        events = _parse_data_events(lines)
        assert len(events) == 1
        error_body = events[0]["error"]
        assert error_body["message"] == "boom"
        for field_value in error_body.values():
            assert field_value is not None
|
||||
|
||||
|
||||
class TestNonStreamingResponseShape:
    """Wire-shape checks for the single JSON body from ``collect_chat_response``."""

    async def test_collected_response_message_has_no_disallowed_nulls(self):
        """The collected message may be null only under an allowed set of keys."""
        chunks: list[PrefillProgressChunk | ErrorChunk | ToolCallChunk | TokenChunk] = [
            TokenChunk(
                model=_TEST_MODEL,
                token_id=1,
                text="Hello",
                usage=_make_usage(),
                finish_reason="stop",
            ),
        ]
        parts: list[str] = [
            piece
            async for piece in collect_chat_response(
                CommandId("test-cmd-nonstream"), _stream(chunks)
            )
        ]

        assert len(parts) == 1
        (body,) = parts
        message = json.loads(body)["choices"][0]["message"]
        for key, value in message.items():
            if value is not None:
                continue
            assert key in {"content", "refusal", "reasoning_content"}, (
                f"non-streaming message.{key} is null but spec disallows it"
            )
        assert "function_call" not in message
        assert "name" not in message
        assert "tool_call_id" not in message
|
||||
Reference in New Issue
Block a user