From 791001824983e5388941baf5c63d56f6f044cbac Mon Sep 17 00:00:00 2001
From: pos-ei-don <1822533+pos-ei-don@users.noreply.github.com>
Date: Thu, 2 Jul 2026 09:26:14 +0200
Subject: [PATCH] fix(vllm): non-streaming tool-call regression after #10351 (#10638)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fix(vllm): non-streaming tool-call regression after #10351
(native_streaming is a capability flag, not a state flag)

#10351 introduced native streaming via
`parser.extract_tool_calls_streaming` and gated the post-loop
`extract_tool_calls` block on
`native_streaming and not native_streaming_error`. That works for
streaming requests, but for non-streaming requests the same flag is
still True (it only means "the parser can stream", not "we actually
streamed"), so the block was skipped and the `elif` branch set
`content = ""` — the tool call was silently lost.

Symptom: non-streaming chat.completions with `tools=[...]` returns
`finish_reason: "stop"` with `content: ""` and no `tool_calls`.
Streaming requests are unaffected.

Fix: gate both branches on `streaming` too, so the extract_tool_calls
block runs for non-streaming requests (and for streaming requests that
fell back to the buffered path).

Reproduction (vLLM 0.24, Qwen3-Coder-Next-NVFP4, qwen3_coder parser):

  curl -s -X POST http://localhost:8080/v1/chat/completions \
    -H 'Content-Type: application/json' \
    -d '{"model":"coder","stream":false,
         "messages":[{"role":"user","content":"7*8 via calc"}],
         "tools":[{"type":"function","function":{"name":"calc",
           "parameters":{"type":"object",
             "properties":{"expression":{"type":"string"}}}}}]}'

  Before: finish_reason: "stop", content: "", tool_calls: []
  After:  finish_reason: "tool_calls", tool_calls[0].function.name: "calc"

Streaming path re-verified in the same setup: delta.tool_calls arrives
token-by-token, finish_reason: "tool_calls", no raw XML in content.

Signed-off-by: pos-ei-don <1822533+pos-ei-don@users.noreply.github.com> --- backend/python/vllm/backend.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py index 1e93f26e2..7d85daf94 100644 --- a/backend/python/vllm/backend.py +++ b/backend/python/vllm/backend.py @@ -748,7 +748,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): # When (A) native streaming ran cleanly, per-delta yields above already # delivered everything — do NOT extract again on the full text or we'd # duplicate content/tool_calls into the final chunk. - if has_tool_parser and not (native_streaming and not native_streaming_error): + # NOTE: `native_streaming` is a capability flag ("streaming parser is + # available"), not a state flag ("streaming actually ran"). For + # non-streaming requests it is still True but the per-delta loop was + # never entered, so we MUST still run extract_tool_calls here. Hence + # the explicit `streaming and …` guard on both branches. + if has_tool_parser and not (streaming and native_streaming and not native_streaming_error): try: tp = tp_instance if tp is None: @@ -770,7 +775,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): )) except Exception as e: print(f"Tool parser error: {e}", file=sys.stderr) - elif native_streaming and not native_streaming_error: + elif streaming and native_streaming and not native_streaming_error: # Per-delta path already emitted content + tool_calls; the final # chat_delta should carry only metadata (token counts, logprobs). content = ""