diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py index 1e93f26e2..7d85daf94 100644 --- a/backend/python/vllm/backend.py +++ b/backend/python/vllm/backend.py @@ -748,7 +748,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): # When (A) native streaming ran cleanly, per-delta yields above already # delivered everything — do NOT extract again on the full text or we'd # duplicate content/tool_calls into the final chunk. - if has_tool_parser and not (native_streaming and not native_streaming_error): + # NOTE: `native_streaming` is a capability flag ("streaming parser is + # available"), not a state flag ("streaming actually ran"). For + # non-streaming requests it is still True but the per-delta loop was + # never entered, so we MUST still run extract_tool_calls here. Hence + # the explicit `streaming and …` guard on both branches. + if has_tool_parser and not (streaming and native_streaming and not native_streaming_error): try: tp = tp_instance if tp is None: @@ -770,7 +775,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): )) except Exception as e: print(f"Tool parser error: {e}", file=sys.stderr) - elif native_streaming and not native_streaming_error: + elif streaming and native_streaming and not native_streaming_error: # Per-delta path already emitted content + tool_calls; the final # chat_delta should carry only metadata (token counts, logprobs). content = ""