From c99188f10694d8f6af7adaf5be5079dd1e84f7e6 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 12 Apr 2026 09:15:16 +0000 Subject: [PATCH] fix(vllm): tool parser constructor compat + e2e tool calling test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Concrete vLLM tool parsers override the abstract base's __init__ and drop the tools kwarg (e.g. Hermes2ProToolParser only takes tokenizer). Instantiating with tools= raised TypeError which was silently caught, leaving chat_deltas.tool_calls empty. Retry the constructor without the tools kwarg on TypeError — tools aren't required by these parsers since extract_tool_calls finds tool syntax in the raw model output directly. Validated with Qwen/Qwen2.5-0.5B-Instruct + hermes parser on CPU: the backend correctly returns ToolCallDelta{name='get_weather', arguments='{"location": "Paris, France"}'} in ChatDelta. test_tool_calls.py is a standalone smoke test that spawns the gRPC backend, sends a chat completion with tools, and asserts the response contains a structured tool call. --- backend/python/vllm/backend.py | 8 +- backend/python/vllm/test_tool_calls.py | 134 +++++++++++++++++++++++++ 2 files changed, 141 insertions(+), 1 deletion(-) create mode 100644 backend/python/vllm/test_tool_calls.py diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py index cfb69a684..95ae95a9d 100644 --- a/backend/python/vllm/backend.py +++ b/backend/python/vllm/backend.py @@ -453,7 +453,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): if self.tool_parser_cls and request.Tools: try: tools = json.loads(request.Tools) - tp = self.tool_parser_cls(self.tokenizer, tools=tools) + # Some concrete parsers only accept the tokenizer; only the + # abstract base declares the tools kwarg. Try with tools first, + # fall back to tokenizer-only. 
+                try:
+                    tp = self.tool_parser_cls(self.tokenizer, tools=tools)
+                except TypeError:
+                    tp = self.tool_parser_cls(self.tokenizer)
                 info = tp.extract_tool_calls(content, request=None)
                 if info.tools_called:
                     content = info.content or ""
diff --git a/backend/python/vllm/test_tool_calls.py b/backend/python/vllm/test_tool_calls.py
new file mode 100644
index 000000000..12b36f6f2
--- /dev/null
+++ b/backend/python/vllm/test_tool_calls.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+"""End-to-end CPU tool-calling test for the vllm backend.
+
+Loads Qwen2.5-0.5B-Instruct with the hermes tool parser, sends a chat
+completion with a `get_weather` tool, and checks that the reply's
+ChatDelta contains a ToolCallDelta for that function.
+"""
+import argparse
+import json
+import os
+import subprocess
+import sys
+import time
+
+import grpc
+
+HERE = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, HERE)
+
+import backend_pb2
+import backend_pb2_grpc
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", default="Qwen/Qwen2.5-0.5B-Instruct")
+    parser.add_argument("--addr", default="127.0.0.1:50098")
+    args = parser.parse_args()
+
+    env = os.environ.copy()
+    env.setdefault("VLLM_TARGET_DEVICE", "cpu")
+    env.setdefault("VLLM_CPU_KVCACHE_SPACE", "4")
+
+    server_proc = subprocess.Popen(
+        [sys.executable, os.path.join(HERE, "backend.py"), "--addr", args.addr],
+        env=env,
+        stdout=sys.stdout,
+        stderr=sys.stderr,
+    )
+
+    try:
+        deadline = time.time() + 30
+        # Channels connect lazily: only the ready-future proves the server is up.
+        channel = grpc.insecure_channel(args.addr)
+        ready = grpc.channel_ready_future(channel)
+        while True:
+            try:
+                ready.result(timeout=2)
+                break
+            except grpc.FutureTimeoutError:
+                if time.time() >= deadline or server_proc.poll() is not None:
+                    raise RuntimeError("backend server did not start in time")
+
+        stub = backend_pb2_grpc.BackendStub(channel)
+
+        print(f"[test] LoadModel({args.model}) with hermes tool_parser", flush=True)
+        load_resp = stub.LoadModel(backend_pb2.ModelOptions(
+            Model=args.model,
+            ContextSize=2048,
+            Options=["tool_parser:hermes"],
+        ), timeout=900)
+        assert load_resp.success, f"LoadModel failed: {load_resp.message}"
+
+        tools = [{
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather for a location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {
+                            "type": "string",
+                            "description": "The city and state, e.g. San Francisco, CA",
+                        },
+                    },
+                    "required": ["location"],
+                },
+            },
+        }]
+
+        messages = [
+            backend_pb2.Message(role="system", content="You are a helpful assistant. Use the get_weather tool when the user asks about weather."),
+            backend_pb2.Message(role="user", content="What's the weather like in Paris, France?"),
+        ]
+
+        print("[test] Predict with tool definitions", flush=True)
+        reply = stub.Predict(backend_pb2.PredictOptions(
+            Messages=messages,
+            Tools=json.dumps(tools),
+            ToolChoice="auto",
+            UseTokenizerTemplate=True,
+            Tokens=200,
+            Temperature=0.1,
+        ), timeout=600)
+
+        text = reply.message.decode("utf-8")
+        print(f"[test] Raw message: {text!r}", flush=True)
+        print(f"[test] prompt_tokens={reply.prompt_tokens} tokens={reply.tokens}", flush=True)
+        print(f"[test] chat_deltas count: {len(reply.chat_deltas)}", flush=True)
+
+        tool_calls_seen = []
+        for delta in reply.chat_deltas:
+            print(f"[test] delta.content={delta.content!r}", flush=True)
+            print(f"[test] delta.reasoning_content={delta.reasoning_content!r}", flush=True)
+            for tc in delta.tool_calls:
+                print(f"[test] tool_call idx={tc.index} id={tc.id!r} name={tc.name!r} args={tc.arguments!r}", flush=True)
+                tool_calls_seen.append(tc)
+
+        # Verify at least one tool call was extracted
+        assert len(tool_calls_seen) > 0, (
+            "No tool calls in ChatDelta. "
+            f"Raw text was: {text!r}"
+        )
+        assert any(tc.name == "get_weather" for tc in tool_calls_seen), (
+            f"Expected get_weather tool call, got: {[tc.name for tc in tool_calls_seen]}"
+        )
+
+        print("[test] Free", flush=True)
+        stub.Free(backend_pb2.HealthMessage(), timeout=30)
+
+        print("[test] PASS", flush=True)
+        return 0
+
+    finally:
+        try:
+            server_proc.terminate()
+            server_proc.wait(timeout=10)
+        except Exception:
+            server_proc.kill()
+
+
+if __name__ == "__main__":
+    sys.exit(main())