From c99188f10694d8f6af7adaf5be5079dd1e84f7e6 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Sun, 12 Apr 2026 09:15:16 +0000
Subject: [PATCH] fix(vllm): tool parser constructor compat + e2e tool calling
 test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Concrete vLLM tool parsers override the abstract base's __init__ and
drop the tools kwarg (e.g. Hermes2ProToolParser only takes tokenizer).
Instantiating with tools= raised TypeError which was silently caught,
leaving chat_deltas.tool_calls empty.

Retry the constructor without the tools kwarg on TypeError — tools
aren't required by these parsers since extract_tool_calls finds tool
syntax in the raw model output directly.

Validated with Qwen/Qwen2.5-0.5B-Instruct + hermes parser on CPU:
the backend correctly returns ToolCallDelta{name='get_weather',
arguments='{"location": "Paris, France"}'} in ChatDelta.

test_tool_calls.py is a standalone smoke test that spawns the gRPC
backend, sends a chat completion with tools, and asserts the response
contains a structured tool call.
---
 backend/python/vllm/backend.py         |   8 +-
 backend/python/vllm/test_tool_calls.py | 134 +++++++++++++++++++++++++
 2 files changed, 141 insertions(+), 1 deletion(-)
 create mode 100644 backend/python/vllm/test_tool_calls.py

diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
index cfb69a684..95ae95a9d 100644
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -453,7 +453,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         if self.tool_parser_cls and request.Tools:
             try:
                 tools = json.loads(request.Tools)
-                tp = self.tool_parser_cls(self.tokenizer, tools=tools)
+                # Some concrete parsers only accept the tokenizer; only the
+                # abstract base declares the tools kwarg. Try with tools first,
+                # fall back to tokenizer-only.
+                try:
+                    tp = self.tool_parser_cls(self.tokenizer, tools=tools)
+                except TypeError:
+                    tp = self.tool_parser_cls(self.tokenizer)
                 info = tp.extract_tool_calls(content, request=None)
                 if info.tools_called:
                     content = info.content or ""
diff --git a/backend/python/vllm/test_tool_calls.py b/backend/python/vllm/test_tool_calls.py
new file mode 100644
index 000000000..12b36f6f2
--- /dev/null
+++ b/backend/python/vllm/test_tool_calls.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+"""End-to-end CPU tool-calling test for the vllm backend.
+
+Loads Qwen2.5-0.5B-Instruct with the hermes tool parser, sends a chat
+completion with a `get_weather` tool, and checks that the reply's
+ChatDelta contains a ToolCallDelta for that function.
+"""
+import argparse
+import json
+import os
+import subprocess
+import sys
+import time
+
+import grpc
+
+HERE = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, HERE)
+
+import backend_pb2
+import backend_pb2_grpc
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", default="Qwen/Qwen2.5-0.5B-Instruct")
+    parser.add_argument("--addr", default="127.0.0.1:50098")
+    args = parser.parse_args()
+
+    env = os.environ.copy()
+    env.setdefault("VLLM_TARGET_DEVICE", "cpu")
+    env.setdefault("VLLM_CPU_KVCACHE_SPACE", "4")
+
+    server_proc = subprocess.Popen(
+        [sys.executable, os.path.join(HERE, "backend.py"), "--addr", args.addr],
+        env=env,
+        stdout=sys.stdout,
+        stderr=sys.stderr,
+    )
+
+    try:
+        deadline = time.time() + 30
+        channel = None
+        while time.time() < deadline:
+            try:
+                channel = grpc.insecure_channel(args.addr)
+                grpc.channel_ready_future(channel).result(timeout=2)
+                break
+            except Exception:
+                time.sleep(0.5)
+        if channel is None:
+            raise RuntimeError("backend server did not start in time")
+
+        stub = backend_pb2_grpc.BackendStub(channel)
+
+        print(f"[test] LoadModel({args.model}) with hermes tool_parser", flush=True)
+        load_resp = stub.LoadModel(backend_pb2.ModelOptions(
+            Model=args.model,
+            ContextSize=2048,
+            Options=["tool_parser:hermes"],
+        ), timeout=900)
+        assert load_resp.success, f"LoadModel failed: {load_resp.message}"
+
+        tools = [{
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather for a location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {
+                            "type": "string",
+                            "description": "The city and state, e.g. San Francisco, CA",
+                        },
+                    },
+                    "required": ["location"],
+                },
+            },
+        }]
+
+        messages = [
+            backend_pb2.Message(role="system", content="You are a helpful assistant. Use the get_weather tool when the user asks about weather."),
+            backend_pb2.Message(role="user", content="What's the weather like in Paris, France?"),
+        ]
+
+        print("[test] Predict with tool definitions", flush=True)
+        reply = stub.Predict(backend_pb2.PredictOptions(
+            Messages=messages,
+            Tools=json.dumps(tools),
+            ToolChoice="auto",
+            UseTokenizerTemplate=True,
+            Tokens=200,
+            Temperature=0.1,
+        ), timeout=600)
+
+        text = reply.message.decode("utf-8")
+        print(f"[test] Raw message: {text!r}", flush=True)
+        print(f"[test] prompt_tokens={reply.prompt_tokens} tokens={reply.tokens}", flush=True)
+        print(f"[test] chat_deltas count: {len(reply.chat_deltas)}", flush=True)
+
+        tool_calls_seen = []
+        for delta in reply.chat_deltas:
+            print(f"[test] delta.content={delta.content!r}", flush=True)
+            print(f"[test] delta.reasoning_content={delta.reasoning_content!r}", flush=True)
+            for tc in delta.tool_calls:
+                print(f"[test] tool_call idx={tc.index} id={tc.id!r} name={tc.name!r} args={tc.arguments!r}", flush=True)
+                tool_calls_seen.append(tc)
+
+        # Verify at least one tool call was extracted
+        assert len(tool_calls_seen) > 0, (
+            "No tool calls in ChatDelta. "
+            f"Raw text was: {text!r}"
+        )
+        assert any(tc.name == "get_weather" for tc in tool_calls_seen), (
+            f"Expected get_weather tool call, got: {[tc.name for tc in tool_calls_seen]}"
+        )
+
+        print("[test] Free", flush=True)
+        stub.Free(backend_pb2.HealthMessage(), timeout=30)
+
+        print("[test] PASS", flush=True)
+        return 0
+
+    finally:
+        try:
+            server_proc.terminate()
+            server_proc.wait(timeout=10)
+        except Exception:
+            server_proc.kill()
+
+
+if __name__ == "__main__":
+    sys.exit(main())