test(e2e-backends): add tools capability + HF model name support

Extends tests/e2e-backends to cover backends that:
- Resolve HuggingFace model ids natively (vllm, vllm-omni) instead of
  loading a local file: BACKEND_TEST_MODEL_NAME is passed verbatim as
  ModelOptions.Model with no download/ModelFile.
- Parse tool calls into ChatDelta.tool_calls: new "tools" capability
  sends a Predict with a get_weather function definition and asserts
  the Reply contains a matching ToolCallDelta. Uses UseTokenizerTemplate
  with OpenAI-style Messages so the backend can wire tools into the
  model's chat template.
- Need backend-specific Options[]: BACKEND_TEST_OPTIONS lets a test set
  e.g. "tool_parser:hermes,reasoning_parser:qwen3" at LoadModel time.

Adds make target test-extra-backend-vllm that:
- docker-build-vllm
- loads Qwen/Qwen2.5-0.5B-Instruct
- runs health,load,predict,stream,tools with tool_parser:hermes

Drops backend/python/vllm/test_{cpu_inference,tool_calls}.py — those
standalone scripts were scaffolding used while bringing up the Python
backend; the e2e-backends harness now covers the same ground uniformly
alongside llama-cpp and ik-llama-cpp.
This commit is contained in:
Ettore Di Giacinto
2026-04-12 14:51:58 +00:00
parent 034a60bf76
commit e7f406169a
4 changed files with 141 additions and 247 deletions

View File

@@ -1,101 +0,0 @@
#!/usr/bin/env python3
"""End-to-end CPU inference smoke test for the vllm backend.
Spawns the gRPC backend server, loads a small Qwen model, runs Predict,
TokenizeString, and Free, and verifies non-empty output.
Usage:
python test_cpu_inference.py [--model MODEL_ID] [--addr HOST:PORT]
Defaults to Qwen/Qwen2.5-0.5B-Instruct (Qwen3.5-0.6B is not yet published
on the HuggingFace hub at the time of writing).
"""
import argparse
import os
import subprocess
import sys
import time
import grpc
# Make sibling backend_pb2 importable
HERE = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, HERE)
import backend_pb2
import backend_pb2_grpc
def main():
    """Smoke-test the vllm gRPC backend on CPU.

    Spawns ``backend.py`` as a subprocess, waits for its gRPC channel to
    become ready, then exercises LoadModel, Predict, TokenizeString, and
    Free, asserting on each response.  The server process is always torn
    down, even on failure.

    Raises:
        RuntimeError: if the backend server does not become ready in time.
        AssertionError: if any RPC reports failure or returns empty output.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default=os.environ.get("TEST_MODEL", "Qwen/Qwen2.5-0.5B-Instruct"))
    parser.add_argument("--addr", default="127.0.0.1:50099")
    parser.add_argument("--prompt", default="Hello, how are you?")
    args = parser.parse_args()
    # Force CPU mode for vLLM
    env = os.environ.copy()
    env.setdefault("VLLM_TARGET_DEVICE", "cpu")
    env.setdefault("VLLM_CPU_KVCACHE_SPACE", "4")
    server_proc = subprocess.Popen(
        [sys.executable, os.path.join(HERE, "backend.py"), "--addr", args.addr],
        env=env,
        stdout=sys.stdout,
        stderr=sys.stderr,
    )
    try:
        # Wait for the server to come up.  Track readiness with an explicit
        # flag: grpc.insecure_channel() succeeds even when nothing is
        # listening, so the previous `channel is None` check could never
        # fire after the first iteration and startup failures surfaced
        # later as confusing RPC errors instead of a clear RuntimeError.
        deadline = time.time() + 30
        channel = None
        ready = False
        while time.time() < deadline:
            try:
                channel = grpc.insecure_channel(args.addr)
                grpc.channel_ready_future(channel).result(timeout=2)
                ready = True
                break
            except Exception:
                time.sleep(0.5)
        if not ready:
            raise RuntimeError("backend server did not start in time")
        stub = backend_pb2_grpc.BackendStub(channel)
        print(f"[test] LoadModel({args.model})", flush=True)
        # Long timeout: first load may download the model from the HF hub.
        load_resp = stub.LoadModel(backend_pb2.ModelOptions(
            Model=args.model,
            ContextSize=2048,
        ), timeout=900)
        assert load_resp.success, f"LoadModel failed: {load_resp.message}"
        print(f"[test] Predict prompt={args.prompt!r}", flush=True)
        reply = stub.Predict(backend_pb2.PredictOptions(
            Prompt=args.prompt,
            Tokens=64,
            Temperature=0.7,
            TopP=0.9,
        ), timeout=600)
        text = reply.message.decode("utf-8")
        print(f"[test] Predict output: {text!r}", flush=True)
        assert text.strip(), "Predict returned empty text"
        print("[test] TokenizeString", flush=True)
        tok_resp = stub.TokenizeString(backend_pb2.PredictOptions(Prompt="hello world"), timeout=30)
        print(f"[test] TokenizeString length={tok_resp.length}", flush=True)
        assert tok_resp.length > 0
        print("[test] Free", flush=True)
        free_resp = stub.Free(backend_pb2.MemoryUsageData(), timeout=30)
        assert free_resp.success, f"Free failed: {free_resp.message}"
        print("[test] PASS", flush=True)
    finally:
        # Always tear the server down; escalate to SIGKILL if it lingers.
        server_proc.terminate()
        try:
            server_proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            server_proc.kill()
if __name__ == "__main__":
    main()

View File

@@ -1,134 +0,0 @@
#!/usr/bin/env python3
"""End-to-end CPU tool-calling test for the vllm backend.
Loads Qwen2.5-0.5B-Instruct with the hermes tool parser, sends a chat
completion with a `get_weather` tool, and checks that the reply's
ChatDelta contains a ToolCallDelta for that function.
"""
import argparse
import json
import os
import subprocess
import sys
import time
import grpc
HERE = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, HERE)
import backend_pb2
import backend_pb2_grpc
def main():
    """End-to-end CPU tool-calling test for the vllm backend.

    Spawns ``backend.py`` as a subprocess, loads the model with the hermes
    tool parser, sends a chat completion carrying a ``get_weather`` tool
    definition, and asserts that at least one matching ToolCallDelta was
    extracted into the reply's chat_deltas.

    Returns:
        0 on success (propagated to the process exit code by the guard).

    Raises:
        RuntimeError: if the backend server does not become ready in time.
        AssertionError: if no ``get_weather`` tool call is extracted.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="Qwen/Qwen2.5-0.5B-Instruct")
    parser.add_argument("--addr", default="127.0.0.1:50098")
    args = parser.parse_args()
    # Force CPU mode for vLLM.
    env = os.environ.copy()
    env.setdefault("VLLM_TARGET_DEVICE", "cpu")
    env.setdefault("VLLM_CPU_KVCACHE_SPACE", "4")
    server_proc = subprocess.Popen(
        [sys.executable, os.path.join(HERE, "backend.py"), "--addr", args.addr],
        env=env,
        stdout=sys.stdout,
        stderr=sys.stderr,
    )
    try:
        # Wait for the server to come up.  Track readiness with an explicit
        # flag: grpc.insecure_channel() succeeds even when nothing is
        # listening, so checking `channel is None` after the deadline could
        # never detect a startup failure.
        deadline = time.time() + 30
        channel = None
        ready = False
        while time.time() < deadline:
            try:
                channel = grpc.insecure_channel(args.addr)
                grpc.channel_ready_future(channel).result(timeout=2)
                ready = True
                break
            except Exception:
                time.sleep(0.5)
        if not ready:
            raise RuntimeError("backend server did not start in time")
        stub = backend_pb2_grpc.BackendStub(channel)
        print(f"[test] LoadModel({args.model}) with hermes tool_parser", flush=True)
        # Long timeout: first load may download the model from the HF hub.
        load_resp = stub.LoadModel(backend_pb2.ModelOptions(
            Model=args.model,
            ContextSize=2048,
            Options=["tool_parser:hermes"],
        ), timeout=900)
        assert load_resp.success, f"LoadModel failed: {load_resp.message}"
        # OpenAI-style tool schema, serialized into PredictOptions.Tools.
        tools = [{
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "The city and state, e.g. San Francisco, CA",
                        },
                    },
                    "required": ["location"],
                },
            },
        }]
        messages = [
            backend_pb2.Message(role="system", content="You are a helpful assistant. Use the get_weather tool when the user asks about weather."),
            backend_pb2.Message(role="user", content="What's the weather like in Paris, France?"),
        ]
        print("[test] Predict with tool definitions", flush=True)
        # UseTokenizerTemplate lets the backend wire the tools into the
        # model's chat template; low temperature keeps the call deterministic.
        reply = stub.Predict(backend_pb2.PredictOptions(
            Messages=messages,
            Tools=json.dumps(tools),
            ToolChoice="auto",
            UseTokenizerTemplate=True,
            Tokens=200,
            Temperature=0.1,
        ), timeout=600)
        text = reply.message.decode("utf-8")
        print(f"[test] Raw message: {text!r}", flush=True)
        print(f"[test] prompt_tokens={reply.prompt_tokens} tokens={reply.tokens}", flush=True)
        print(f"[test] chat_deltas count: {len(reply.chat_deltas)}", flush=True)
        tool_calls_seen = []
        for delta in reply.chat_deltas:
            print(f"[test] delta.content={delta.content!r}", flush=True)
            print(f"[test] delta.reasoning_content={delta.reasoning_content!r}", flush=True)
            for tc in delta.tool_calls:
                print(f"[test] tool_call idx={tc.index} id={tc.id!r} name={tc.name!r} args={tc.arguments!r}", flush=True)
                tool_calls_seen.append(tc)
        # Verify at least one tool call was extracted
        assert len(tool_calls_seen) > 0, (
            "No tool calls in ChatDelta. "
            f"Raw text was: {text!r}"
        )
        assert any(tc.name == "get_weather" for tc in tool_calls_seen), (
            f"Expected get_weather tool call, got: {[tc.name for tc in tool_calls_seen]}"
        )
        print("[test] Free", flush=True)
        # NOTE(review): the sibling smoke test passes MemoryUsageData() to
        # Free; verify against the .proto which request type Free expects.
        stub.Free(backend_pb2.HealthMessage(), timeout=30)
        print("[test] PASS", flush=True)
        return 0
    finally:
        # Always tear the server down; escalate to SIGKILL if terminate
        # fails or the process does not exit within the grace period.
        try:
            server_proc.terminate()
            server_proc.wait(timeout=10)
        except Exception:
            server_proc.kill()
if __name__ == "__main__":
    sys.exit(main())