From e7f406169a765b98463eecb0dad78f3909ba39d2 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 12 Apr 2026 14:51:58 +0000 Subject: [PATCH] test(e2e-backends): add tools capability + HF model name support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends tests/e2e-backends to cover backends that: - Resolve HuggingFace model ids natively (vllm, vllm-omni) instead of loading a local file: BACKEND_TEST_MODEL_NAME is passed verbatim as ModelOptions.Model with no download/ModelFile. - Parse tool calls into ChatDelta.tool_calls: new "tools" capability sends a Predict with a get_weather function definition and asserts the Reply contains a matching ToolCallDelta. Uses UseTokenizerTemplate with OpenAI-style Messages so the backend can wire tools into the model's chat template. - Need backend-specific Options[]: BACKEND_TEST_OPTIONS lets a test set e.g. "tool_parser:hermes,reasoning_parser:qwen3" at LoadModel time. Adds make target test-extra-backend-vllm that: - docker-build-vllm - loads Qwen/Qwen2.5-0.5B-Instruct - runs health,load,predict,stream,tools with tool_parser:hermes Drops backend/python/vllm/test_{cpu_inference,tool_calls}.py — those standalone scripts were scaffolding used while bringing up the Python backend; the e2e-backends harness now covers the same ground uniformly alongside llama-cpp and ik-llama-cpp. --- Makefile | 21 +++- backend/python/vllm/test_cpu_inference.py | 101 ---------------- backend/python/vllm/test_tool_calls.py | 134 ---------------------- tests/e2e-backends/backend_test.go | 132 +++++++++++++++++++-- 4 files changed, 141 insertions(+), 247 deletions(-) delete mode 100644 backend/python/vllm/test_cpu_inference.py delete mode 100644 backend/python/vllm/test_tool_calls.py diff --git a/Makefile b/Makefile index 6dce83efd..7f61666f5 100644 --- a/Makefile +++ b/Makefile @@ -466,8 +466,14 @@ test-extra: prepare-test-extra ## BACKEND_IMAGE Required. Docker image to test, e.g. 
local-ai-backend:llama-cpp. ## BACKEND_TEST_MODEL_URL URL of a model file to download and load. ## BACKEND_TEST_MODEL_FILE Path to an already-downloaded model (skips download). +## BACKEND_TEST_MODEL_NAME HuggingFace repo id (e.g. Qwen/Qwen2.5-0.5B-Instruct). +## Use this instead of MODEL_URL for backends that +## resolve HF model ids natively (vllm, vllm-omni). ## BACKEND_TEST_CAPS Comma-separated capabilities, default "health,load,predict,stream". +## Add "tools" to exercise ChatDelta tool-call extraction. ## BACKEND_TEST_PROMPT Override the prompt used in predict/stream specs. +## BACKEND_TEST_OPTIONS Comma-separated Options[] entries forwarded to LoadModel, +## e.g. "tool_parser:hermes,reasoning_parser:qwen3". ## ## Direct usage (image already built, no docker-build-* dependency): ## @@ -486,9 +492,13 @@ test-extra-backend: protogen-go BACKEND_IMAGE="$$BACKEND_IMAGE" \ BACKEND_TEST_MODEL_URL="$${BACKEND_TEST_MODEL_URL:-$(BACKEND_TEST_MODEL_URL)}" \ BACKEND_TEST_MODEL_FILE="$$BACKEND_TEST_MODEL_FILE" \ + BACKEND_TEST_MODEL_NAME="$$BACKEND_TEST_MODEL_NAME" \ BACKEND_TEST_CAPS="$$BACKEND_TEST_CAPS" \ BACKEND_TEST_PROMPT="$$BACKEND_TEST_PROMPT" \ - go test -v -timeout 15m ./tests/e2e-backends/... + BACKEND_TEST_OPTIONS="$$BACKEND_TEST_OPTIONS" \ + BACKEND_TEST_TOOL_PROMPT="$$BACKEND_TEST_TOOL_PROMPT" \ + BACKEND_TEST_TOOL_NAME="$$BACKEND_TEST_TOOL_NAME" \ + go test -v -timeout 30m ./tests/e2e-backends/... ## Convenience wrappers: build the image, then exercise it. test-extra-backend-llama-cpp: docker-build-llama-cpp @@ -497,6 +507,15 @@ test-extra-backend-llama-cpp: docker-build-llama-cpp test-extra-backend-ik-llama-cpp: docker-build-ik-llama-cpp BACKEND_IMAGE=local-ai-backend:ik-llama-cpp $(MAKE) test-extra-backend +## vllm is resolved from a HuggingFace model id (no file download) and +## exercises Predict + streaming + tool-call extraction via the hermes parser.
+test-extra-backend-vllm: docker-build-vllm + BACKEND_IMAGE=local-ai-backend:vllm \ + BACKEND_TEST_MODEL_NAME=Qwen/Qwen2.5-0.5B-Instruct \ + BACKEND_TEST_CAPS=health,load,predict,stream,tools \ + BACKEND_TEST_OPTIONS=tool_parser:hermes \ + $(MAKE) test-extra-backend + DOCKER_IMAGE?=local-ai IMAGE_TYPE?=core BASE_IMAGE?=ubuntu:24.04 diff --git a/backend/python/vllm/test_cpu_inference.py b/backend/python/vllm/test_cpu_inference.py deleted file mode 100644 index ff606b5bf..000000000 --- a/backend/python/vllm/test_cpu_inference.py +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env python3 -"""End-to-end CPU inference smoke test for the vllm backend. - -Spawns the gRPC backend server, loads a small Qwen model, runs Predict, -TokenizeString, and Free, and verifies non-empty output. - -Usage: - python test_cpu_inference.py [--model MODEL_ID] [--addr HOST:PORT] - -Defaults to Qwen/Qwen2.5-0.5B-Instruct (Qwen3.5-0.6B is not yet published -on the HuggingFace hub at the time of writing). -""" -import argparse -import os -import subprocess -import sys -import time - -import grpc - -# Make sibling backend_pb2 importable -HERE = os.path.dirname(os.path.abspath(__file__)) -sys.path.insert(0, HERE) - -import backend_pb2 -import backend_pb2_grpc - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", default=os.environ.get("TEST_MODEL", "Qwen/Qwen2.5-0.5B-Instruct")) - parser.add_argument("--addr", default="127.0.0.1:50099") - parser.add_argument("--prompt", default="Hello, how are you?") - args = parser.parse_args() - - # Force CPU mode for vLLM - env = os.environ.copy() - env.setdefault("VLLM_TARGET_DEVICE", "cpu") - env.setdefault("VLLM_CPU_KVCACHE_SPACE", "4") - - server_proc = subprocess.Popen( - [sys.executable, os.path.join(HERE, "backend.py"), "--addr", args.addr], - env=env, - stdout=sys.stdout, - stderr=sys.stderr, - ) - - try: - # Wait for the server to come up - deadline = time.time() + 30 - channel = None - while time.time() < deadline: - 
try: - channel = grpc.insecure_channel(args.addr) - grpc.channel_ready_future(channel).result(timeout=2) - break - except Exception: - time.sleep(0.5) - if channel is None: - raise RuntimeError("backend server did not start in time") - - stub = backend_pb2_grpc.BackendStub(channel) - - print(f"[test] LoadModel({args.model})", flush=True) - load_resp = stub.LoadModel(backend_pb2.ModelOptions( - Model=args.model, - ContextSize=2048, - ), timeout=900) - assert load_resp.success, f"LoadModel failed: {load_resp.message}" - - print(f"[test] Predict prompt={args.prompt!r}", flush=True) - reply = stub.Predict(backend_pb2.PredictOptions( - Prompt=args.prompt, - Tokens=64, - Temperature=0.7, - TopP=0.9, - ), timeout=600) - text = reply.message.decode("utf-8") - print(f"[test] Predict output: {text!r}", flush=True) - assert text.strip(), "Predict returned empty text" - - print("[test] TokenizeString", flush=True) - tok_resp = stub.TokenizeString(backend_pb2.PredictOptions(Prompt="hello world"), timeout=30) - print(f"[test] TokenizeString length={tok_resp.length}", flush=True) - assert tok_resp.length > 0 - - print("[test] Free", flush=True) - free_resp = stub.Free(backend_pb2.MemoryUsageData(), timeout=30) - assert free_resp.success, f"Free failed: {free_resp.message}" - - print("[test] PASS", flush=True) - finally: - server_proc.terminate() - try: - server_proc.wait(timeout=10) - except subprocess.TimeoutExpired: - server_proc.kill() - - -if __name__ == "__main__": - main() diff --git a/backend/python/vllm/test_tool_calls.py b/backend/python/vllm/test_tool_calls.py deleted file mode 100644 index 12b36f6f2..000000000 --- a/backend/python/vllm/test_tool_calls.py +++ /dev/null @@ -1,134 +0,0 @@ -#!/usr/bin/env python3 -"""End-to-end CPU tool-calling test for the vllm backend. - -Loads Qwen2.5-0.5B-Instruct with the hermes tool parser, sends a chat -completion with a `get_weather` tool, and checks that the reply's -ChatDelta contains a ToolCallDelta for that function. 
-""" -import argparse -import json -import os -import subprocess -import sys -import time - -import grpc - -HERE = os.path.dirname(os.path.abspath(__file__)) -sys.path.insert(0, HERE) - -import backend_pb2 -import backend_pb2_grpc - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", default="Qwen/Qwen2.5-0.5B-Instruct") - parser.add_argument("--addr", default="127.0.0.1:50098") - args = parser.parse_args() - - env = os.environ.copy() - env.setdefault("VLLM_TARGET_DEVICE", "cpu") - env.setdefault("VLLM_CPU_KVCACHE_SPACE", "4") - - server_proc = subprocess.Popen( - [sys.executable, os.path.join(HERE, "backend.py"), "--addr", args.addr], - env=env, - stdout=sys.stdout, - stderr=sys.stderr, - ) - - try: - deadline = time.time() + 30 - channel = None - while time.time() < deadline: - try: - channel = grpc.insecure_channel(args.addr) - grpc.channel_ready_future(channel).result(timeout=2) - break - except Exception: - time.sleep(0.5) - if channel is None: - raise RuntimeError("backend server did not start in time") - - stub = backend_pb2_grpc.BackendStub(channel) - - print(f"[test] LoadModel({args.model}) with hermes tool_parser", flush=True) - load_resp = stub.LoadModel(backend_pb2.ModelOptions( - Model=args.model, - ContextSize=2048, - Options=["tool_parser:hermes"], - ), timeout=900) - assert load_resp.success, f"LoadModel failed: {load_resp.message}" - - tools = [{ - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The city and state, e.g. San Francisco, CA", - }, - }, - "required": ["location"], - }, - }, - }] - - messages = [ - backend_pb2.Message(role="system", content="You are a helpful assistant. 
Use the get_weather tool when the user asks about weather."), - backend_pb2.Message(role="user", content="What's the weather like in Paris, France?"), - ] - - print("[test] Predict with tool definitions", flush=True) - reply = stub.Predict(backend_pb2.PredictOptions( - Messages=messages, - Tools=json.dumps(tools), - ToolChoice="auto", - UseTokenizerTemplate=True, - Tokens=200, - Temperature=0.1, - ), timeout=600) - - text = reply.message.decode("utf-8") - print(f"[test] Raw message: {text!r}", flush=True) - print(f"[test] prompt_tokens={reply.prompt_tokens} tokens={reply.tokens}", flush=True) - print(f"[test] chat_deltas count: {len(reply.chat_deltas)}", flush=True) - - tool_calls_seen = [] - for delta in reply.chat_deltas: - print(f"[test] delta.content={delta.content!r}", flush=True) - print(f"[test] delta.reasoning_content={delta.reasoning_content!r}", flush=True) - for tc in delta.tool_calls: - print(f"[test] tool_call idx={tc.index} id={tc.id!r} name={tc.name!r} args={tc.arguments!r}", flush=True) - tool_calls_seen.append(tc) - - # Verify at least one tool call was extracted - assert len(tool_calls_seen) > 0, ( - "No tool calls in ChatDelta. " - f"Raw text was: {text!r}" - ) - assert any(tc.name == "get_weather" for tc in tool_calls_seen), ( - f"Expected get_weather tool call, got: {[tc.name for tc in tool_calls_seen]}" - ) - - print("[test] Free", flush=True) - stub.Free(backend_pb2.HealthMessage(), timeout=30) - - print("[test] PASS", flush=True) - return 0 - - finally: - try: - server_proc.terminate() - server_proc.wait(timeout=10) - except Exception: - server_proc.kill() - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/tests/e2e-backends/backend_test.go b/tests/e2e-backends/backend_test.go index a800a7ab5..b6f59fd28 100644 --- a/tests/e2e-backends/backend_test.go +++ b/tests/e2e-backends/backend_test.go @@ -29,18 +29,30 @@ import ( // // BACKEND_TEST_MODEL_URL HTTP(S) URL of a model file to download before the test. 
// BACKEND_TEST_MODEL_FILE Path to an already-available model file (skips download). +// BACKEND_TEST_MODEL_NAME HuggingFace model id (e.g. "Qwen/Qwen2.5-0.5B-Instruct"). +// Passed verbatim as ModelOptions.Model; backends like vllm +// resolve it themselves and no local file is downloaded. // // Optional: // // BACKEND_TEST_CAPS Comma-separated list of capabilities to exercise. -// Supported values: health, load, predict, stream, embeddings. +// Supported values: health, load, predict, stream, +// embeddings, tools. // Defaults to "health,load,predict,stream". // A backend that only does embeddings would set this to // "health,load,embeddings"; an image/TTS backend that cannot // be driven by a text prompt can set it to "health,load". +// "tools" asks the backend to extract a tool call from the +// model output into ChatDelta.tool_calls. // BACKEND_TEST_PROMPT Override the prompt used by predict/stream specs. // BACKEND_TEST_CTX_SIZE Override the context size passed to LoadModel (default 512). // BACKEND_TEST_THREADS Override Threads passed to LoadModel (default 4). +// BACKEND_TEST_OPTIONS Comma-separated Options[] entries passed to LoadModel, +// e.g. "tool_parser:hermes,reasoning_parser:qwen3". +// BACKEND_TEST_TOOL_PROMPT Override the user prompt for the tools spec +// (default: "What's the weather like in Paris, France?"). +// BACKEND_TEST_TOOL_NAME Override the function name expected in the tool call +// (default: "get_weather"). // // The suite is intentionally model-format-agnostic: it only ever passes the // file path to LoadModel, so GGUF, ONNX, safetensors, .bin etc. all work so @@ -51,9 +63,12 @@ const ( capPredict = "predict" capStream = "stream" capEmbeddings = "embeddings" + capTools = "tools" - defaultPrompt = "The capital of France is" - streamPrompt = "Once upon a time" + defaultPrompt = "The capital of France is" + streamPrompt = "Once upon a time" + defaultToolPrompt = "What's the weather like in Paris, France?" 
+ defaultToolName = "get_weather" ) func defaultCaps() map[string]bool { @@ -87,12 +102,14 @@ var _ = Describe("Backend container", Ordered, func() { caps map[string]bool workDir string binaryDir string - modelFile string + modelFile string // set when a local file is used + modelName string // set when a HuggingFace model id is used addr string serverCmd *exec.Cmd conn *grpc.ClientConn client pb.BackendClient prompt string + options []string ) BeforeAll(func() { @@ -101,8 +118,9 @@ var _ = Describe("Backend container", Ordered, func() { modelURL := os.Getenv("BACKEND_TEST_MODEL_URL") modelFile = os.Getenv("BACKEND_TEST_MODEL_FILE") - Expect(modelURL != "" || modelFile != "").To(BeTrue(), - "one of BACKEND_TEST_MODEL_URL or BACKEND_TEST_MODEL_FILE must be set") + modelName = os.Getenv("BACKEND_TEST_MODEL_NAME") + Expect(modelURL != "" || modelFile != "" || modelName != "").To(BeTrue(), + "one of BACKEND_TEST_MODEL_URL, BACKEND_TEST_MODEL_FILE, or BACKEND_TEST_MODEL_NAME must be set") caps = parseCaps() GinkgoWriter.Printf("Testing image=%q with capabilities=%v\n", image, keys(caps)) @@ -112,6 +130,15 @@ var _ = Describe("Backend container", Ordered, func() { prompt = defaultPrompt } + if raw := strings.TrimSpace(os.Getenv("BACKEND_TEST_OPTIONS")); raw != "" { + for _, opt := range strings.Split(raw, ",") { + opt = strings.TrimSpace(opt) + if opt != "" { + options = append(options, opt) + } + } + } + var err error workDir, err = os.MkdirTemp("", "backend-e2e-*") Expect(err).NotTo(HaveOccurred()) @@ -122,8 +149,8 @@ var _ = Describe("Backend container", Ordered, func() { extractImage(image, binaryDir) Expect(filepath.Join(binaryDir, "run.sh")).To(BeAnExistingFile()) - // Download the model once if not provided. - if modelFile == "" { + // Download the model once if not provided and no HF name given. 
+ if modelFile == "" && modelName == "" { modelFile = filepath.Join(workDir, "model.bin") downloadFile(modelURL, modelFile) } @@ -196,16 +223,27 @@ var _ = Describe("Backend container", Ordered, func() { ctxSize := envInt32("BACKEND_TEST_CTX_SIZE", 512) threads := envInt32("BACKEND_TEST_THREADS", 4) - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + // Prefer a HuggingFace model id when provided (e.g. for vllm); + // otherwise fall back to a downloaded/local file path. + modelRef := modelFile + var modelPath string + if modelName != "" { + modelRef = modelName + } else { + modelPath = modelFile + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) defer cancel() res, err := client.LoadModel(ctx, &pb.ModelOptions{ - Model: modelFile, - ModelFile: modelFile, + Model: modelRef, + ModelFile: modelPath, ContextSize: ctxSize, Threads: threads, NGPULayers: 0, MMap: true, NBatch: 128, + Options: options, }) Expect(err).NotTo(HaveOccurred()) Expect(res.GetSuccess()).To(BeTrue(), "LoadModel failed: %s", res.GetMessage()) @@ -275,6 +313,78 @@ var _ = Describe("Backend container", Ordered, func() { Expect(res.GetEmbeddings()).NotTo(BeEmpty(), "Embedding returned empty vector") GinkgoWriter.Printf("Embedding: %d dims\n", len(res.GetEmbeddings())) }) + + It("extracts tool calls into ChatDelta", func() { + if !caps[capTools] { + Skip("tools capability not enabled") + } + + toolPrompt := os.Getenv("BACKEND_TEST_TOOL_PROMPT") + if toolPrompt == "" { + toolPrompt = defaultToolPrompt + } + toolName := os.Getenv("BACKEND_TEST_TOOL_NAME") + if toolName == "" { + toolName = defaultToolName + } + + toolsJSON := fmt.Sprintf(`[{ + "type": "function", + "function": { + "name": %q, + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. 
San Francisco, CA" + } + }, + "required": ["location"] + } + } + }]`, toolName) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + res, err := client.Predict(ctx, &pb.PredictOptions{ + Messages: []*pb.Message{ + {Role: "system", Content: "You are a helpful assistant. Use the provided tool when the user asks about weather."}, + {Role: "user", Content: toolPrompt}, + }, + Tools: toolsJSON, + ToolChoice: "auto", + UseTokenizerTemplate: true, + Tokens: 200, + Temperature: 0.1, + }) + Expect(err).NotTo(HaveOccurred()) + + // Collect tool calls from every delta — some backends emit a single + // final delta, others stream incremental pieces in one Reply. + var toolCalls []*pb.ToolCallDelta + for _, delta := range res.GetChatDeltas() { + toolCalls = append(toolCalls, delta.GetToolCalls()...) + } + + GinkgoWriter.Printf("Tool call: raw=%q deltas=%d tool_calls=%d\n", + string(res.GetMessage()), len(res.GetChatDeltas()), len(toolCalls)) + + Expect(toolCalls).NotTo(BeEmpty(), + "Predict did not return any ToolCallDelta. raw=%q", string(res.GetMessage())) + + matched := false + for _, tc := range toolCalls { + GinkgoWriter.Printf(" - idx=%d id=%q name=%q args=%q\n", + tc.GetIndex(), tc.GetId(), tc.GetName(), tc.GetArguments()) + if tc.GetName() == toolName { + matched = true + } + } + Expect(matched).To(BeTrue(), + "Expected a tool call named %q in ChatDelta.tool_calls", toolName) + }) }) // extractImage runs `docker create` + `docker export` to materialise the image