From e7f406169a765b98463eecb0dad78f3909ba39d2 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Sun, 12 Apr 2026 14:51:58 +0000
Subject: [PATCH] test(e2e-backends): add tools capability + HF model name
 support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extends tests/e2e-backends to cover backends that:
- Resolve HuggingFace model ids natively (vllm, vllm-omni) instead of
  loading a local file: BACKEND_TEST_MODEL_NAME is passed verbatim as
  ModelOptions.Model with no download/ModelFile.
- Parse tool calls into ChatDelta.tool_calls: new "tools" capability
  sends a Predict with a get_weather function definition and asserts
  the Reply contains a matching ToolCallDelta. Uses UseTokenizerTemplate
  with OpenAI-style Messages so the backend can wire tools into the
  model's chat template.
- Need backend-specific Options[]: BACKEND_TEST_OPTIONS lets a test set
  e.g. "tool_parser:hermes,reasoning_parser:qwen3" at LoadModel time.

Adds make target test-extra-backend-vllm that:
- builds the image via docker-build-vllm
- loads Qwen/Qwen2.5-0.5B-Instruct
- runs health,load,predict,stream,tools with tool_parser:hermes

Drops backend/python/vllm/test_{cpu_inference,tool_calls}.py — those
standalone scripts were scaffolding used while bringing up the Python
backend; the e2e-backends harness now covers the same ground uniformly
alongside llama-cpp and ik-llama-cpp.
---
 Makefile                                  |  21 +++-
 backend/python/vllm/test_cpu_inference.py | 101 ----------------
 backend/python/vllm/test_tool_calls.py    | 134 ----------------------
 tests/e2e-backends/backend_test.go        | 132 +++++++++++++++++++--
 4 files changed, 141 insertions(+), 247 deletions(-)
 delete mode 100644 backend/python/vllm/test_cpu_inference.py
 delete mode 100644 backend/python/vllm/test_tool_calls.py

diff --git a/Makefile b/Makefile
index 6dce83efd..7f61666f5 100644
--- a/Makefile
+++ b/Makefile
@@ -466,8 +466,14 @@ test-extra: prepare-test-extra
 ##   BACKEND_IMAGE            Required. Docker image to test, e.g. local-ai-backend:llama-cpp.
 ##   BACKEND_TEST_MODEL_URL   URL of a model file to download and load.
 ##   BACKEND_TEST_MODEL_FILE  Path to an already-downloaded model (skips download).
+##   BACKEND_TEST_MODEL_NAME  HuggingFace repo id (e.g. Qwen/Qwen2.5-0.5B-Instruct).
+##                            Use this instead of MODEL_URL for backends that
+##                            resolve HF model ids natively (vllm, vllm-omni).
 ##   BACKEND_TEST_CAPS        Comma-separated capabilities, default "health,load,predict,stream".
+##                            Add "tools" to exercise ChatDelta tool call extraction.
 ##   BACKEND_TEST_PROMPT      Override the prompt used in predict/stream specs.
+##   BACKEND_TEST_OPTIONS     Comma-separated Options[] entries forwarded to LoadModel,
+##                            e.g. "tool_parser:hermes,reasoning_parser:qwen3".
 ##
 ## Direct usage (image already built, no docker-build-* dependency):
 ##
@@ -486,9 +492,13 @@ test-extra-backend: protogen-go
 	BACKEND_IMAGE="$$BACKEND_IMAGE" \
 	BACKEND_TEST_MODEL_URL="$${BACKEND_TEST_MODEL_URL:-$(BACKEND_TEST_MODEL_URL)}" \
 	BACKEND_TEST_MODEL_FILE="$$BACKEND_TEST_MODEL_FILE" \
+	BACKEND_TEST_MODEL_NAME="$$BACKEND_TEST_MODEL_NAME" \
 	BACKEND_TEST_CAPS="$$BACKEND_TEST_CAPS" \
 	BACKEND_TEST_PROMPT="$$BACKEND_TEST_PROMPT" \
-	go test -v -timeout 15m ./tests/e2e-backends/...
+	BACKEND_TEST_OPTIONS="$$BACKEND_TEST_OPTIONS" \
+	BACKEND_TEST_TOOL_PROMPT="$$BACKEND_TEST_TOOL_PROMPT" \
+	BACKEND_TEST_TOOL_NAME="$$BACKEND_TEST_TOOL_NAME" \
+	go test -v -timeout 30m ./tests/e2e-backends/...
 
 ## Convenience wrappers: build the image, then exercise it.
 test-extra-backend-llama-cpp: docker-build-llama-cpp
@@ -497,6 +507,15 @@ test-extra-backend-llama-cpp: docker-build-llama-cpp
 test-extra-backend-ik-llama-cpp: docker-build-ik-llama-cpp
 	BACKEND_IMAGE=local-ai-backend:ik-llama-cpp $(MAKE) test-extra-backend
 
+## vllm is resolved from a HuggingFace model id (no file download) and
+## exercises Predict + streaming + tool-call extraction via the hermes parser.
+test-extra-backend-vllm: docker-build-vllm
+	BACKEND_IMAGE=local-ai-backend:vllm \
+	BACKEND_TEST_MODEL_NAME=Qwen/Qwen2.5-0.5B-Instruct \
+	BACKEND_TEST_CAPS=health,load,predict,stream,tools \
+	BACKEND_TEST_OPTIONS=tool_parser:hermes \
+	$(MAKE) test-extra-backend
+
 DOCKER_IMAGE?=local-ai
 IMAGE_TYPE?=core
 BASE_IMAGE?=ubuntu:24.04
diff --git a/backend/python/vllm/test_cpu_inference.py b/backend/python/vllm/test_cpu_inference.py
deleted file mode 100644
index ff606b5bf..000000000
--- a/backend/python/vllm/test_cpu_inference.py
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/usr/bin/env python3
-"""End-to-end CPU inference smoke test for the vllm backend.
-
-Spawns the gRPC backend server, loads a small Qwen model, runs Predict,
-TokenizeString, and Free, and verifies non-empty output.
-
-Usage:
-    python test_cpu_inference.py [--model MODEL_ID] [--addr HOST:PORT]
-
-Defaults to Qwen/Qwen2.5-0.5B-Instruct (Qwen3.5-0.6B is not yet published
-on the HuggingFace hub at the time of writing).
-"""
-import argparse
-import os
-import subprocess
-import sys
-import time
-
-import grpc
-
-# Make sibling backend_pb2 importable
-HERE = os.path.dirname(os.path.abspath(__file__))
-sys.path.insert(0, HERE)
-
-import backend_pb2
-import backend_pb2_grpc
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", default=os.environ.get("TEST_MODEL", "Qwen/Qwen2.5-0.5B-Instruct"))
-    parser.add_argument("--addr", default="127.0.0.1:50099")
-    parser.add_argument("--prompt", default="Hello, how are you?")
-    args = parser.parse_args()
-
-    # Force CPU mode for vLLM
-    env = os.environ.copy()
-    env.setdefault("VLLM_TARGET_DEVICE", "cpu")
-    env.setdefault("VLLM_CPU_KVCACHE_SPACE", "4")
-
-    server_proc = subprocess.Popen(
-        [sys.executable, os.path.join(HERE, "backend.py"), "--addr", args.addr],
-        env=env,
-        stdout=sys.stdout,
-        stderr=sys.stderr,
-    )
-
-    try:
-        # Wait for the server to come up
-        deadline = time.time() + 30
-        channel = None
-        while time.time() < deadline:
-            try:
-                channel = grpc.insecure_channel(args.addr)
-                grpc.channel_ready_future(channel).result(timeout=2)
-                break
-            except Exception:
-                time.sleep(0.5)
-        if channel is None:
-            raise RuntimeError("backend server did not start in time")
-
-        stub = backend_pb2_grpc.BackendStub(channel)
-
-        print(f"[test] LoadModel({args.model})", flush=True)
-        load_resp = stub.LoadModel(backend_pb2.ModelOptions(
-            Model=args.model,
-            ContextSize=2048,
-        ), timeout=900)
-        assert load_resp.success, f"LoadModel failed: {load_resp.message}"
-
-        print(f"[test] Predict prompt={args.prompt!r}", flush=True)
-        reply = stub.Predict(backend_pb2.PredictOptions(
-            Prompt=args.prompt,
-            Tokens=64,
-            Temperature=0.7,
-            TopP=0.9,
-        ), timeout=600)
-        text = reply.message.decode("utf-8")
-        print(f"[test] Predict output: {text!r}", flush=True)
-        assert text.strip(), "Predict returned empty text"
-
-        print("[test] TokenizeString", flush=True)
-        tok_resp = stub.TokenizeString(backend_pb2.PredictOptions(Prompt="hello world"), timeout=30)
-        print(f"[test] TokenizeString length={tok_resp.length}", flush=True)
-        assert tok_resp.length > 0
-
-        print("[test] Free", flush=True)
-        free_resp = stub.Free(backend_pb2.MemoryUsageData(), timeout=30)
-        assert free_resp.success, f"Free failed: {free_resp.message}"
-
-        print("[test] PASS", flush=True)
-    finally:
-        server_proc.terminate()
-        try:
-            server_proc.wait(timeout=10)
-        except subprocess.TimeoutExpired:
-            server_proc.kill()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/backend/python/vllm/test_tool_calls.py b/backend/python/vllm/test_tool_calls.py
deleted file mode 100644
index 12b36f6f2..000000000
--- a/backend/python/vllm/test_tool_calls.py
+++ /dev/null
@@ -1,134 +0,0 @@
-#!/usr/bin/env python3
-"""End-to-end CPU tool-calling test for the vllm backend.
-
-Loads Qwen2.5-0.5B-Instruct with the hermes tool parser, sends a chat
-completion with a `get_weather` tool, and checks that the reply's
-ChatDelta contains a ToolCallDelta for that function.
-"""
-import argparse
-import json
-import os
-import subprocess
-import sys
-import time
-
-import grpc
-
-HERE = os.path.dirname(os.path.abspath(__file__))
-sys.path.insert(0, HERE)
-
-import backend_pb2
-import backend_pb2_grpc
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", default="Qwen/Qwen2.5-0.5B-Instruct")
-    parser.add_argument("--addr", default="127.0.0.1:50098")
-    args = parser.parse_args()
-
-    env = os.environ.copy()
-    env.setdefault("VLLM_TARGET_DEVICE", "cpu")
-    env.setdefault("VLLM_CPU_KVCACHE_SPACE", "4")
-
-    server_proc = subprocess.Popen(
-        [sys.executable, os.path.join(HERE, "backend.py"), "--addr", args.addr],
-        env=env,
-        stdout=sys.stdout,
-        stderr=sys.stderr,
-    )
-
-    try:
-        deadline = time.time() + 30
-        channel = None
-        while time.time() < deadline:
-            try:
-                channel = grpc.insecure_channel(args.addr)
-                grpc.channel_ready_future(channel).result(timeout=2)
-                break
-            except Exception:
-                time.sleep(0.5)
-        if channel is None:
-            raise RuntimeError("backend server did not start in time")
-
-        stub = backend_pb2_grpc.BackendStub(channel)
-
-        print(f"[test] LoadModel({args.model}) with hermes tool_parser", flush=True)
-        load_resp = stub.LoadModel(backend_pb2.ModelOptions(
-            Model=args.model,
-            ContextSize=2048,
-            Options=["tool_parser:hermes"],
-        ), timeout=900)
-        assert load_resp.success, f"LoadModel failed: {load_resp.message}"
-
-        tools = [{
-            "type": "function",
-            "function": {
-                "name": "get_weather",
-                "description": "Get the current weather for a location",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "location": {
-                            "type": "string",
-                            "description": "The city and state, e.g. San Francisco, CA",
-                        },
-                    },
-                    "required": ["location"],
-                },
-            },
-        }]
-
-        messages = [
-            backend_pb2.Message(role="system", content="You are a helpful assistant. Use the get_weather tool when the user asks about weather."),
-            backend_pb2.Message(role="user", content="What's the weather like in Paris, France?"),
-        ]
-
-        print("[test] Predict with tool definitions", flush=True)
-        reply = stub.Predict(backend_pb2.PredictOptions(
-            Messages=messages,
-            Tools=json.dumps(tools),
-            ToolChoice="auto",
-            UseTokenizerTemplate=True,
-            Tokens=200,
-            Temperature=0.1,
-        ), timeout=600)
-
-        text = reply.message.decode("utf-8")
-        print(f"[test] Raw message: {text!r}", flush=True)
-        print(f"[test] prompt_tokens={reply.prompt_tokens} tokens={reply.tokens}", flush=True)
-        print(f"[test] chat_deltas count: {len(reply.chat_deltas)}", flush=True)
-
-        tool_calls_seen = []
-        for delta in reply.chat_deltas:
-            print(f"[test] delta.content={delta.content!r}", flush=True)
-            print(f"[test] delta.reasoning_content={delta.reasoning_content!r}", flush=True)
-            for tc in delta.tool_calls:
-                print(f"[test] tool_call idx={tc.index} id={tc.id!r} name={tc.name!r} args={tc.arguments!r}", flush=True)
-                tool_calls_seen.append(tc)
-
-        # Verify at least one tool call was extracted
-        assert len(tool_calls_seen) > 0, (
-            "No tool calls in ChatDelta. "
-            f"Raw text was: {text!r}"
-        )
-        assert any(tc.name == "get_weather" for tc in tool_calls_seen), (
-            f"Expected get_weather tool call, got: {[tc.name for tc in tool_calls_seen]}"
-        )
-
-        print("[test] Free", flush=True)
-        stub.Free(backend_pb2.HealthMessage(), timeout=30)
-
-        print("[test] PASS", flush=True)
-        return 0
-
-    finally:
-        try:
-            server_proc.terminate()
-            server_proc.wait(timeout=10)
-        except Exception:
-            server_proc.kill()
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/tests/e2e-backends/backend_test.go b/tests/e2e-backends/backend_test.go
index a800a7ab5..b6f59fd28 100644
--- a/tests/e2e-backends/backend_test.go
+++ b/tests/e2e-backends/backend_test.go
@@ -29,18 +29,30 @@ import (
 //
 //	BACKEND_TEST_MODEL_URL   HTTP(S) URL of a model file to download before the test.
 //	BACKEND_TEST_MODEL_FILE  Path to an already-available model file (skips download).
+//	BACKEND_TEST_MODEL_NAME  HuggingFace model id (e.g. "Qwen/Qwen2.5-0.5B-Instruct").
+//	                         Passed verbatim as ModelOptions.Model; backends like vllm
+//	                         resolve it themselves and no local file is downloaded.
 //
 // Optional:
 //
 //	BACKEND_TEST_CAPS        Comma-separated list of capabilities to exercise.
-//	                         Supported values: health, load, predict, stream, embeddings.
+//	                         Supported values: health, load, predict, stream,
+//	                         embeddings, tools.
 //	                         Defaults to "health,load,predict,stream".
 //	                         A backend that only does embeddings would set this to
 //	                         "health,load,embeddings"; an image/TTS backend that cannot
 //	                         be driven by a text prompt can set it to "health,load".
+//	                         "tools" asks the backend to extract a tool call from the
+//	                         model output into ChatDelta.tool_calls.
 //	BACKEND_TEST_PROMPT      Override the prompt used by predict/stream specs.
 //	BACKEND_TEST_CTX_SIZE    Override the context size passed to LoadModel (default 512).
 //	BACKEND_TEST_THREADS     Override Threads passed to LoadModel (default 4).
+//	BACKEND_TEST_OPTIONS     Comma-separated Options[] entries passed to LoadModel,
+//	                         e.g. "tool_parser:hermes,reasoning_parser:qwen3".
+//	BACKEND_TEST_TOOL_PROMPT Override the user prompt for the tools spec
+//	                         (default: "What's the weather like in Paris, France?").
+//	BACKEND_TEST_TOOL_NAME   Override the function name expected in the tool call
+//	                         (default: "get_weather").
 //
 // The suite is intentionally model-format-agnostic: it only ever passes the
 // file path to LoadModel, so GGUF, ONNX, safetensors, .bin etc. all work so
@@ -51,9 +63,12 @@ const (
 	capPredict    = "predict"
 	capStream     = "stream"
 	capEmbeddings = "embeddings"
+	capTools      = "tools"
 
-	defaultPrompt = "The capital of France is"
-	streamPrompt  = "Once upon a time"
+	defaultPrompt     = "The capital of France is"
+	streamPrompt      = "Once upon a time"
+	defaultToolPrompt = "What's the weather like in Paris, France?"
+	defaultToolName   = "get_weather"
 )
 
 func defaultCaps() map[string]bool {
@@ -87,12 +102,14 @@ var _ = Describe("Backend container", Ordered, func() {
 		caps      map[string]bool
 		workDir   string
 		binaryDir string
-		modelFile string
+		modelFile string // set when a local file is used
+		modelName string // set when a HuggingFace model id is used
 		addr      string
 		serverCmd *exec.Cmd
 		conn      *grpc.ClientConn
 		client    pb.BackendClient
 		prompt    string
+		options   []string
 	)
 
 	BeforeAll(func() {
@@ -101,8 +118,9 @@ var _ = Describe("Backend container", Ordered, func() {
 
 		modelURL := os.Getenv("BACKEND_TEST_MODEL_URL")
 		modelFile = os.Getenv("BACKEND_TEST_MODEL_FILE")
-		Expect(modelURL != "" || modelFile != "").To(BeTrue(),
-			"one of BACKEND_TEST_MODEL_URL or BACKEND_TEST_MODEL_FILE must be set")
+		modelName = os.Getenv("BACKEND_TEST_MODEL_NAME")
+		Expect(modelURL != "" || modelFile != "" || modelName != "").To(BeTrue(),
+			"one of BACKEND_TEST_MODEL_URL, BACKEND_TEST_MODEL_FILE, or BACKEND_TEST_MODEL_NAME must be set")
 
 		caps = parseCaps()
 		GinkgoWriter.Printf("Testing image=%q with capabilities=%v\n", image, keys(caps))
@@ -112,6 +130,15 @@ var _ = Describe("Backend container", Ordered, func() {
 			prompt = defaultPrompt
 		}
 
+		if raw := strings.TrimSpace(os.Getenv("BACKEND_TEST_OPTIONS")); raw != "" {
+			for _, opt := range strings.Split(raw, ",") {
+				opt = strings.TrimSpace(opt)
+				if opt != "" {
+					options = append(options, opt)
+				}
+			}
+		}
+
 		var err error
 		workDir, err = os.MkdirTemp("", "backend-e2e-*")
 		Expect(err).NotTo(HaveOccurred())
@@ -122,8 +149,8 @@ var _ = Describe("Backend container", Ordered, func() {
 		extractImage(image, binaryDir)
 		Expect(filepath.Join(binaryDir, "run.sh")).To(BeAnExistingFile())
 
-		// Download the model once if not provided.
-		if modelFile == "" {
+		// Download the model once if not provided and no HF name given.
+		if modelFile == "" && modelName == "" {
 			modelFile = filepath.Join(workDir, "model.bin")
 			downloadFile(modelURL, modelFile)
 		}
@@ -196,16 +223,27 @@ var _ = Describe("Backend container", Ordered, func() {
 		ctxSize := envInt32("BACKEND_TEST_CTX_SIZE", 512)
 		threads := envInt32("BACKEND_TEST_THREADS", 4)
 
-		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+		// Prefer a HuggingFace model id when provided (e.g. for vllm);
+		// otherwise fall back to a downloaded/local file path.
+		modelRef := modelFile
+		var modelPath string
+		if modelName != "" {
+			modelRef = modelName
+		} else {
+			modelPath = modelFile
+		}
+
+		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
 		defer cancel()
 		res, err := client.LoadModel(ctx, &pb.ModelOptions{
-			Model:       modelFile,
-			ModelFile:   modelFile,
+			Model:       modelRef,
+			ModelFile:   modelPath,
 			ContextSize: ctxSize,
 			Threads:     threads,
 			NGPULayers:  0,
 			MMap:        true,
 			NBatch:      128,
+			Options:     options,
 		})
 		Expect(err).NotTo(HaveOccurred())
 		Expect(res.GetSuccess()).To(BeTrue(), "LoadModel failed: %s", res.GetMessage())
@@ -275,6 +313,78 @@ var _ = Describe("Backend container", Ordered, func() {
 		Expect(res.GetEmbeddings()).NotTo(BeEmpty(), "Embedding returned empty vector")
 		GinkgoWriter.Printf("Embedding: %d dims\n", len(res.GetEmbeddings()))
 	})
+
+	It("extracts tool calls into ChatDelta", func() {
+		if !caps[capTools] {
+			Skip("tools capability not enabled")
+		}
+
+		toolPrompt := os.Getenv("BACKEND_TEST_TOOL_PROMPT")
+		if toolPrompt == "" {
+			toolPrompt = defaultToolPrompt
+		}
+		toolName := os.Getenv("BACKEND_TEST_TOOL_NAME")
+		if toolName == "" {
+			toolName = defaultToolName
+		}
+
+		toolsJSON := fmt.Sprintf(`[{
+			"type": "function",
+			"function": {
+				"name": %q,
+				"description": "Get the current weather for a location",
+				"parameters": {
+					"type": "object",
+					"properties": {
+						"location": {
+							"type": "string",
+							"description": "The city and state, e.g. San Francisco, CA"
+						}
+					},
+					"required": ["location"]
+				}
+			}
+		}]`, toolName)
+
+		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+		defer cancel()
+		res, err := client.Predict(ctx, &pb.PredictOptions{
+			Messages: []*pb.Message{
+				{Role: "system", Content: "You are a helpful assistant. Use the provided tool when the user asks about weather."},
+				{Role: "user", Content: toolPrompt},
+			},
+			Tools:                toolsJSON,
+			ToolChoice:           "auto",
+			UseTokenizerTemplate: true,
+			Tokens:               200,
+			Temperature:          0.1,
+		})
+		Expect(err).NotTo(HaveOccurred())
+
+		// Collect tool calls from every delta — some backends emit a single
+		// final delta, others stream incremental pieces in one Reply.
+		var toolCalls []*pb.ToolCallDelta
+		for _, delta := range res.GetChatDeltas() {
+			toolCalls = append(toolCalls, delta.GetToolCalls()...)
+		}
+
+		GinkgoWriter.Printf("Tool call: raw=%q deltas=%d tool_calls=%d\n",
+			string(res.GetMessage()), len(res.GetChatDeltas()), len(toolCalls))
+
+		Expect(toolCalls).NotTo(BeEmpty(),
+			"Predict did not return any ToolCallDelta. raw=%q", string(res.GetMessage()))
+
+		matched := false
+		for _, tc := range toolCalls {
+			GinkgoWriter.Printf("  - idx=%d id=%q name=%q args=%q\n",
+				tc.GetIndex(), tc.GetId(), tc.GetName(), tc.GetArguments())
+			if tc.GetName() == toolName {
+				matched = true
+			}
+		}
+		Expect(matched).To(BeTrue(),
+			"Expected a tool call named %q in ChatDelta.tool_calls", toolName)
+	})
 })
 
 // extractImage runs `docker create` + `docker export` to materialise the image