diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml
index d89ee06bf..12dcc85f1 100644
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
@@ -53,6 +53,19 @@ jobs:
             dockerfile: "./backend/Dockerfile.python"
             context: "./"
             ubuntu-version: '2204'
+          - build-type: ''  # CPU variant of the vllm backend (tag suffix -cpu-vllm)
+            cuda-major-version: ""
+            cuda-minor-version: ""
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-cpu-vllm'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'true'
+            backend: "vllm"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./"
+            ubuntu-version: '2404'
           - build-type: ''
             cuda-major-version: ""
             cuda-minor-version: ""
diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml
index 6b590d156..afeebea82 100644
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -31,6 +31,7 @@ jobs:
       llama-cpp-quantization: ${{ steps.detect.outputs.llama-cpp-quantization }}
       llama-cpp: ${{ steps.detect.outputs.llama-cpp }}
       ik-llama-cpp: ${{ steps.detect.outputs.ik-llama-cpp }}
+      vllm: ${{ steps.detect.outputs.vllm }}
       acestep-cpp: ${{ steps.detect.outputs.acestep-cpp }}
       qwen3-tts-cpp: ${{ steps.detect.outputs.qwen3-tts-cpp }}
       voxtral: ${{ steps.detect.outputs.voxtral }}
@@ -501,6 +502,52 @@ jobs:
       - name: Build ik-llama-cpp backend image and run gRPC e2e tests
         run: |
           make test-extra-backend-ik-llama-cpp
+  # tests-vllm-grpc is currently disabled in CI.
+  #
+  # The prebuilt vllm CPU wheel is compiled with AVX-512 VNNI/BF16
+  # instructions, and neither ubuntu-latest nor the bigger-runner pool
+  # offers a stable CPU baseline that supports them — runners come
+  # back with different hardware between runs and SIGILL on import of
+  # vllm.model_executor.models.registry. Compiling vllm from source
+  # via FROM_SOURCE=true works on any CPU but takes 30-50 minutes per
+  # run, which is too slow for a smoke test.
+  #
+  # The test itself (tests/e2e-backends + make test-extra-backend-vllm)
+  # is fully working and validated locally on a host with the right
+  # SIMD baseline. Run it manually with:
+  #
+  #   make test-extra-backend-vllm
+  #
+  # Re-enable this job once we have a self-hosted runner label with
+  # guaranteed AVX-512 VNNI/BF16 support, or once the vllm project
+  # publishes a CPU wheel with a wider baseline.
+  #
+  # tests-vllm-grpc:
+  #   needs: detect-changes
+  #   if: needs.detect-changes.outputs.vllm == 'true' || needs.detect-changes.outputs.run-all == 'true'
+  #   runs-on: bigger-runner
+  #   timeout-minutes: 90
+  #   steps:
+  #     - name: Clone
+  #       uses: actions/checkout@v6
+  #       with:
+  #         submodules: true
+  #     - name: Dependencies
+  #       run: |
+  #         sudo apt-get update
+  #         sudo apt-get install -y --no-install-recommends \
+  #             make build-essential curl unzip ca-certificates git tar
+  #     - name: Setup Go
+  #       uses: actions/setup-go@v5
+  #       with:
+  #         go-version: '1.25.4'
+  #     - name: Free disk space
+  #       run: |
+  #         sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /opt/hostedtoolcache/CodeQL || true
+  #         df -h
+  #     - name: Build vllm (cpu) backend image and run gRPC e2e tests
+  #       run: |
+  #         make test-extra-backend-vllm
   tests-acestep-cpp:
     needs: detect-changes
     if: needs.detect-changes.outputs.acestep-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true'
diff --git a/Makefile b/Makefile
index 6dce83efd..7e2e35052 100644
--- a/Makefile
+++ b/Makefile
@@ -466,8 +466,14 @@ test-extra: prepare-test-extra
 ##   BACKEND_IMAGE            Required. Docker image to test, e.g. local-ai-backend:llama-cpp.
 ##   BACKEND_TEST_MODEL_URL   URL of a model file to download and load.
 ##   BACKEND_TEST_MODEL_FILE  Path to an already-downloaded model (skips download).
+##   BACKEND_TEST_MODEL_NAME  HuggingFace repo id (e.g. Qwen/Qwen2.5-0.5B-Instruct).
+##                            Use this instead of MODEL_URL for backends that
+##                            resolve HF model ids natively (vllm, vllm-omni).
 ##   BACKEND_TEST_CAPS        Comma-separated capabilities, default "health,load,predict,stream".
+##                            Add "tools" to also exercise ChatDelta tool-call extraction.
 ##   BACKEND_TEST_PROMPT      Override the prompt used in predict/stream specs.
+##   BACKEND_TEST_OPTIONS     Comma-separated Options[] entries forwarded to LoadModel,
+##                            e.g. "tool_parser:hermes,reasoning_parser:qwen3".
 ##
 ## Direct usage (image already built, no docker-build-* dependency):
 ##
@@ -486,9 +492,13 @@ test-extra-backend: protogen-go
 	BACKEND_IMAGE="$$BACKEND_IMAGE" \
 	BACKEND_TEST_MODEL_URL="$${BACKEND_TEST_MODEL_URL:-$(BACKEND_TEST_MODEL_URL)}" \
 	BACKEND_TEST_MODEL_FILE="$$BACKEND_TEST_MODEL_FILE" \
+	BACKEND_TEST_MODEL_NAME="$$BACKEND_TEST_MODEL_NAME" \
 	BACKEND_TEST_CAPS="$$BACKEND_TEST_CAPS" \
 	BACKEND_TEST_PROMPT="$$BACKEND_TEST_PROMPT" \
-	go test -v -timeout 15m ./tests/e2e-backends/...
+	BACKEND_TEST_OPTIONS="$$BACKEND_TEST_OPTIONS" \
+	BACKEND_TEST_TOOL_PROMPT="$$BACKEND_TEST_TOOL_PROMPT" \
+	BACKEND_TEST_TOOL_NAME="$$BACKEND_TEST_TOOL_NAME" \
+	go test -v -timeout 30m ./tests/e2e-backends/...
 
 ## Convenience wrappers: build the image, then exercise it.
 test-extra-backend-llama-cpp: docker-build-llama-cpp
@@ -497,6 +507,18 @@ test-extra-backend-llama-cpp: docker-build-llama-cpp
 test-extra-backend-ik-llama-cpp: docker-build-ik-llama-cpp
 	BACKEND_IMAGE=local-ai-backend:ik-llama-cpp $(MAKE) test-extra-backend
 
+## The vllm test model is resolved from a HuggingFace model id (no file
+## download); the run exercises Predict + streaming + tool-call extraction via
+## the hermes parser. Requires a host CPU with the SIMD instructions the
+## prebuilt vllm CPU wheel was compiled against (AVX-512 VNNI/BF16); CPUs without
+## them SIGILL on import, and no CI runner label currently guarantees them.
+test-extra-backend-vllm: docker-build-vllm
+	BACKEND_IMAGE=local-ai-backend:vllm \
+	BACKEND_TEST_MODEL_NAME=Qwen/Qwen2.5-0.5B-Instruct \
+	BACKEND_TEST_CAPS=health,load,predict,stream,tools \
+	BACKEND_TEST_OPTIONS=tool_parser:hermes \
+	$(MAKE) test-extra-backend
+
 DOCKER_IMAGE?=local-ai
 IMAGE_TYPE?=core
 BASE_IMAGE?=ubuntu:24.04
@@ -650,6 +672,7 @@ define docker-build-backend
 		--build-arg CUDA_MINOR_VERSION=$(CUDA_MINOR_VERSION) \
 		--build-arg UBUNTU_VERSION=$(UBUNTU_VERSION) \
 		--build-arg UBUNTU_CODENAME=$(UBUNTU_CODENAME) \
+		$(if $(FROM_SOURCE),--build-arg FROM_SOURCE=$(FROM_SOURCE)) \
 		$(if $(filter true,$(5)),--build-arg BACKEND=$(1)) \
 		-t local-ai-backend:$(1) -f backend/Dockerfile.$(2) $(3)
 endef
diff --git a/backend/Dockerfile.python b/backend/Dockerfile.python
index 5d2e6171e..f3bcf8d34 100644
--- a/backend/Dockerfile.python
+++ b/backend/Dockerfile.python
@@ -29,6 +29,7 @@ RUN apt-get update && \
         curl python3-pip \
         python-is-python3 \
         python3-dev llvm \
+        libnuma1 libgomp1 \
         python3-venv make cmake && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
@@ -195,6 +196,12 @@ COPY backend/backend.proto /${BACKEND}/backend.proto
 COPY backend/python/common/ /${BACKEND}/common
 COPY scripts/build/package-gpu-libs.sh /package-gpu-libs.sh
 
+# Optional per-backend source build toggle (e.g. vllm on CPU can set
+# FROM_SOURCE=true to compile against the build host SIMD instead of
+# pulling a prebuilt wheel). Default empty — most backends ignore it.
+ARG FROM_SOURCE=""
+ENV FROM_SOURCE=${FROM_SOURCE}
+
 RUN cd /${BACKEND} && PORTABLE_PYTHON=true make
 
 # Package GPU libraries into the backend's lib directory
diff --git a/backend/index.yaml b/backend/index.yaml
index a1f5688a8..d0f75a4ca 100644
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -197,6 +197,7 @@
     amd: "rocm-vllm"
     intel: "intel-vllm"
     nvidia-cuda-12: "cuda12-vllm"
+    cpu: "cpu-vllm"
 - &vllm-omni
   name: "vllm-omni"
   license: apache-2.0
@@ -1563,6 +1564,7 @@
     nvidia: "cuda12-vllm-development"
     amd: "rocm-vllm-development"
     intel: "intel-vllm-development"
+    cpu: "cpu-vllm-development"
 - !!merge <<: *vllm
   name: "cuda12-vllm"
   uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-vllm"
@@ -1578,6 +1580,11 @@
   uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-vllm"
   mirrors:
     - localai/localai-backends:latest-gpu-intel-vllm
+- !!merge <<: *vllm
+  name: "cpu-vllm"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-vllm"
+  mirrors:
+    - localai/localai-backends:latest-cpu-vllm
 - !!merge <<: *vllm
   name: "cuda12-vllm-development"
   uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-vllm"
@@ -1593,6 +1600,11 @@
   uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-vllm"
   mirrors:
     - localai/localai-backends:master-gpu-intel-vllm
+- !!merge <<: *vllm
+  name: "cpu-vllm-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-vllm"
+  mirrors:
+    - localai/localai-backends:master-cpu-vllm
 # vllm-omni
 - !!merge <<: *vllm-omni
   name: "vllm-omni-development"
diff --git a/backend/python/common/vllm_utils.py b/backend/python/common/vllm_utils.py
new file mode 100644
index 000000000..bc0518663
--- /dev/null
+++ b/backend/python/common/vllm_utils.py
@@ -0,0 +1,84 @@
+"""Shared utilities for vLLM-based backends."""
+import json
+import sys
+
+
+def parse_options(options_list):
+    """Turn a list of "key:value" option strings into a dict.
+
+    Entries lacking ":" are skipped; only the first ":" splits and both
+    sides are stripped. Values are coerced to bool, int, float, else str.
+    """
+    parsed = {}
+    for entry in options_list:
+        name, sep, raw = entry.partition(":")
+        if not sep:
+            continue
+        name, raw = name.strip(), raw.strip()
+        lowered = raw.lower()
+        if lowered in ("true", "false"):
+            parsed[name] = lowered == "true"
+            continue
+        for cast in (int, float):
+            try:
+                parsed[name] = cast(raw)
+                break
+            except ValueError:
+                continue
+        else:
+            parsed[name] = raw
+    return parsed
+
+
+def messages_to_dicts(proto_messages):
+    """Build the message dicts that apply_chat_template() consumes.
+
+    name, tool_call_id and reasoning_content are copied only when truthy;
+    tool_calls is JSON-decoded and silently omitted when it is not valid JSON.
+    """
+    converted = []
+    for message in proto_messages:
+        entry = {"role": message.role, "content": message.content or ""}
+        for field in ("name", "tool_call_id", "reasoning_content"):
+            field_value = getattr(message, field)
+            if field_value:
+                entry[field] = field_value
+        raw_calls = message.tool_calls
+        if raw_calls:
+            try:
+                entry["tool_calls"] = json.loads(raw_calls)
+            except json.JSONDecodeError:
+                pass
+        converted.append(entry)
+    return converted
+
+
+def setup_parsers(opts):
+    """Look up the vLLM tool and reasoning parser classes named in opts.
+
+    Reads opts["tool_parser"] and opts["reasoning_parser"]. vLLM is imported
+    lazily, so a missing install or unknown parser name is logged to stderr
+    and yields None for that slot instead of raising.
+    Returns a (tool_parser_cls, reasoning_parser_cls) tuple.
+    """
+    resolved = {"tool": None, "reasoning": None}
+
+    wanted_tool = opts.get("tool_parser")
+    if wanted_tool:
+        try:
+            from vllm.tool_parsers import ToolParserManager as tool_registry
+            resolved["tool"] = tool_registry.get_tool_parser(wanted_tool)
+            print(f"[vllm_utils] Loaded tool_parser: {wanted_tool}", file=sys.stderr)
+        except Exception as exc:
+            print(f"[vllm_utils] Failed to load tool_parser {wanted_tool}: {exc}", file=sys.stderr)
+
+    wanted_reasoning = opts.get("reasoning_parser")
+    if wanted_reasoning:
+        try:
+            from vllm.reasoning import ReasoningParserManager as reasoning_registry
+            resolved["reasoning"] = reasoning_registry.get_reasoning_parser(wanted_reasoning)
+            print(f"[vllm_utils] Loaded reasoning_parser: {wanted_reasoning}", file=sys.stderr)
+        except Exception as exc:
+            print(f"[vllm_utils] Failed to load reasoning_parser {wanted_reasoning}: {exc}", file=sys.stderr)
+
+    return resolved["tool"], resolved["reasoning"]
diff --git a/backend/python/vllm-omni/backend.py b/backend/python/vllm-omni/backend.py
index 96eb8a111..646af2a2e 100644
--- a/backend/python/vllm-omni/backend.py
+++ b/backend/python/vllm-omni/backend.py
@@ -17,6 +17,8 @@ import time
 import os
 import base64
 import io
+import json
+import gc
 
 from PIL import Image
 import torch
@@ -30,6 +32,7 @@ import grpc
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
 from grpc_auth import get_auth_interceptors
+from vllm_utils import parse_options, messages_to_dicts, setup_parsers
 
 
 from vllm_omni.entrypoints.omni import Omni
@@ -148,23 +151,20 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 
     def LoadModel(self, request, context):
         try:
+            # CPU detection: if no CUDA, default vLLM target device to CPU.
+            try:
+                if not torch.cuda.is_available():
+                    os.environ.setdefault("VLLM_TARGET_DEVICE", "cpu")
+                    os.environ.setdefault("VLLM_CPU_KVCACHE_SPACE", "4")
+            except Exception:
+                pass
+
             print(f"Loading model {request.Model}...", file=sys.stderr)
             print(f"Request {request}", file=sys.stderr)
 
-            # Parse options from request.Options (key:value pairs)
-            self.options = {}
-            for opt in request.Options:
-                if ":" not in opt:
-                    continue
-                key, value = opt.split(":", 1)
-                # Convert value to appropriate type
-                if is_float(value):
-                    value = float(value)
-                elif is_int(value):
-                    value = int(value)
-                elif value.lower() in ["true", "false"]:
-                    value = value.lower() == "true"
-                self.options[key] = value
+            # Parse options from request.Options using shared helper
+            self.options = parse_options(request.Options)
+            opts = self.options
 
             print(f"Options: {self.options}", file=sys.stderr)
 
@@ -244,6 +244,24 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                     omni_kwargs["max_model_len"] = request.MaxModelLen
 
             self.omni = Omni(**omni_kwargs)
+
+            # Load tokenizer for LLM/TTS so chat templates work
+            if self.model_type in ("llm", "tts"):
+                try:
+                    from vllm.transformers_utils.tokenizer import get_tokenizer
+                    self.tokenizer = get_tokenizer(
+                        request.Model,
+                        trust_remote_code=opts.get("trust_remote_code", False),
+                    )
+                except Exception as e:
+                    print(f"Failed to load tokenizer: {e}", file=sys.stderr)
+                    self.tokenizer = None
+            else:
+                self.tokenizer = None
+
+            # Setup optional tool / reasoning parsers
+            self.tool_parser_cls, self.reasoning_parser_cls = setup_parsers(opts)
+
             print("Model loaded successfully", file=sys.stderr)
             return backend_pb2.Result(message="Model loaded successfully", success=True)
 
@@ -466,14 +484,32 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             # Extract prompt
             if request.Prompt:
                 prompt = request.Prompt
-            elif request.Messages and request.UseTokenizerTemplate:
-                # Build prompt from messages (simplified - would need tokenizer for full template)
-                prompt = ""
-                for msg in request.Messages:
-                    role = msg.role
-                    content = msg.content
-                    prompt += f"<|im_start|>{role}\n{content}<|im_end|>\n"
-                prompt += "<|im_start|>assistant\n"
+            elif request.Messages:
+                if getattr(self, "tokenizer", None) is not None:
+                    messages_dicts = messages_to_dicts(request.Messages)
+                    template_kwargs = {"tokenize": False, "add_generation_prompt": True}
+                    if request.Tools:
+                        try:
+                            template_kwargs["tools"] = json.loads(request.Tools)
+                        except json.JSONDecodeError:
+                            pass
+                    try:
+                        if request.Metadata.get("enable_thinking", "").lower() == "true":
+                            template_kwargs["enable_thinking"] = True
+                    except Exception:
+                        pass
+                    try:
+                        prompt = self.tokenizer.apply_chat_template(messages_dicts, **template_kwargs)
+                    except TypeError:
+                        prompt = self.tokenizer.apply_chat_template(
+                            messages_dicts, tokenize=False, add_generation_prompt=True
+                        )
+                else:
+                    # Fallback: basic template
+                    prompt = ""
+                    for msg in request.Messages:
+                        prompt += f"<|im_start|>{msg.role}\n{msg.content}<|im_end|>\n"
+                    prompt += "<|im_start|>assistant\n"
             else:
                 yield backend_pb2.Reply(message=bytes("", 'utf-8'))
                 return
@@ -539,20 +575,79 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             # Call omni.generate() (returns generator for LLM mode)
             omni_generator = self.omni.generate([inputs], sampling_params_list)
 
-            # Extract text from outputs
+            # Extract text from outputs and track token usage
             generated_text = ""
+            prompt_tokens = 0
+            completion_tokens = 0
             for stage_outputs in omni_generator:
                 if stage_outputs.final_output_type == "text":
                     for output in stage_outputs.request_output:
-                        text_output = output.outputs[0].text
+                        completion = output.outputs[0]
+                        text_output = completion.text
+                        # Track tokens when available
+                        try:
+                            if getattr(output, "prompt_token_ids", None) is not None:
+                                prompt_tokens = len(output.prompt_token_ids)
+                            if getattr(completion, "token_ids", None) is not None:
+                                completion_tokens = len(completion.token_ids)
+                        except Exception:
+                            pass
                         if streaming:
                             # Remove already sent text (vllm concatenates)
                             delta_text = text_output.removeprefix(generated_text)
-                            yield backend_pb2.Reply(message=bytes(delta_text, encoding='utf-8'))
+                            yield backend_pb2.Reply(
+                                message=bytes(delta_text, encoding='utf-8'),
+                                tokens=completion_tokens,
+                                prompt_tokens=prompt_tokens,
+                            )
                         generated_text = text_output
 
             if not streaming:
-                yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
+                # Build optional ChatDelta with parsed reasoning / tool calls
+                chat_deltas = []
+                content_text = generated_text
+                reasoning_text = ""
+                tool_call_deltas = []
+
+                if self.reasoning_parser_cls is not None:
+                    try:
+                        parser = self.reasoning_parser_cls(self.tokenizer) if self.tokenizer else self.reasoning_parser_cls()
+                        reasoning_text, content_text = parser.extract_reasoning_content(content_text, request=None)
+                        reasoning_text = reasoning_text or ""
+                        content_text = content_text or ""
+                    except Exception as e:
+                        print(f"reasoning_parser failed: {e}", file=sys.stderr)
+
+                if self.tool_parser_cls is not None:
+                    try:
+                        parser = self.tool_parser_cls(self.tokenizer) if self.tokenizer else self.tool_parser_cls()
+                        tool_info = parser.extract_tool_calls(content_text, request=None)
+                        if getattr(tool_info, "tools_called", False):
+                            content_text = tool_info.content or ""
+                            for tc in tool_info.tool_calls or []:
+                                fn = getattr(tc, "function", None)
+                                tool_call_deltas.append(backend_pb2.ToolCallDelta(
+                                    index=getattr(tc, "index", 0) or 0,
+                                    id=getattr(tc, "id", "") or "",
+                                    name=getattr(fn, "name", "") if fn else "",
+                                    arguments=getattr(fn, "arguments", "") if fn else "",
+                                ))
+                    except Exception as e:
+                        print(f"tool_parser failed: {e}", file=sys.stderr)
+
+                if self.tool_parser_cls is not None or self.reasoning_parser_cls is not None:
+                    chat_deltas.append(backend_pb2.ChatDelta(
+                        content=content_text,
+                        reasoning_content=reasoning_text,
+                        tool_calls=tool_call_deltas,
+                    ))
+
+                yield backend_pb2.Reply(
+                    message=bytes(generated_text, encoding='utf-8'),
+                    tokens=completion_tokens,
+                    prompt_tokens=prompt_tokens,
+                    chat_deltas=chat_deltas,
+                )
 
         except Exception as err:
             print(f"Error in Predict: {err}", file=sys.stderr)
@@ -647,6 +742,37 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             traceback.print_exc()
             return backend_pb2.Result(success=False, message=f"Error generating TTS: {err}")
 
+    def TokenizeString(self, request, context):
+        """Encode request.Prompt; on failure set a gRPC status and return an empty response."""
+        code, details = grpc.StatusCode.FAILED_PRECONDITION, "Model/tokenizer not loaded"
+        if getattr(self, "tokenizer", None) is not None:
+            try:
+                ids = self.tokenizer.encode(request.Prompt)
+                return backend_pb2.TokenizationResponse(length=len(ids), tokens=ids)
+            except Exception as err:
+                code, details = grpc.StatusCode.INTERNAL, str(err)
+        context.set_code(code)
+        context.set_details(details)
+        return backend_pb2.TokenizationResponse()
+
+    def Free(self, request, context):
+        """Drop the engine and tokenizer, collect garbage, and flush the CUDA cache if any."""
+        try:
+            for attr in ("omni", "tokenizer"):
+                if hasattr(self, attr):
+                    delattr(self, attr)
+            self.tool_parser_cls = self.reasoning_parser_cls = None
+            gc.collect()
+            try:
+                # Best effort: a failing CUDA probe or flush must not fail the call.
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+            except Exception:
+                pass
+            return backend_pb2.Result(success=True, message="Model freed")
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=str(err))
+
 
 def serve(address):
     server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
index 07323c424..95ae95a9d 100644
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -5,6 +5,9 @@ import argparse
 import signal
 import sys
 import os
+import json
+import time
+import gc
 from typing import List
 from PIL import Image
 
@@ -26,6 +29,25 @@ from vllm.assets.video import VideoAsset
 import base64
 import io
 
+# Version-compat imports — wrap in try/except for older vLLM versions
+try:
+    from vllm.tool_parsers import ToolParserManager
+    HAS_TOOL_PARSERS = True
+except ImportError:
+    HAS_TOOL_PARSERS = False
+
+try:
+    from vllm.reasoning import ReasoningParserManager
+    HAS_REASONING_PARSERS = True
+except ImportError:
+    HAS_REASONING_PARSERS = False
+
+try:
+    from vllm.sampling_params import GuidedDecodingParams
+    HAS_GUIDED_DECODING = True
+except ImportError:
+    HAS_GUIDED_DECODING = False
+
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 
 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
@@ -69,6 +91,35 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                 break
         return decoded_text
 
+    def _parse_options(self, options_list):
+        """Map "key:value" option strings to a dict of stripped strings.
+
+        Entries without ":" are ignored and only the first ":" splits, so
+        values may themselves contain colons. Values are never type-coerced.
+        """
+        # Later duplicates of a key overwrite earlier ones.
+        pairs = (entry.split(":", 1) for entry in options_list if ":" in entry)
+        return {name.strip(): raw.strip() for name, raw in pairs}
+
+    def _messages_to_dicts(self, messages):
+        """Convert proto Messages into the dict form apply_chat_template() takes.
+
+        Empty optional fields are left out; tool_calls that fail to parse as JSON are dropped.
+        """
+        def as_dict(msg):
+            item = {"role": msg.role, "content": msg.content or ""}
+            optional = (("name", msg.name), ("tool_call_id", msg.tool_call_id),
+                        ("reasoning_content", msg.reasoning_content))
+            item.update({key: val for key, val in optional if val})
+            if msg.tool_calls:
+                try:
+                    item["tool_calls"] = json.loads(msg.tool_calls)
+                except json.JSONDecodeError:
+                    pass
+            return item
+
+        return [as_dict(msg) for msg in messages]
+
     def Health(self, request, context):
         """
         Returns a health check message.
@@ -132,15 +183,49 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
 
         try:
-           engine_model_config = await self.llm.get_model_config()
-           self.tokenizer = get_tokenizer(
-               engine_model_config.tokenizer,
-               tokenizer_mode=engine_model_config.tokenizer_mode,
-               trust_remote_code=engine_model_config.trust_remote_code,
-               truncation_side="left",
-           )
+            # vLLM >= 0.14 removed get_model_config() on AsyncLLM; the tokenizer
+            # is either already loaded on the engine or can be built from the
+            # Model name directly.
+            tokenizer = None
+            if hasattr(self.llm, "get_tokenizer"):
+                try:
+                    tokenizer = await self.llm.get_tokenizer()
+                except TypeError:
+                    tokenizer = self.llm.get_tokenizer()
+                except Exception:
+                    tokenizer = None
+            if tokenizer is None and hasattr(self.llm, "tokenizer"):
+                tokenizer = self.llm.tokenizer
+            if tokenizer is None:
+                tokenizer = get_tokenizer(
+                    request.Model,
+                    trust_remote_code=bool(request.TrustRemoteCode),
+                    truncation_side="left",
+                )
+            self.tokenizer = tokenizer
         except Exception as err:
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+
+        # Parse options for parser selection
+        opts = self._parse_options(request.Options)
+
+        # Instantiate tool/reasoning parser classes (they'll be instantiated per-request with tokenizer)
+        self.tool_parser_cls = None
+        self.reasoning_parser_cls = None
+        if HAS_TOOL_PARSERS and opts.get("tool_parser"):
+            try:
+                self.tool_parser_cls = ToolParserManager.get_tool_parser(opts["tool_parser"])
+                print(f"Loaded tool_parser: {opts['tool_parser']}", file=sys.stderr)
+            except Exception as e:
+                print(f"Failed to load tool_parser {opts.get('tool_parser')}: {e}", file=sys.stderr)
+
+        if HAS_REASONING_PARSERS and opts.get("reasoning_parser"):
+            try:
+                self.reasoning_parser_cls = ReasoningParserManager.get_reasoning_parser(opts["reasoning_parser"])
+                print(f"Loaded reasoning_parser: {opts['reasoning_parser']}", file=sys.stderr)
+            except Exception as e:
+                print(f"Failed to load reasoning_parser {opts.get('reasoning_parser')}: {e}", file=sys.stderr)
+
         print("Model loaded successfully", file=sys.stderr)
         return backend_pb2.Result(message="Model loaded successfully", success=True)
 
@@ -197,6 +282,38 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         finally:
             await iterations.aclose()
 
+    async def TokenizeString(self, request, context):
+        """Tokenize request.Prompt; failures set a status on the gRPC context and return empty."""
+        status, reason = grpc.StatusCode.FAILED_PRECONDITION, "Model/tokenizer not loaded"
+        if getattr(self, "tokenizer", None) is not None:
+            try:
+                token_ids = self.tokenizer.encode(request.Prompt)
+                return backend_pb2.TokenizationResponse(length=len(token_ids), tokens=token_ids)
+            except Exception as exc:
+                status, reason = grpc.StatusCode.INTERNAL, str(exc)
+        context.set_code(status)
+        context.set_details(reason)
+        return backend_pb2.TokenizationResponse()
+
+    async def Free(self, request, context):
+        """Release the engine and tokenizer; returns a Result reporting success or the error."""
+        try:
+            for attr in ("llm", "tokenizer"):
+                if hasattr(self, attr):
+                    delattr(self, attr)
+            self.tool_parser_cls = self.reasoning_parser_cls = None
+            gc.collect()
+            try:
+                import torch
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+            except Exception:
+                # Best effort: the model is already released, so a CUDA error must not fail Free.
+                pass
+            return backend_pb2.Result(success=True, message="Model freed")
+        except Exception as e:
+            return backend_pb2.Result(success=False, message=str(e))
+
     async def _predict(self, request, context, streaming=False):
         # Build the sampling parameters
         # NOTE: this must stay in sync with the vllm backend
@@ -222,7 +339,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             "SkipSpecialTokens": "skip_special_tokens",
             "SpacesBetweenSpecialTokens": "spaces_between_special_tokens",
             "TruncatePromptTokens": "truncate_prompt_tokens",
-            "GuidedDecoding": "guided_decoding",
         }
 
         sampling_params = SamplingParams(top_p=0.9, max_tokens=200)
@@ -233,6 +349,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                 if value not in (None, 0, [], False):
                     setattr(sampling_params, param_field, value)
 
+        # Guided decoding: use Grammar field to pass JSON schema or BNF
+        if HAS_GUIDED_DECODING and request.Grammar:
+            try:
+                json.loads(request.Grammar)  # valid JSON = JSON schema
+                sampling_params.guided_decoding = GuidedDecodingParams(json=request.Grammar)
+            except json.JSONDecodeError:
+                sampling_params.guided_decoding = GuidedDecodingParams(grammar=request.Grammar)
+
         # Extract image paths and process images
         prompt = request.Prompt
 
@@ -244,7 +368,27 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 
         # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template
         if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
-            prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
+            messages_dicts = self._messages_to_dicts(request.Messages)
+            template_kwargs = {"tokenize": False, "add_generation_prompt": True}
+
+            # Pass tools for tool calling
+            if request.Tools:
+                try:
+                    template_kwargs["tools"] = json.loads(request.Tools)
+                except json.JSONDecodeError:
+                    pass
+
+            # Enable thinking mode if requested
+            if request.Metadata.get("enable_thinking", "").lower() == "true":
+                template_kwargs["enable_thinking"] = True
+
+            try:
+                prompt = self.tokenizer.apply_chat_template(messages_dicts, **template_kwargs)
+            except TypeError:
+                # Some tokenizers don't support tools/enable_thinking kwargs — retry without them
+                prompt = self.tokenizer.apply_chat_template(
+                    messages_dicts, tokenize=False, add_generation_prompt=True
+                )
 
         # Generate text using the LLM engine
         request_id = random_uuid()
@@ -265,25 +409,26 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 
         # Stream the results
         generated_text = ""
+        last_output = None
         try:
             async for request_output in outputs:
                 iteration_text = request_output.outputs[0].text
+                last_output = request_output
 
                 if streaming:
                     # Remove text already sent as vllm concatenates the text from previous yields
                     delta_iteration_text = iteration_text.removeprefix(generated_text)
                     # Send the partial result
-                    yield backend_pb2.Reply(message=bytes(delta_iteration_text, encoding='utf-8'))
+                    yield backend_pb2.Reply(
+                        message=bytes(delta_iteration_text, encoding='utf-8'),
+                        chat_deltas=[backend_pb2.ChatDelta(content=delta_iteration_text)],
+                    )
 
                 # Keep track of text generated
                 generated_text = iteration_text
         finally:
             await outputs.aclose()
 
-        # If streaming, we already sent everything
-        if streaming:
-            return
-
         # Remove the image files from /tmp folder
         for img_path in image_paths:
             try:
@@ -291,8 +436,99 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             except Exception as e:
                 print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
 
-        # Sending the final generated text
-        yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
+        # Parse reasoning and tool calls from final text using vLLM's native parsers
+        content = generated_text
+        reasoning_content = ""
+        tool_calls_proto = []
+
+        if self.reasoning_parser_cls:
+            try:
+                rp = self.reasoning_parser_cls(self.tokenizer)
+                r, c = rp.extract_reasoning(generated_text, request=None)
+                reasoning_content = r or ""
+                content = c if c is not None else generated_text
+            except Exception as e:
+                print(f"Reasoning parser error: {e}", file=sys.stderr)
+
+        if self.tool_parser_cls and request.Tools:
+            try:
+                tools = json.loads(request.Tools)
+                # Some concrete parsers only accept the tokenizer; only the
+                # abstract base declares the tools kwarg. Try with tools first,
+                # fall back to tokenizer-only.
+                try:
+                    tp = self.tool_parser_cls(self.tokenizer, tools=tools)
+                except TypeError:
+                    tp = self.tool_parser_cls(self.tokenizer)
+                info = tp.extract_tool_calls(content, request=None)
+                if info.tools_called:
+                    content = info.content or ""
+                    for i, tc in enumerate(info.tool_calls):
+                        tool_calls_proto.append(backend_pb2.ToolCallDelta(
+                            index=i,
+                            id=tc.id,
+                            name=tc.function.name,
+                            arguments=tc.function.arguments,
+                        ))
+            except Exception as e:
+                print(f"Tool parser error: {e}", file=sys.stderr)
+
+        # Extract token counts
+        prompt_tokens = 0
+        completion_tokens = 0
+        if last_output is not None:
+            try:
+                prompt_tokens = len(last_output.prompt_token_ids or [])
+            except Exception:
+                pass
+            try:
+                completion_tokens = len(last_output.outputs[0].token_ids or [])
+            except Exception:
+                pass
+
+        # Extract logprobs if requested
+        logprobs_bytes = b""
+        if last_output is not None and request.Logprobs > 0:
+            try:
+                lp = last_output.outputs[0].logprobs
+                if lp:
+                    logprobs_data = {"content": []}
+                    for token_lp_dict in lp:
+                        if token_lp_dict:
+                            first_tok_id, first_lp = next(iter(token_lp_dict.items()))
+                            logprobs_data["content"].append({
+                                "token": getattr(first_lp, "decoded_token", str(first_tok_id)),
+                                "logprob": first_lp.logprob,
+                            })
+                    logprobs_bytes = json.dumps(logprobs_data).encode("utf-8")
+            except Exception as e:
+                print(f"Logprobs extraction error: {e}", file=sys.stderr)
+
+        chat_delta = backend_pb2.ChatDelta(
+            content=content,
+            reasoning_content=reasoning_content,
+            tool_calls=tool_calls_proto,
+        )
+
+        if streaming:
+            # Final chunk with structured data
+            yield backend_pb2.Reply(
+                message=b"",
+                prompt_tokens=prompt_tokens,
+                tokens=completion_tokens,
+                chat_deltas=[chat_delta],
+                logprobs=logprobs_bytes,
+            )
+            return
+
+        # Non-streaming: single Reply with everything
+        yield backend_pb2.Reply(
+            message=bytes(content, encoding='utf-8'),
+            prompt_tokens=prompt_tokens,
+            tokens=completion_tokens,
+            chat_deltas=[chat_delta],
+            logprobs=logprobs_bytes,
+        )
 
     def load_image(self, image_path: str):
         """
diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh
index 7dcd29db4..cf6fa7efe 100755
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -26,20 +26,43 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
     EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi
 
-# We don't embed this into the images as it is a large dependency and not always needed.
-# Besides, the speed inference are not actually usable in the current state for production use-cases.
-if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then
-        ensureVenv
-        # https://docs.vllm.ai/en/v0.6.1/getting_started/cpu-installation.html
-        if [ ! -d vllm ]; then
-            git clone https://github.com/vllm-project/vllm
-        fi
-        pushd vllm
-            uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.68.1 protobuf bitsandbytes
-            uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
-            VLLM_TARGET_DEVICE=cpu python setup.py install
-        popd
-        rm -rf vllm
-    else
-        installRequirements
+# CPU builds need unsafe-best-match so the +cpu torch pin in requirements-cpu.txt
+# resolves from the pytorch CPU wheel index while other deps still come from pypi.
+if [ "x${BUILD_PROFILE}" == "xcpu" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
+fi
+
+# FROM_SOURCE=true on a CPU build skips the prebuilt vllm wheel in
+# requirements-cpu-after.txt and compiles vllm locally against the host's
+# actual CPU. Not used by default because it takes ~30-40 minutes, but
+# kept here for hosts where the prebuilt wheel SIGILLs (CPU without the
+# required SIMD baseline, e.g. AVX-512 VNNI/BF16). The default build
+# installs the prebuilt wheel instead.
+if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then
+    # Temporarily hide the prebuilt wheel so installRequirements doesn't
+    # pull it — the rest of the requirements files (base deps, torch,
+    # transformers) are still installed normally.
+    _cpu_after="${backend_dir}/requirements-cpu-after.txt"
+    _cpu_after_bak=""
+    if [ -f "${_cpu_after}" ]; then
+        _cpu_after_bak="${_cpu_after}.from-source.bak"
+        mv "${_cpu_after}" "${_cpu_after_bak}"
+    fi
+    installRequirements
+    if [ -n "${_cpu_after_bak}" ]; then
+        mv "${_cpu_after_bak}" "${_cpu_after}"
+    fi
+
+    # Build vllm from source against the installed torch.
+    # https://docs.vllm.ai/en/latest/getting_started/installation/cpu/
+    _vllm_src=$(mktemp -d)
+    trap 'rm -rf "${_vllm_src}"' EXIT
+    git clone --depth 1 https://github.com/vllm-project/vllm "${_vllm_src}/vllm"
+    pushd "${_vllm_src}/vllm"
+        uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm
+        # Respect pre-installed torch version — skip vllm's own requirements-build.txt torch pin.
+        VLLM_TARGET_DEVICE=cpu uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} --no-deps .
+    popd
+else
+    installRequirements
 fi
diff --git a/backend/python/vllm/package.sh b/backend/python/vllm/package.sh
new file mode 100755
index 000000000..3c4ba8c19
--- /dev/null
+++ b/backend/python/vllm/package.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+# Script to package runtime shared libraries for the vllm backend.
+#
+# The final Dockerfile.python stage is FROM scratch, so system libraries
+# must be explicitly copied into ${BACKEND}/lib so the backend can run on
+# any host without installing them. libbackend.sh automatically adds that
+# directory to LD_LIBRARY_PATH at run time.
+#
+# vllm's CPU C++ extension (vllm._C) dlopens libnuma.so.1 at import time;
+# if it's missing, the _C_utils torch ops are never registered and the
+# engine crashes with AttributeError on init_cpu_threads_env. libgomp is
+# used by torch's CPU kernels; on some stripped-down hosts it's also
+# absent, so we bundle it too.
+
+set -e
+
+CURDIR=$(dirname "$(realpath "$0")")
+LIB_DIR="${CURDIR}/lib"
+mkdir -p "${LIB_DIR}"
+
+copy_with_symlinks() {  # $1 = soname; copies the library into LIB_DIR under that name
+    local soname="$1"
+    local hit=""
+    for dir in /usr/lib/x86_64-linux-gnu /usr/lib/aarch64-linux-gnu /lib/x86_64-linux-gnu /lib/aarch64-linux-gnu /usr/lib /lib; do
+        if [ -e "${dir}/${soname}" ]; then
+            hit="${dir}/${soname}"
+            break  # first directory in the list wins
+        fi
+    done
+    if [ -z "${hit}" ]; then
+        echo "warning: ${soname} not found in standard lib paths" >&2
+        return 0  # best-effort: a missing library is a warning, packaging continues
+    fi
+    # Resolve symlinks to the real file, copy it, then recreate the soname as a symlink to it.
+    local real
+    real=$(readlink -f "${hit}")
+    cp -v "${real}" "${LIB_DIR}/"
+    local real_base
+    real_base=$(basename "${real}")
+    if [ "${real_base}" != "${soname}" ]; then
+        ln -sf "${real_base}" "${LIB_DIR}/${soname}"  # relative target, so the link survives moving LIB_DIR
+    fi
+}
+
+copy_with_symlinks libnuma.so.1
+copy_with_symlinks libgomp.so.1
+
+echo "vllm packaging completed successfully"
+ls -liah "${LIB_DIR}/"
diff --git a/backend/python/vllm/requirements-after.txt b/backend/python/vllm/requirements-after.txt
index 76f11f154..b5000e6ca 100644
--- a/backend/python/vllm/requirements-after.txt
+++ b/backend/python/vllm/requirements-after.txt
@@ -1 +1,2 @@
-vllm
\ No newline at end of file
+# vllm is installed per-acceleration in requirements-{profile}-after.txt
+# (cublas12, hipblas, intel, cpu)
diff --git a/backend/python/vllm/requirements-cpu-after.txt b/backend/python/vllm/requirements-cpu-after.txt
new file mode 100644
index 000000000..e5e4908f7
--- /dev/null
+++ b/backend/python/vllm/requirements-cpu-after.txt
@@ -0,0 +1,2 @@
+vllm @ https://github.com/vllm-project/vllm/releases/download/v0.14.1/vllm-0.14.1+cpu-cp38-abi3-manylinux_2_35_x86_64.whl ; platform_machine == "x86_64"
+vllm @ https://github.com/vllm-project/vllm/releases/download/v0.14.1/vllm-0.14.1+cpu-cp38-abi3-manylinux_2_35_aarch64.whl ; platform_machine == "aarch64"
diff --git a/backend/python/vllm/requirements-cpu.txt b/backend/python/vllm/requirements-cpu.txt
index 16c7cbac5..5eeb8a708 100644
--- a/backend/python/vllm/requirements-cpu.txt
+++ b/backend/python/vllm/requirements-cpu.txt
@@ -1,3 +1,6 @@
+--extra-index-url https://download.pytorch.org/whl/cpu
 accelerate
-torch==2.7.0
-transformers
\ No newline at end of file
+torch==2.9.1+cpu
+torchvision
+torchaudio
+transformers
diff --git a/backend/python/vllm/requirements-cublas12-after.txt b/backend/python/vllm/requirements-cublas12-after.txt
index 9251ba608..cab27c888 100644
--- a/backend/python/vllm/requirements-cublas12-after.txt
+++ b/backend/python/vllm/requirements-cublas12-after.txt
@@ -1 +1,2 @@
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.7cxx11abiTRUE-cp310-cp310-linux_x86_64.whl
+vllm
diff --git a/backend/python/vllm/requirements-hipblas-after.txt b/backend/python/vllm/requirements-hipblas-after.txt
new file mode 100644
index 000000000..e7a6c7781
--- /dev/null
+++ b/backend/python/vllm/requirements-hipblas-after.txt
@@ -0,0 +1 @@
+vllm
diff --git a/backend/python/vllm/requirements-intel-after.txt b/backend/python/vllm/requirements-intel-after.txt
new file mode 100644
index 000000000..e7a6c7781
--- /dev/null
+++ b/backend/python/vllm/requirements-intel-after.txt
@@ -0,0 +1 @@
+vllm
diff --git a/backend/python/vllm/test.py b/backend/python/vllm/test.py
index 827aa71a3..21aaf4cf7 100644
--- a/backend/python/vllm/test.py
+++ b/backend/python/vllm/test.py
@@ -122,6 +122,89 @@ class TestBackendServicer(unittest.TestCase):
             self.tearDown()
 
 
+    def test_messages_to_dicts(self):
+        """
+        Calls _messages_to_dicts directly (no gRPC) on user, assistant and tool messages and checks the resulting dicts.
+        """
+        import sys, os
+        sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))  # make the sibling backend module importable
+        from backend import BackendServicer
+        servicer = BackendServicer()
+        msgs = [
+            backend_pb2.Message(role="user", content="hello"),
+            backend_pb2.Message(
+                role="assistant",
+                content="",
+                tool_calls='[{"id":"call_1","type":"function","function":{"name":"foo","arguments":"{}"}}]',
+                reasoning_content="thinking...",
+            ),
+            backend_pb2.Message(role="tool", content="result", name="foo", tool_call_id="call_1"),
+        ]
+        result = servicer._messages_to_dicts(msgs)
+        self.assertEqual(len(result), 3)
+        self.assertEqual(result[0], {"role": "user", "content": "hello"})  # unset optional fields are omitted
+        self.assertEqual(result[1]["reasoning_content"], "thinking...")
+        self.assertIsInstance(result[1]["tool_calls"], list)  # the JSON string is decoded into a list
+        self.assertEqual(result[1]["tool_calls"][0]["id"], "call_1")
+        self.assertEqual(result[2]["tool_call_id"], "call_1")
+        self.assertEqual(result[2]["name"], "foo")
+
+    def test_parse_options(self):
+        """
+        Tests _parse_options: entries split on the first colon only, and entries without a colon are dropped.
+        """
+        import sys, os
+        sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))  # make the sibling backend module importable
+        from backend import BackendServicer
+        servicer = BackendServicer()
+        opts = servicer._parse_options([
+            "tool_parser:hermes",
+            "reasoning_parser:deepseek_r1",
+            "invalid_no_colon",
+            "key_with_colons:a:b:c",
+        ])
+        self.assertEqual(opts["tool_parser"], "hermes")
+        self.assertEqual(opts["reasoning_parser"], "deepseek_r1")
+        self.assertEqual(opts["key_with_colons"], "a:b:c")  # later colons stay in the value
+        self.assertNotIn("invalid_no_colon", opts)
+
+    def test_tokenize_string(self):
+        """
+        Loads facebook/opt-125m over gRPC, then checks TokenizeString returns a non-empty token list matching `length`.
+        """
+        try:
+            self.setUp()
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
+                self.assertTrue(response.success)
+                resp = stub.TokenizeString(backend_pb2.PredictOptions(Prompt="Hello world"))
+                self.assertGreater(resp.length, 0)
+                self.assertEqual(len(resp.tokens), resp.length)
+        except Exception as err:  # also catches AssertionError from the asserts above; details are only printed
+            print(err)
+            self.fail("TokenizeString service failed")
+        finally:
+            self.tearDown()
+
+    def test_free(self):
+        """
+        Loads facebook/opt-125m over gRPC, then checks that the Free RPC reports success.
+        """
+        try:
+            self.setUp()
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
+                self.assertTrue(response.success)
+                free_resp = stub.Free(backend_pb2.HealthMessage())
+                self.assertTrue(free_resp.success)
+        except Exception as err:  # also catches AssertionError from the asserts above; details are only printed
+            print(err)
+            self.fail("Free service failed")
+        finally:
+            self.tearDown()
+
     def test_embedding(self):
         """
         This method tests if the embeddings are generated successfully
diff --git a/core/config/backend_hooks.go b/core/config/backend_hooks.go
new file mode 100644
index 000000000..8b2403cbb
--- /dev/null
+++ b/core/config/backend_hooks.go
@@ -0,0 +1,30 @@
+package config
+
+// BackendDefaultsHook is invoked from ModelConfig.SetDefaults (via runBackendHooks) and may modify cfg.
+// By convention a hook should only fill in values the user has not already set.
+type BackendDefaultsHook func(cfg *ModelConfig, modelPath string)
+
+var backendHooks = map[string][]BackendDefaultsHook{}
+
+// RegisterBackendHook appends hook to the list registered under the given backend key.
+// Special keys:
+//   - "*"  = global catch-all, runs for EVERY backend (before specific hooks)
+//   - ""   = runs only when cfg.Backend is empty (auto-detect case)
+//   - "vllm", "llama-cpp" etc. = runs only for that specific backend
+//
+// Multiple hooks per key are supported; they run in registration order. No locking is done here.
+func RegisterBackendHook(backend string, hook BackendDefaultsHook) {
+	backendHooks[backend] = append(backendHooks[backend], hook)
+}
+
+// runBackendHooks runs the "*" (global) hooks first, then the hooks registered
+// for cfg.Backend ("" when the backend is empty). cfg.Backend is read after the
+// global hooks return, so a global hook that rewrites it changes the second step.
+func runBackendHooks(cfg *ModelConfig, modelPath string) {
+	for _, h := range backendHooks["*"] {
+		h(cfg, modelPath)
+	}
+	for _, h := range backendHooks[cfg.Backend] {
+		h(cfg, modelPath)
+	}
+}
diff --git a/core/config/guesser.go b/core/config/guesser.go
deleted file mode 100644
index e4ca5b141..000000000
--- a/core/config/guesser.go
+++ /dev/null
@@ -1,46 +0,0 @@
-package config
-
-import (
-	"os"
-	"path/filepath"
-
-	gguf "github.com/gpustack/gguf-parser-go"
-	"github.com/mudler/xlog"
-)
-
-func guessDefaultsFromFile(cfg *ModelConfig, modelPath string, defaultCtx int) {
-	if os.Getenv("LOCALAI_DISABLE_GUESSING") == "true" {
-		xlog.Debug("guessDefaultsFromFile: guessing disabled with LOCALAI_DISABLE_GUESSING")
-		return
-	}
-
-	if modelPath == "" {
-		xlog.Debug("guessDefaultsFromFile: modelPath is empty")
-		return
-	}
-
-	// We try to guess only if we don't have a template defined already
-	guessPath := filepath.Join(modelPath, cfg.ModelFileName())
-
-	defer func() {
-		if r := recover(); r != nil {
-			xlog.Error("guessDefaultsFromFile: panic while parsing gguf file")
-		}
-	}()
-
-	defer func() {
-		if cfg.ContextSize == nil {
-			if defaultCtx == 0 {
-				defaultCtx = defaultContextSize
-			}
-			cfg.ContextSize = &defaultCtx
-		}
-	}()
-
-	// try to parse the gguf file
-	f, err := gguf.ParseGGUFFile(guessPath)
-	if err == nil {
-		guessGGUFFromFile(cfg, f, defaultCtx)
-		return
-	}
-}
diff --git a/core/config/hooks_llamacpp.go b/core/config/hooks_llamacpp.go
new file mode 100644
index 000000000..7c2640cee
--- /dev/null
+++ b/core/config/hooks_llamacpp.go
@@ -0,0 +1,46 @@
+package config
+
+import (
+	"os"
+	"path/filepath"
+
+	gguf "github.com/gpustack/gguf-parser-go"
+	"github.com/mudler/xlog"
+)
+
+func init() {
+	// Register for both explicit llama-cpp and empty backend (auto-detect from GGUF file)
+	RegisterBackendHook("llama-cpp", llamaCppDefaults)
+	RegisterBackendHook("", llamaCppDefaults)
+}
+
+func llamaCppDefaults(cfg *ModelConfig, modelPath string) { // hook registered for "llama-cpp" and "" backends
+	if os.Getenv("LOCALAI_DISABLE_GUESSING") == "true" {
+		xlog.Debug("llamaCppDefaults: guessing disabled")
+		return // early return: the deferred ContextSize default below is never registered
+	}
+	if modelPath == "" {
+		return // same here: ContextSize is left untouched
+	}
+
+	guessPath := filepath.Join(modelPath, cfg.ModelFileName())
+
+	defer func() { // a panic raised while parsing is logged and swallowed
+		if r := recover(); r != nil {
+			xlog.Error("llamaCppDefaults: panic while parsing gguf file")
+		}
+	}()
+
+	// Runs on return, including after a panic: default ContextSize if it is still nil.
+	defer func() {
+		if cfg.ContextSize == nil {
+			ctx := defaultContextSize
+			cfg.ContextSize = &ctx
+		}
+	}()
+
+	f, err := gguf.ParseGGUFFile(guessPath)
+	if err == nil { // parse errors are ignored: the file simply gets no guessed defaults
+		guessGGUFFromFile(cfg, f, 0) // presumably 0 = no caller-supplied default context; confirm in guessGGUFFromFile
+	}
+}
diff --git a/core/config/hooks_test.go b/core/config/hooks_test.go
new file mode 100644
index 000000000..b97077564
--- /dev/null
+++ b/core/config/hooks_test.go
@@ -0,0 +1,114 @@
+package config_test
+
+import (
+	. "github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/schema"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("Backend hooks and parser defaults", func() {
+	Context("MatchParserDefaults", func() {
+		It("matches Qwen3 family", func() {
+			parsers := MatchParserDefaults("Qwen/Qwen3-8B")
+			Expect(parsers).NotTo(BeNil())
+			Expect(parsers["tool_parser"]).To(Equal("hermes"))
+			Expect(parsers["reasoning_parser"]).To(Equal("qwen3"))
+		})
+
+		It("matches Qwen3.5 with longest-prefix-first", func() {
+			parsers := MatchParserDefaults("Qwen/Qwen3.5-9B")
+			Expect(parsers).NotTo(BeNil())
+			Expect(parsers["tool_parser"]).To(Equal("qwen3_xml"))
+		})
+
+		It("matches Llama-3.3 not Llama-3.2", func() {
+			parsers := MatchParserDefaults("meta/Llama-3.3-70B-Instruct")
+			Expect(parsers).NotTo(BeNil())
+			Expect(parsers["tool_parser"]).To(Equal("llama3_json"))
+		})
+
+		It("matches deepseek-r1", func() {
+			parsers := MatchParserDefaults("deepseek-ai/DeepSeek-R1")
+			Expect(parsers).NotTo(BeNil())
+			Expect(parsers["reasoning_parser"]).To(Equal("deepseek_r1"))
+			Expect(parsers["tool_parser"]).To(Equal("deepseek_v3"))
+		})
+
+		It("returns nil for unknown families", func() {
+			Expect(MatchParserDefaults("acme/unknown-model-xyz")).To(BeNil())
+		})
+	})
+
+	Context("Backend hook registration and execution", func() {
+		It("runs registered hook for a backend", func() {
+			called := false
+			RegisterBackendHook("test-backend-hook", func(cfg *ModelConfig, modelPath string) {
+				called = true
+				cfg.Description = "modified-by-hook"
+			})
+
+			cfg := &ModelConfig{
+				Backend: "test-backend-hook",
+			}
+			// runBackendHooks is unexported, so this external test package drives it
+			// through the public SetDefaults entry point instead of calling it directly.
+			// No load options are passed, so the hook is invoked with an empty model path.
+			cfg.PredictionOptions = schema.PredictionOptions{}
+
+			// SetDefaults calls runBackendHooks internally.
+			cfg.SetDefaults()
+			Expect(called).To(BeTrue())
+			Expect(cfg.Description).To(Equal("modified-by-hook"))
+		})
+	})
+
+	Context("vllmDefaults hook", func() {
+		It("auto-sets parsers for known model families on vllm backend", func() {
+			cfg := &ModelConfig{
+				Backend: "vllm",
+				PredictionOptions: schema.PredictionOptions{
+					BasicModelRequest: schema.BasicModelRequest{
+						Model: "Qwen/Qwen3-8B",
+					},
+				},
+			}
+			cfg.SetDefaults()
+
+			foundTool := false
+			foundReasoning := false
+			for _, opt := range cfg.Options {
+				if opt == "tool_parser:hermes" {
+					foundTool = true
+				}
+				if opt == "reasoning_parser:qwen3" {
+					foundReasoning = true
+				}
+			}
+			Expect(foundTool).To(BeTrue())
+			Expect(foundReasoning).To(BeTrue())
+		})
+
+		It("does not override user-set tool_parser", func() {
+			cfg := &ModelConfig{
+				Backend: "vllm",
+				Options: []string{"tool_parser:custom"},
+				PredictionOptions: schema.PredictionOptions{
+					BasicModelRequest: schema.BasicModelRequest{
+						Model: "Qwen/Qwen3-8B",
+					},
+				},
+			}
+			cfg.SetDefaults()
+
+			count := 0
+			for _, opt := range cfg.Options {
+				if len(opt) >= len("tool_parser:") && opt[:len("tool_parser:")] == "tool_parser:" {
+					count++
+				}
+			}
+			Expect(count).To(Equal(1))
+		})
+	})
+})
diff --git a/core/config/hooks_vllm.go b/core/config/hooks_vllm.go
new file mode 100644
index 000000000..3f7abd9b3
--- /dev/null
+++ b/core/config/hooks_vllm.go
@@ -0,0 +1,85 @@
+package config
+
+import (
+	_ "embed"
+	"encoding/json"
+	"strings"
+
+	"github.com/mudler/xlog"
+)
+
+//go:embed parser_defaults.json
+var parserDefaultsJSON []byte
+
+type parserDefaultsData struct {
+	Families map[string]map[string]string `json:"families"`
+	Patterns []string                     `json:"patterns"`
+}
+
+var parsersData *parserDefaultsData
+
+func init() { // decodes the embedded parser_defaults.json and registers vllmDefaults
+	parsersData = &parserDefaultsData{}
+	if err := json.Unmarshal(parserDefaultsJSON, parsersData); err != nil {
+		xlog.Warn("failed to parse parser_defaults.json", "error", err) // non-fatal: with no Patterns loaded, MatchParserDefaults returns nil
+	}
+
+	RegisterBackendHook("vllm", vllmDefaults)
+	RegisterBackendHook("vllm-omni", vllmDefaults)
+}
+
+// MatchParserDefaults returns the parser defaults of the first pattern (in parser_defaults.json order) that is a
+// substring of the normalized modelID and has a Families entry, or nil. Used by the vllm hook and the importer.
+func MatchParserDefaults(modelID string) map[string]string {
+	if parsersData == nil || len(parsersData.Patterns) == 0 {
+		return nil
+	}
+	normalized := normalizeModelID(modelID)
+	for _, pattern := range parsersData.Patterns { // order matters: more specific patterns are listed first
+		if strings.Contains(normalized, pattern) {
+			if family, ok := parsersData.Families[pattern]; ok {
+				return family // shared map from parsersData, not a copy; callers must not mutate it
+			}
+		}
+	}
+	return nil
+}
+
+func vllmDefaults(cfg *ModelConfig, modelPath string) { // modelPath is not used; matching is by model ID
+	// Detect "tool_parser:" / "reasoning_parser:" entries already in Options; those are never overridden.
+	hasToolParser := false
+	hasReasoningParser := false
+	for _, opt := range cfg.Options {
+		if strings.HasPrefix(opt, "tool_parser:") {
+			hasToolParser = true
+		}
+		if strings.HasPrefix(opt, "reasoning_parser:") {
+			hasReasoningParser = true
+		}
+	}
+	if hasToolParser && hasReasoningParser { // both already present: nothing to fill in
+		return
+	}
+
+	// Match on cfg.Model first; fall back to cfg.Name only when Model matches no family.
+	parsers := MatchParserDefaults(cfg.Model)
+	if parsers == nil {
+		parsers = MatchParserDefaults(cfg.Name) // NOTE: the debug logs below still print cfg.Model
+	}
+	if parsers == nil {
+		return
+	}
+
+	if !hasToolParser {
+		if tp, ok := parsers["tool_parser"]; ok {
+			cfg.Options = append(cfg.Options, "tool_parser:"+tp)
+			xlog.Debug("[parser_defaults] auto-set tool_parser", "parser", tp, "model", cfg.Model)
+		}
+	}
+	if !hasReasoningParser {
+		if rp, ok := parsers["reasoning_parser"]; ok {
+			cfg.Options = append(cfg.Options, "reasoning_parser:"+rp)
+			xlog.Debug("[parser_defaults] auto-set reasoning_parser", "parser", rp, "model", cfg.Model)
+		}
+	}
+}
diff --git a/core/config/model_config.go b/core/config/model_config.go
index 5f1780b76..4185d4f3f 100644
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -497,7 +497,12 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 		cfg.Debug = &trueV
 	}
 
-	guessDefaultsFromFile(cfg, lo.modelPath, ctx)
+	// If a context size was provided via LoadOptions, apply it before hooks so they
+	// don't override it with their own defaults.
+	if ctx != 0 && cfg.ContextSize == nil {
+		cfg.ContextSize = &ctx
+	}
+	runBackendHooks(cfg, lo.modelPath)
 	cfg.syncKnownUsecasesFromString()
 }
 
diff --git a/core/config/parser_defaults.json b/core/config/parser_defaults.json
new file mode 100644
index 000000000..614e6df1e
--- /dev/null
+++ b/core/config/parser_defaults.json
@@ -0,0 +1,33 @@
+{
+  "families": {
+    "qwen3.5":       {"tool_parser": "qwen3_xml", "reasoning_parser": "qwen3"},
+    "qwen3-coder":   {"tool_parser": "qwen3_xml", "reasoning_parser": "qwen3"},
+    "qwen3":         {"tool_parser": "hermes",     "reasoning_parser": "qwen3"},
+    "qwen2.5":       {"tool_parser": "hermes"},
+    "qwq":           {"reasoning_parser": "deepseek_r1"},
+    "llama-4":       {"tool_parser": "llama4_pythonic"},
+    "llama-3.3":     {"tool_parser": "llama3_json"},
+    "llama-3.2":     {"tool_parser": "llama3_json"},
+    "llama-3.1":     {"tool_parser": "llama3_json"},
+    "mistral-nemo":  {"tool_parser": "mistral", "reasoning_parser": "mistral"},
+    "mistral-small": {"tool_parser": "mistral", "reasoning_parser": "mistral"},
+    "mistral-large": {"tool_parser": "mistral", "reasoning_parser": "mistral"},
+    "magistral":     {"tool_parser": "mistral", "reasoning_parser": "mistral"},
+    "deepseek-r1":   {"tool_parser": "deepseek_v3", "reasoning_parser": "deepseek_r1"},
+    "deepseek-v3":   {"tool_parser": "deepseek_v3", "reasoning_parser": "deepseek_v3"},
+    "glm-5":         {"tool_parser": "glm47"},
+    "glm-4":         {"tool_parser": "glm45", "reasoning_parser": "glm45"},
+    "gemma-4":       {"tool_parser": "gemma4", "reasoning_parser": "gemma4"},
+    "granite-4":     {"tool_parser": "granite4", "reasoning_parser": "granite"},
+    "minimax-m2.5":  {"tool_parser": "minimax_m2", "reasoning_parser": "minimax_m2"},
+    "minimax":       {"tool_parser": "minimax_m2", "reasoning_parser": "minimax_m2"},
+    "kimi-k2":       {"tool_parser": "kimi_k2", "reasoning_parser": "kimi_k2"},
+    "nemotron":      {"reasoning_parser": "nemotron_v3"},
+    "olmo":          {"tool_parser": "olmo3", "reasoning_parser": "olmo3"},
+    "ernie":         {"tool_parser": "ernie45", "reasoning_parser": "ernie45"},
+    "phi-4":         {"tool_parser": "phi4_mini_json"},
+    "gpt-oss":       {"tool_parser": "openai", "reasoning_parser": "openai_gptoss"},
+    "hermes":        {"tool_parser": "hermes"}
+  },
+  "patterns": ["qwen3.5","qwen3-coder","qwen3","qwen2.5","qwq","llama-4","llama-3.3","llama-3.2","llama-3.1","mistral-nemo","mistral-small","mistral-large","magistral","deepseek-r1","deepseek-v3","glm-5","glm-4","gemma-4","granite-4","minimax-m2.5","minimax","kimi-k2","nemotron","olmo","ernie","phi-4","gpt-oss","hermes"]
+}
diff --git a/core/gallery/importers/vllm.go b/core/gallery/importers/vllm.go
index 88baef1fe..886405169 100644
--- a/core/gallery/importers/vllm.go
+++ b/core/gallery/importers/vllm.go
@@ -88,6 +88,18 @@ func (i *VLLMImporter) Import(details Details) (gallery.ModelConfig, error) {
 	// Apply per-model-family inference parameter defaults
 	config.ApplyInferenceDefaults(&modelConfig, details.URI)
 
+	// Auto-detect tool_parser and reasoning_parser for known model families.
+	// Surfacing them in the generated YAML lets users see and edit the choices.
+	parsers := config.MatchParserDefaults(details.URI)
+	if parsers != nil {
+		if tp, ok := parsers["tool_parser"]; ok {
+			modelConfig.Options = append(modelConfig.Options, "tool_parser:"+tp)
+		}
+		if rp, ok := parsers["reasoning_parser"]; ok {
+			modelConfig.Options = append(modelConfig.Options, "reasoning_parser:"+rp)
+		}
+	}
+
 	data, err := yaml.Marshal(modelConfig)
 	if err != nil {
 		return gallery.ModelConfig{}, err
diff --git a/core/schema/message.go b/core/schema/message.go
index 79a30352e..24407165e 100644
--- a/core/schema/message.go
+++ b/core/schema/message.go
@@ -83,8 +83,12 @@ func (messages Messages) ToProto() []*proto.Message {
 			}
 		}
 
-		// Note: tool_call_id is not in schema.Message yet
-		// Reasoning field is now available in schema.Message but not yet in proto.Message
+		if message.ToolCallID != "" {
+			protoMessages[i].ToolCallId = message.ToolCallID
+		}
+		if message.Reasoning != nil {
+			protoMessages[i].ReasoningContent = *message.Reasoning
+		}
 	}
 	return protoMessages
 }
diff --git a/core/schema/message_test.go b/core/schema/message_test.go
index cd6f514e2..8ebf3fa05 100644
--- a/core/schema/message_test.go
+++ b/core/schema/message_test.go
@@ -237,6 +237,24 @@ var _ = Describe("LLM tests", func() {
 			Expect(protoMessages[0].Content).To(Equal(""))
 		})
 
+		It("should serialize ToolCallID and Reasoning fields", func() {
+			// Reasoning is a pointer field, so take the address of a local.
+			thought := "thinking..."
+			input := Messages{{
+				Role:       "tool",
+				Content:    "result",
+				ToolCallID: "call_123",
+				Reasoning:  &thought,
+			}}
+
+			converted := input.ToProto()
+
+			// Both optional fields must be carried over to the proto message.
+			Expect(converted).To(HaveLen(1))
+			Expect(converted[0].ToolCallId).To(Equal("call_123"))
+			Expect(converted[0].ReasoningContent).To(Equal("thinking..."))
+		})
+
 		It("should handle message with array content containing non-text parts", func() {
 			messages := Messages{
 				{
diff --git a/tests/e2e-backends/backend_test.go b/tests/e2e-backends/backend_test.go
index a800a7ab5..b6f59fd28 100644
--- a/tests/e2e-backends/backend_test.go
+++ b/tests/e2e-backends/backend_test.go
@@ -29,18 +29,30 @@ import (
 //
 //	BACKEND_TEST_MODEL_URL   HTTP(S) URL of a model file to download before the test.
 //	BACKEND_TEST_MODEL_FILE  Path to an already-available model file (skips download).
+//	BACKEND_TEST_MODEL_NAME  HuggingFace model id (e.g. "Qwen/Qwen2.5-0.5B-Instruct").
+//	                         Passed verbatim as ModelOptions.Model; backends like vllm
+//	                         resolve it themselves and no local file is downloaded.
 //
 // Optional:
 //
 //	BACKEND_TEST_CAPS        Comma-separated list of capabilities to exercise.
-//	                         Supported values: health, load, predict, stream, embeddings.
+//	                         Supported values: health, load, predict, stream,
+//	                         embeddings, tools.
 //	                         Defaults to "health,load,predict,stream".
 //	                         A backend that only does embeddings would set this to
 //	                         "health,load,embeddings"; an image/TTS backend that cannot
 //	                         be driven by a text prompt can set it to "health,load".
+//	                         "tools" asks the backend to extract a tool call from the
+//	                         model output into ChatDelta.tool_calls.
 //	BACKEND_TEST_PROMPT      Override the prompt used by predict/stream specs.
 //	BACKEND_TEST_CTX_SIZE    Override the context size passed to LoadModel (default 512).
 //	BACKEND_TEST_THREADS     Override Threads passed to LoadModel (default 4).
+//	BACKEND_TEST_OPTIONS     Comma-separated Options[] entries passed to LoadModel,
+//	                         e.g. "tool_parser:hermes,reasoning_parser:qwen3".
+//	BACKEND_TEST_TOOL_PROMPT Override the user prompt for the tools spec
+//	                         (default: "What's the weather like in Paris, France?").
+//	BACKEND_TEST_TOOL_NAME   Override the function name expected in the tool call
+//	                         (default: "get_weather").
 //
 // The suite is intentionally model-format-agnostic: it only ever passes the
 // file path to LoadModel, so GGUF, ONNX, safetensors, .bin etc. all work so
@@ -51,9 +63,12 @@ const (
 	capPredict    = "predict"
 	capStream     = "stream"
 	capEmbeddings = "embeddings"
+	capTools      = "tools"
 
-	defaultPrompt = "The capital of France is"
-	streamPrompt  = "Once upon a time"
+	defaultPrompt     = "The capital of France is"
+	streamPrompt      = "Once upon a time"
+	defaultToolPrompt = "What's the weather like in Paris, France?"
+	defaultToolName   = "get_weather"
 )
 
 func defaultCaps() map[string]bool {
@@ -87,12 +102,14 @@ var _ = Describe("Backend container", Ordered, func() {
 		caps      map[string]bool
 		workDir   string
 		binaryDir string
-		modelFile string
+		modelFile string // set when a local file is used
+		modelName string // set when a HuggingFace model id is used
 		addr      string
 		serverCmd *exec.Cmd
 		conn      *grpc.ClientConn
 		client    pb.BackendClient
 		prompt    string
+		options   []string
 	)
 
 	BeforeAll(func() {
@@ -101,8 +118,9 @@ var _ = Describe("Backend container", Ordered, func() {
 
 		modelURL := os.Getenv("BACKEND_TEST_MODEL_URL")
 		modelFile = os.Getenv("BACKEND_TEST_MODEL_FILE")
-		Expect(modelURL != "" || modelFile != "").To(BeTrue(),
-			"one of BACKEND_TEST_MODEL_URL or BACKEND_TEST_MODEL_FILE must be set")
+		modelName = os.Getenv("BACKEND_TEST_MODEL_NAME")
+		Expect(modelURL != "" || modelFile != "" || modelName != "").To(BeTrue(),
+			"one of BACKEND_TEST_MODEL_URL, BACKEND_TEST_MODEL_FILE, or BACKEND_TEST_MODEL_NAME must be set")
 
 		caps = parseCaps()
 		GinkgoWriter.Printf("Testing image=%q with capabilities=%v\n", image, keys(caps))
@@ -112,6 +130,15 @@ var _ = Describe("Backend container", Ordered, func() {
 			prompt = defaultPrompt
 		}
 
+		if raw := strings.TrimSpace(os.Getenv("BACKEND_TEST_OPTIONS")); raw != "" {
+			for _, opt := range strings.Split(raw, ",") {
+				opt = strings.TrimSpace(opt)
+				if opt != "" {
+					options = append(options, opt)
+				}
+			}
+		}
+
 		var err error
 		workDir, err = os.MkdirTemp("", "backend-e2e-*")
 		Expect(err).NotTo(HaveOccurred())
@@ -122,8 +149,8 @@ var _ = Describe("Backend container", Ordered, func() {
 		extractImage(image, binaryDir)
 		Expect(filepath.Join(binaryDir, "run.sh")).To(BeAnExistingFile())
 
-		// Download the model once if not provided.
-		if modelFile == "" {
+		// Download the model once if not provided and no HF name given.
+		if modelFile == "" && modelName == "" {
 			modelFile = filepath.Join(workDir, "model.bin")
 			downloadFile(modelURL, modelFile)
 		}
@@ -196,16 +223,27 @@ var _ = Describe("Backend container", Ordered, func() {
 		ctxSize := envInt32("BACKEND_TEST_CTX_SIZE", 512)
 		threads := envInt32("BACKEND_TEST_THREADS", 4)
 
-		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+		// Prefer a HuggingFace model id when provided (e.g. for vllm);
+		// otherwise fall back to a downloaded/local file path.
+		modelRef := modelFile
+		var modelPath string
+		if modelName != "" {
+			modelRef = modelName
+		} else {
+			modelPath = modelFile
+		}
+
+		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
 		defer cancel()
 		res, err := client.LoadModel(ctx, &pb.ModelOptions{
-			Model:       modelFile,
-			ModelFile:   modelFile,
+			Model:       modelRef,
+			ModelFile:   modelPath,
 			ContextSize: ctxSize,
 			Threads:     threads,
 			NGPULayers:  0,
 			MMap:        true,
 			NBatch:      128,
+			Options:     options,
 		})
 		Expect(err).NotTo(HaveOccurred())
 		Expect(res.GetSuccess()).To(BeTrue(), "LoadModel failed: %s", res.GetMessage())
@@ -275,6 +313,78 @@ var _ = Describe("Backend container", Ordered, func() {
 		Expect(res.GetEmbeddings()).NotTo(BeEmpty(), "Embedding returned empty vector")
 		GinkgoWriter.Printf("Embedding: %d dims\n", len(res.GetEmbeddings()))
 	})
+
+	It("extracts tool calls into ChatDelta", func() {
+		if enabled := caps[capTools]; !enabled {
+			Skip("tools capability not enabled")
+		}
+
+		// Environment overrides win; otherwise use the built-in weather scenario.
+		userPrompt, wantName := defaultToolPrompt, defaultToolName
+		if v := os.Getenv("BACKEND_TEST_TOOL_PROMPT"); v != "" {
+			userPrompt = v
+		}
+		if v := os.Getenv("BACKEND_TEST_TOOL_NAME"); v != "" {
+			wantName = v
+		}
+
+		toolSpec := fmt.Sprintf(`[{
+			"type": "function",
+			"function": {
+				"name": %q,
+				"description": "Get the current weather for a location",
+				"parameters": {
+					"type": "object",
+					"properties": {
+						"location": {
+							"type": "string",
+							"description": "The city and state, e.g. San Francisco, CA"
+						}
+					},
+					"required": ["location"]
+				}
+			}
+		}]`, wantName)
+
+		req := &pb.PredictOptions{
+			Messages: []*pb.Message{
+				{Role: "system", Content: "You are a helpful assistant. Use the provided tool when the user asks about weather."},
+				{Role: "user", Content: userPrompt},
+			},
+			Tools:                toolSpec,
+			ToolChoice:           "auto",
+			UseTokenizerTemplate: true,
+			Tokens:               200,
+			Temperature:          0.1,
+		}
+		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+		defer cancel()
+		reply, err := client.Predict(ctx, req)
+		Expect(err).NotTo(HaveOccurred())
+
+		// Flatten the tool calls of every delta: a backend may report them in one
+		// final delta or spread incremental pieces across several within one Reply.
+		deltas := reply.GetChatDeltas()
+		var calls []*pb.ToolCallDelta
+		for i := range deltas {
+			calls = append(calls, deltas[i].GetToolCalls()...)
+		}
+
+		raw := string(reply.GetMessage())
+		GinkgoWriter.Printf("Tool call: raw=%q deltas=%d tool_calls=%d\n", raw, len(deltas), len(calls))
+		Expect(calls).NotTo(BeEmpty(), "Predict did not return any ToolCallDelta. raw=%q", raw)
+
+		// At least one extracted call must target the advertised function.
+		found := false
+		for _, call := range calls {
+			name := call.GetName()
+			GinkgoWriter.Printf("  - idx=%d id=%q name=%q args=%q\n",
+				call.GetIndex(), call.GetId(), name, call.GetArguments())
+			found = found || name == wantName
+		}
+		Expect(found).To(BeTrue(),
+			"Expected a tool call named %q in ChatDelta.tool_calls", wantName)
+	})
 })
 
 // extractImage runs `docker create` + `docker export` to materialise the image