mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-19 14:17:21 -04:00
Compare commits
16 Commits
dependabot
...
feat/vllm-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cd56a05c3e | ||
|
|
d74cd56b14 | ||
|
|
017bdee4e4 | ||
|
|
c4dc495ea1 | ||
|
|
ea2bbabffd | ||
|
|
329df11989 | ||
|
|
c7f444d18b | ||
|
|
e7f406169a | ||
|
|
034a60bf76 | ||
|
|
c99188f106 | ||
|
|
c2f73a987e | ||
|
|
b215843807 | ||
|
|
6786f05c64 | ||
|
|
6cf8263c30 | ||
|
|
a30719f04a | ||
|
|
40b1c6f943 |
13
.github/workflows/backend.yml
vendored
13
.github/workflows/backend.yml
vendored
@@ -53,6 +53,19 @@ jobs:
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2204'
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-cpu-vllm'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'true'
|
||||
backend: "vllm"
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
|
||||
47
.github/workflows/test-extra.yml
vendored
47
.github/workflows/test-extra.yml
vendored
@@ -31,6 +31,7 @@ jobs:
|
||||
llama-cpp-quantization: ${{ steps.detect.outputs.llama-cpp-quantization }}
|
||||
llama-cpp: ${{ steps.detect.outputs.llama-cpp }}
|
||||
ik-llama-cpp: ${{ steps.detect.outputs.ik-llama-cpp }}
|
||||
vllm: ${{ steps.detect.outputs.vllm }}
|
||||
acestep-cpp: ${{ steps.detect.outputs.acestep-cpp }}
|
||||
qwen3-tts-cpp: ${{ steps.detect.outputs.qwen3-tts-cpp }}
|
||||
voxtral: ${{ steps.detect.outputs.voxtral }}
|
||||
@@ -501,6 +502,52 @@ jobs:
|
||||
- name: Build ik-llama-cpp backend image and run gRPC e2e tests
|
||||
run: |
|
||||
make test-extra-backend-ik-llama-cpp
|
||||
# tests-vllm-grpc is currently disabled in CI.
|
||||
#
|
||||
# The prebuilt vllm CPU wheel is compiled with AVX-512 VNNI/BF16
|
||||
# instructions, and neither ubuntu-latest nor the bigger-runner pool
|
||||
# offers a stable CPU baseline that supports them — runners come
|
||||
# back with different hardware between runs and SIGILL on import of
|
||||
# vllm.model_executor.models.registry. Compiling vllm from source
|
||||
# via FROM_SOURCE=true works on any CPU but takes 30-50 minutes per
|
||||
# run, which is too slow for a smoke test.
|
||||
#
|
||||
# The test itself (tests/e2e-backends + make test-extra-backend-vllm)
|
||||
# is fully working and validated locally on a host with the right
|
||||
# SIMD baseline. Run it manually with:
|
||||
#
|
||||
# make test-extra-backend-vllm
|
||||
#
|
||||
# Re-enable this job once we have a self-hosted runner label with
|
||||
# guaranteed AVX-512 VNNI/BF16 support, or once the vllm project
|
||||
# publishes a CPU wheel with a wider baseline.
|
||||
#
|
||||
# tests-vllm-grpc:
|
||||
# needs: detect-changes
|
||||
# if: needs.detect-changes.outputs.vllm == 'true' || needs.detect-changes.outputs.run-all == 'true'
|
||||
# runs-on: bigger-runner
|
||||
# timeout-minutes: 90
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v6
|
||||
# with:
|
||||
# submodules: true
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
# sudo apt-get update
|
||||
# sudo apt-get install -y --no-install-recommends \
|
||||
# make build-essential curl unzip ca-certificates git tar
|
||||
# - name: Setup Go
|
||||
# uses: actions/setup-go@v5
|
||||
# with:
|
||||
# go-version: '1.25.4'
|
||||
# - name: Free disk space
|
||||
# run: |
|
||||
# sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /opt/hostedtoolcache/CodeQL || true
|
||||
# df -h
|
||||
# - name: Build vllm (cpu) backend image and run gRPC e2e tests
|
||||
# run: |
|
||||
# make test-extra-backend-vllm
|
||||
tests-acestep-cpp:
|
||||
needs: detect-changes
|
||||
if: needs.detect-changes.outputs.acestep-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true'
|
||||
|
||||
25
Makefile
25
Makefile
@@ -466,8 +466,14 @@ test-extra: prepare-test-extra
|
||||
## BACKEND_IMAGE Required. Docker image to test, e.g. local-ai-backend:llama-cpp.
|
||||
## BACKEND_TEST_MODEL_URL URL of a model file to download and load.
|
||||
## BACKEND_TEST_MODEL_FILE Path to an already-downloaded model (skips download).
|
||||
## BACKEND_TEST_MODEL_NAME HuggingFace repo id (e.g. Qwen/Qwen2.5-0.5B-Instruct).
|
||||
## Use this instead of MODEL_URL for backends that
|
||||
## resolve HF model ids natively (vllm, vllm-omni).
|
||||
## BACKEND_TEST_CAPS Comma-separated capabilities, default "health,load,predict,stream".
|
||||
## Adds "tools" to exercise ChatDelta tool call extraction.
|
||||
## BACKEND_TEST_PROMPT Override the prompt used in predict/stream specs.
|
||||
## BACKEND_TEST_OPTIONS Comma-separated Options[] entries forwarded to LoadModel,
|
||||
## e.g. "tool_parser:hermes,reasoning_parser:qwen3".
|
||||
##
|
||||
## Direct usage (image already built, no docker-build-* dependency):
|
||||
##
|
||||
@@ -486,9 +492,13 @@ test-extra-backend: protogen-go
|
||||
BACKEND_IMAGE="$$BACKEND_IMAGE" \
|
||||
BACKEND_TEST_MODEL_URL="$${BACKEND_TEST_MODEL_URL:-$(BACKEND_TEST_MODEL_URL)}" \
|
||||
BACKEND_TEST_MODEL_FILE="$$BACKEND_TEST_MODEL_FILE" \
|
||||
BACKEND_TEST_MODEL_NAME="$$BACKEND_TEST_MODEL_NAME" \
|
||||
BACKEND_TEST_CAPS="$$BACKEND_TEST_CAPS" \
|
||||
BACKEND_TEST_PROMPT="$$BACKEND_TEST_PROMPT" \
|
||||
go test -v -timeout 15m ./tests/e2e-backends/...
|
||||
BACKEND_TEST_OPTIONS="$$BACKEND_TEST_OPTIONS" \
|
||||
BACKEND_TEST_TOOL_PROMPT="$$BACKEND_TEST_TOOL_PROMPT" \
|
||||
BACKEND_TEST_TOOL_NAME="$$BACKEND_TEST_TOOL_NAME" \
|
||||
go test -v -timeout 30m ./tests/e2e-backends/...
|
||||
|
||||
## Convenience wrappers: build the image, then exercise it.
|
||||
test-extra-backend-llama-cpp: docker-build-llama-cpp
|
||||
@@ -497,6 +507,18 @@ test-extra-backend-llama-cpp: docker-build-llama-cpp
|
||||
test-extra-backend-ik-llama-cpp: docker-build-ik-llama-cpp
|
||||
BACKEND_IMAGE=local-ai-backend:ik-llama-cpp $(MAKE) test-extra-backend
|
||||
|
||||
## vllm is resolved from a HuggingFace model id (no file download) and
|
||||
## exercises Predict + streaming + tool-call extraction via the hermes parser.
|
||||
## Requires a host CPU with the SIMD instructions the prebuilt vllm CPU
|
||||
## wheel was compiled against (AVX-512 VNNI/BF16); older CPUs will SIGILL
|
||||
## on import — on CI this means using the bigger-runner label.
|
||||
test-extra-backend-vllm: docker-build-vllm
|
||||
BACKEND_IMAGE=local-ai-backend:vllm \
|
||||
BACKEND_TEST_MODEL_NAME=Qwen/Qwen2.5-0.5B-Instruct \
|
||||
BACKEND_TEST_CAPS=health,load,predict,stream,tools \
|
||||
BACKEND_TEST_OPTIONS=tool_parser:hermes \
|
||||
$(MAKE) test-extra-backend
|
||||
|
||||
DOCKER_IMAGE?=local-ai
|
||||
IMAGE_TYPE?=core
|
||||
BASE_IMAGE?=ubuntu:24.04
|
||||
@@ -650,6 +672,7 @@ define docker-build-backend
|
||||
--build-arg CUDA_MINOR_VERSION=$(CUDA_MINOR_VERSION) \
|
||||
--build-arg UBUNTU_VERSION=$(UBUNTU_VERSION) \
|
||||
--build-arg UBUNTU_CODENAME=$(UBUNTU_CODENAME) \
|
||||
$(if $(FROM_SOURCE),--build-arg FROM_SOURCE=$(FROM_SOURCE)) \
|
||||
$(if $(filter true,$(5)),--build-arg BACKEND=$(1)) \
|
||||
-t local-ai-backend:$(1) -f backend/Dockerfile.$(2) $(3)
|
||||
endef
|
||||
|
||||
@@ -29,6 +29,7 @@ RUN apt-get update && \
|
||||
curl python3-pip \
|
||||
python-is-python3 \
|
||||
python3-dev llvm \
|
||||
libnuma1 libgomp1 \
|
||||
python3-venv make cmake && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
@@ -195,6 +196,12 @@ COPY backend/backend.proto /${BACKEND}/backend.proto
|
||||
COPY backend/python/common/ /${BACKEND}/common
|
||||
COPY scripts/build/package-gpu-libs.sh /package-gpu-libs.sh
|
||||
|
||||
# Optional per-backend source build toggle (e.g. vllm on CPU can set
|
||||
# FROM_SOURCE=true to compile against the build host SIMD instead of
|
||||
# pulling a prebuilt wheel). Default empty — most backends ignore it.
|
||||
ARG FROM_SOURCE=""
|
||||
ENV FROM_SOURCE=${FROM_SOURCE}
|
||||
|
||||
RUN cd /${BACKEND} && PORTABLE_PYTHON=true make
|
||||
|
||||
# Package GPU libraries into the backend's lib directory
|
||||
|
||||
@@ -197,6 +197,7 @@
|
||||
amd: "rocm-vllm"
|
||||
intel: "intel-vllm"
|
||||
nvidia-cuda-12: "cuda12-vllm"
|
||||
cpu: "cpu-vllm"
|
||||
- &vllm-omni
|
||||
name: "vllm-omni"
|
||||
license: apache-2.0
|
||||
@@ -1563,6 +1564,7 @@
|
||||
nvidia: "cuda12-vllm-development"
|
||||
amd: "rocm-vllm-development"
|
||||
intel: "intel-vllm-development"
|
||||
cpu: "cpu-vllm-development"
|
||||
- !!merge <<: *vllm
|
||||
name: "cuda12-vllm"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-vllm"
|
||||
@@ -1578,6 +1580,11 @@
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-vllm"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-intel-vllm
|
||||
- !!merge <<: *vllm
|
||||
name: "cpu-vllm"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-vllm"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-cpu-vllm
|
||||
- !!merge <<: *vllm
|
||||
name: "cuda12-vllm-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-vllm"
|
||||
@@ -1593,6 +1600,11 @@
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-vllm"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-intel-vllm
|
||||
- !!merge <<: *vllm
|
||||
name: "cpu-vllm-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-vllm"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-cpu-vllm
|
||||
# vllm-omni
|
||||
- !!merge <<: *vllm-omni
|
||||
name: "vllm-omni-development"
|
||||
|
||||
84
backend/python/common/vllm_utils.py
Normal file
84
backend/python/common/vllm_utils.py
Normal file
@@ -0,0 +1,84 @@
|
||||
"""Shared utilities for vLLM-based backends."""
|
||||
import json
|
||||
import sys
|
||||
|
||||
|
||||
def parse_options(options_list):
    """Parse an Options[] list of 'key:value' strings into a dict.

    Each entry is split on its first ':'; the key and the value are stripped
    of surrounding whitespace and the value is coerced with
    ``_coerce_option_value`` (bool, then int, then float, else str).

    Args:
        options_list: iterable of 'key:value' strings. ``None`` or an empty
            iterable yields an empty dict.

    Returns:
        dict mapping option names to coerced values. Entries without a ':'
        and entries whose key is empty after stripping are skipped. When a
        key appears more than once, the last occurrence wins.
    """
    opts = {}
    for opt in options_list or ():
        key, sep, value = opt.partition(":")
        key = key.strip()
        # No separator means "not a key:value pair"; an empty key can never
        # be looked up by name, so storing it would only be noise.
        if not sep or not key:
            continue
        opts[key] = _coerce_option_value(value.strip())
    return opts


def _coerce_option_value(value):
    """Convert an already-stripped option string to bool, int, float or str."""
    lowered = value.lower()
    if lowered in ("true", "false"):
        return lowered == "true"
    # int is tried before float so that "3" stays an int instead of 3.0.
    for convert in (int, float):
        try:
            return convert(value)
        except ValueError:
            continue
    return value
|
||||
|
||||
|
||||
def messages_to_dicts(proto_messages):
    """Convert proto Message objects to a list of dicts for apply_chat_template().

    Every output dict carries ``role`` and ``content`` (a falsy content
    becomes ""). ``name``, ``tool_call_id`` and ``reasoning_content`` are
    copied only when truthy. ``tool_calls`` is a JSON string on the message
    and is decoded into Python objects.

    Args:
        proto_messages: iterable of message objects exposing the attributes
            above. ``None`` or an empty iterable yields an empty list.

    Returns:
        list of dicts, one per input message, in input order.
    """
    result = []
    for msg in proto_messages or ():
        d = {"role": msg.role, "content": msg.content or ""}
        for field in ("name", "tool_call_id", "reasoning_content"):
            value = getattr(msg, field)
            if value:
                d[field] = value
        if msg.tool_calls:
            try:
                d["tool_calls"] = json.loads(msg.tool_calls)
            except json.JSONDecodeError as e:
                # Keep the message but drop the unparsable tool calls, and
                # say so instead of hiding it.
                print(
                    f"[vllm_utils] Ignoring malformed tool_calls JSON on {msg.role} message: {e}",
                    file=sys.stderr,
                )
        result.append(d)
    return result
|
||||
|
||||
|
||||
def setup_parsers(opts):
    """Return a (tool_parser_cls, reasoning_parser_cls) tuple from an opts dict.

    Reads the ``tool_parser`` and ``reasoning_parser`` keys of ``opts`` and
    resolves each name through vLLM's ToolParserManager /
    ReasoningParserManager. vLLM is imported lazily, only when a name was
    actually requested.

    Args:
        opts: dict of parsed options; ``None`` is treated as empty.

    Returns:
        Tuple of two entries, each either the resolved parser class or None.
        An entry is None when the option is absent or falsy, or when the
        import or lookup raised (the failure is reported on stderr).
    """
    opts = opts or {}
    tool_parser_cls = _load_parser(
        "tool_parser", opts.get("tool_parser"), _resolve_tool_parser
    )
    reasoning_parser_cls = _load_parser(
        "reasoning_parser", opts.get("reasoning_parser"), _resolve_reasoning_parser
    )
    return tool_parser_cls, reasoning_parser_cls


def _resolve_tool_parser(name):
    """Look up a tool parser class by name via vLLM's ToolParserManager."""
    from vllm.tool_parsers import ToolParserManager
    return ToolParserManager.get_tool_parser(name)


def _resolve_reasoning_parser(name):
    """Look up a reasoning parser class by name via vLLM's ReasoningParserManager."""
    from vllm.reasoning import ReasoningParserManager
    return ReasoningParserManager.get_reasoning_parser(name)


def _load_parser(kind, name, resolve):
    """Resolve one parser by name, logging the outcome; None on failure or no name."""
    if not name:
        return None
    try:
        parser_cls = resolve(name)
    except Exception as e:
        # Best-effort: a missing vLLM or an unknown parser name must not
        # abort model loading.
        print(f"[vllm_utils] Failed to load {kind} {name}: {e}", file=sys.stderr)
        return None
    print(f"[vllm_utils] Loaded {kind}: {name}", file=sys.stderr)
    return parser_cls
|
||||
@@ -17,6 +17,8 @@ import time
|
||||
import os
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
import gc
|
||||
|
||||
from PIL import Image
|
||||
import torch
|
||||
@@ -30,6 +32,7 @@ import grpc
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
|
||||
from grpc_auth import get_auth_interceptors
|
||||
from vllm_utils import parse_options, messages_to_dicts, setup_parsers
|
||||
|
||||
|
||||
from vllm_omni.entrypoints.omni import Omni
|
||||
@@ -148,23 +151,20 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
def LoadModel(self, request, context):
|
||||
try:
|
||||
# CPU detection: if no CUDA, default vLLM target device to CPU.
|
||||
try:
|
||||
if not torch.cuda.is_available():
|
||||
os.environ.setdefault("VLLM_TARGET_DEVICE", "cpu")
|
||||
os.environ.setdefault("VLLM_CPU_KVCACHE_SPACE", "4")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
print(f"Loading model {request.Model}...", file=sys.stderr)
|
||||
print(f"Request {request}", file=sys.stderr)
|
||||
|
||||
# Parse options from request.Options (key:value pairs)
|
||||
self.options = {}
|
||||
for opt in request.Options:
|
||||
if ":" not in opt:
|
||||
continue
|
||||
key, value = opt.split(":", 1)
|
||||
# Convert value to appropriate type
|
||||
if is_float(value):
|
||||
value = float(value)
|
||||
elif is_int(value):
|
||||
value = int(value)
|
||||
elif value.lower() in ["true", "false"]:
|
||||
value = value.lower() == "true"
|
||||
self.options[key] = value
|
||||
# Parse options from request.Options using shared helper
|
||||
self.options = parse_options(request.Options)
|
||||
opts = self.options
|
||||
|
||||
print(f"Options: {self.options}", file=sys.stderr)
|
||||
|
||||
@@ -244,6 +244,24 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
omni_kwargs["max_model_len"] = request.MaxModelLen
|
||||
|
||||
self.omni = Omni(**omni_kwargs)
|
||||
|
||||
# Load tokenizer for LLM/TTS so chat templates work
|
||||
if self.model_type in ("llm", "tts"):
|
||||
try:
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
self.tokenizer = get_tokenizer(
|
||||
request.Model,
|
||||
trust_remote_code=opts.get("trust_remote_code", False),
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"Failed to load tokenizer: {e}", file=sys.stderr)
|
||||
self.tokenizer = None
|
||||
else:
|
||||
self.tokenizer = None
|
||||
|
||||
# Setup optional tool / reasoning parsers
|
||||
self.tool_parser_cls, self.reasoning_parser_cls = setup_parsers(opts)
|
||||
|
||||
print("Model loaded successfully", file=sys.stderr)
|
||||
return backend_pb2.Result(message="Model loaded successfully", success=True)
|
||||
|
||||
@@ -466,14 +484,32 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
# Extract prompt
|
||||
if request.Prompt:
|
||||
prompt = request.Prompt
|
||||
elif request.Messages and request.UseTokenizerTemplate:
|
||||
# Build prompt from messages (simplified - would need tokenizer for full template)
|
||||
prompt = ""
|
||||
for msg in request.Messages:
|
||||
role = msg.role
|
||||
content = msg.content
|
||||
prompt += f"<|im_start|>{role}\n{content}<|im_end|>\n"
|
||||
prompt += "<|im_start|>assistant\n"
|
||||
elif request.Messages:
|
||||
if getattr(self, "tokenizer", None) is not None:
|
||||
messages_dicts = messages_to_dicts(request.Messages)
|
||||
template_kwargs = {"tokenize": False, "add_generation_prompt": True}
|
||||
if request.Tools:
|
||||
try:
|
||||
template_kwargs["tools"] = json.loads(request.Tools)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
try:
|
||||
if request.Metadata.get("enable_thinking", "").lower() == "true":
|
||||
template_kwargs["enable_thinking"] = True
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
prompt = self.tokenizer.apply_chat_template(messages_dicts, **template_kwargs)
|
||||
except TypeError:
|
||||
prompt = self.tokenizer.apply_chat_template(
|
||||
messages_dicts, tokenize=False, add_generation_prompt=True
|
||||
)
|
||||
else:
|
||||
# Fallback: basic template
|
||||
prompt = ""
|
||||
for msg in request.Messages:
|
||||
prompt += f"<|im_start|>{msg.role}\n{msg.content}<|im_end|>\n"
|
||||
prompt += "<|im_start|>assistant\n"
|
||||
else:
|
||||
yield backend_pb2.Reply(message=bytes("", 'utf-8'))
|
||||
return
|
||||
@@ -539,20 +575,79 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
# Call omni.generate() (returns generator for LLM mode)
|
||||
omni_generator = self.omni.generate([inputs], sampling_params_list)
|
||||
|
||||
# Extract text from outputs
|
||||
# Extract text from outputs and track token usage
|
||||
generated_text = ""
|
||||
prompt_tokens = 0
|
||||
completion_tokens = 0
|
||||
for stage_outputs in omni_generator:
|
||||
if stage_outputs.final_output_type == "text":
|
||||
for output in stage_outputs.request_output:
|
||||
text_output = output.outputs[0].text
|
||||
completion = output.outputs[0]
|
||||
text_output = completion.text
|
||||
# Track tokens when available
|
||||
try:
|
||||
if getattr(output, "prompt_token_ids", None) is not None:
|
||||
prompt_tokens = len(output.prompt_token_ids)
|
||||
if getattr(completion, "token_ids", None) is not None:
|
||||
completion_tokens = len(completion.token_ids)
|
||||
except Exception:
|
||||
pass
|
||||
if streaming:
|
||||
# Remove already sent text (vllm concatenates)
|
||||
delta_text = text_output.removeprefix(generated_text)
|
||||
yield backend_pb2.Reply(message=bytes(delta_text, encoding='utf-8'))
|
||||
yield backend_pb2.Reply(
|
||||
message=bytes(delta_text, encoding='utf-8'),
|
||||
tokens=completion_tokens,
|
||||
prompt_tokens=prompt_tokens,
|
||||
)
|
||||
generated_text = text_output
|
||||
|
||||
if not streaming:
|
||||
yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
|
||||
# Build optional ChatDelta with parsed reasoning / tool calls
|
||||
chat_deltas = []
|
||||
content_text = generated_text
|
||||
reasoning_text = ""
|
||||
tool_call_deltas = []
|
||||
|
||||
if self.reasoning_parser_cls is not None:
|
||||
try:
|
||||
parser = self.reasoning_parser_cls(self.tokenizer) if self.tokenizer else self.reasoning_parser_cls()
|
||||
reasoning_text, content_text = parser.extract_reasoning_content(content_text, request=None)
|
||||
reasoning_text = reasoning_text or ""
|
||||
content_text = content_text or ""
|
||||
except Exception as e:
|
||||
print(f"reasoning_parser failed: {e}", file=sys.stderr)
|
||||
|
||||
if self.tool_parser_cls is not None:
|
||||
try:
|
||||
parser = self.tool_parser_cls(self.tokenizer) if self.tokenizer else self.tool_parser_cls()
|
||||
tool_info = parser.extract_tool_calls(content_text, request=None)
|
||||
if getattr(tool_info, "tools_called", False):
|
||||
content_text = tool_info.content or ""
|
||||
for tc in tool_info.tool_calls or []:
|
||||
fn = getattr(tc, "function", None)
|
||||
tool_call_deltas.append(backend_pb2.ToolCallDelta(
|
||||
index=getattr(tc, "index", 0) or 0,
|
||||
id=getattr(tc, "id", "") or "",
|
||||
name=getattr(fn, "name", "") if fn else "",
|
||||
arguments=getattr(fn, "arguments", "") if fn else "",
|
||||
))
|
||||
except Exception as e:
|
||||
print(f"tool_parser failed: {e}", file=sys.stderr)
|
||||
|
||||
if self.tool_parser_cls is not None or self.reasoning_parser_cls is not None:
|
||||
chat_deltas.append(backend_pb2.ChatDelta(
|
||||
content=content_text,
|
||||
reasoning_content=reasoning_text,
|
||||
tool_calls=tool_call_deltas,
|
||||
))
|
||||
|
||||
yield backend_pb2.Reply(
|
||||
message=bytes(generated_text, encoding='utf-8'),
|
||||
tokens=completion_tokens,
|
||||
prompt_tokens=prompt_tokens,
|
||||
chat_deltas=chat_deltas,
|
||||
)
|
||||
|
||||
except Exception as err:
|
||||
print(f"Error in Predict: {err}", file=sys.stderr)
|
||||
@@ -647,6 +742,37 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
traceback.print_exc()
|
||||
return backend_pb2.Result(success=False, message=f"Error generating TTS: {err}")
|
||||
|
||||
def TokenizeString(self, request, context):
    """Encode request.Prompt with the loaded tokenizer.

    Returns a TokenizationResponse holding the token list and its length.
    When no tokenizer is set, the gRPC status becomes FAILED_PRECONDITION;
    when encoding raises, it becomes INTERNAL with the error text as
    details. Both failure paths return an empty TokenizationResponse.
    """
    tokenizer = getattr(self, 'tokenizer', None)
    if tokenizer is None:
        context.set_code(grpc.StatusCode.FAILED_PRECONDITION)
        context.set_details("Model/tokenizer not loaded")
        return backend_pb2.TokenizationResponse()

    try:
        token_ids = tokenizer.encode(request.Prompt)
        return backend_pb2.TokenizationResponse(length=len(token_ids), tokens=token_ids)
    except Exception as err:
        context.set_code(grpc.StatusCode.INTERNAL)
        context.set_details(str(err))
        return backend_pb2.TokenizationResponse()
|
||||
|
||||
def Free(self, request, context):
    """Drop the loaded engine and tokenizer and release cached memory.

    Deletes the ``omni`` and ``tokenizer`` attributes when present, clears
    both parser classes, runs the garbage collector and, if CUDA reports
    as available, empties the CUDA cache. Returns a Result whose success
    flag is False (with the error text as message) if anything outside the
    CUDA step raised.
    """
    try:
        for attr in ('omni', 'tokenizer'):
            if hasattr(self, attr):
                delattr(self, attr)
        self.tool_parser_cls = None
        self.reasoning_parser_cls = None
        gc.collect()

        # Errors from the CUDA cache cleanup are deliberately ignored.
        try:
            cuda_ready = torch.cuda.is_available()
            if cuda_ready:
                torch.cuda.empty_cache()
        except Exception:
            pass

        return backend_pb2.Result(success=True, message="Model freed")
    except Exception as err:
        return backend_pb2.Result(success=False, message=str(err))
|
||||
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
|
||||
|
||||
@@ -5,6 +5,9 @@ import argparse
|
||||
import signal
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import gc
|
||||
from typing import List
|
||||
from PIL import Image
|
||||
|
||||
@@ -26,6 +29,25 @@ from vllm.assets.video import VideoAsset
|
||||
import base64
|
||||
import io
|
||||
|
||||
# Version-compat imports — wrap in try/except for older vLLM versions
|
||||
try:
|
||||
from vllm.tool_parsers import ToolParserManager
|
||||
HAS_TOOL_PARSERS = True
|
||||
except ImportError:
|
||||
HAS_TOOL_PARSERS = False
|
||||
|
||||
try:
|
||||
from vllm.reasoning import ReasoningParserManager
|
||||
HAS_REASONING_PARSERS = True
|
||||
except ImportError:
|
||||
HAS_REASONING_PARSERS = False
|
||||
|
||||
try:
|
||||
from vllm.sampling_params import GuidedDecodingParams
|
||||
HAS_GUIDED_DECODING = True
|
||||
except ImportError:
|
||||
HAS_GUIDED_DECODING = False
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
|
||||
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
|
||||
@@ -69,6 +91,35 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
break
|
||||
return decoded_text
|
||||
|
||||
def _parse_options(self, options_list):
|
||||
"""Parse Options[] key:value string list into a dict."""
|
||||
opts = {}
|
||||
for opt in options_list:
|
||||
if ":" not in opt:
|
||||
continue
|
||||
key, value = opt.split(":", 1)
|
||||
opts[key.strip()] = value.strip()
|
||||
return opts
|
||||
|
||||
def _messages_to_dicts(self, messages):
|
||||
"""Convert proto Messages to list of dicts suitable for apply_chat_template()."""
|
||||
result = []
|
||||
for msg in messages:
|
||||
d = {"role": msg.role, "content": msg.content or ""}
|
||||
if msg.name:
|
||||
d["name"] = msg.name
|
||||
if msg.tool_call_id:
|
||||
d["tool_call_id"] = msg.tool_call_id
|
||||
if msg.reasoning_content:
|
||||
d["reasoning_content"] = msg.reasoning_content
|
||||
if msg.tool_calls:
|
||||
try:
|
||||
d["tool_calls"] = json.loads(msg.tool_calls)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
result.append(d)
|
||||
return result
|
||||
|
||||
def Health(self, request, context):
|
||||
"""
|
||||
Returns a health check message.
|
||||
@@ -132,15 +183,49 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
|
||||
|
||||
try:
|
||||
engine_model_config = await self.llm.get_model_config()
|
||||
self.tokenizer = get_tokenizer(
|
||||
engine_model_config.tokenizer,
|
||||
tokenizer_mode=engine_model_config.tokenizer_mode,
|
||||
trust_remote_code=engine_model_config.trust_remote_code,
|
||||
truncation_side="left",
|
||||
)
|
||||
# vLLM >= 0.14 removed get_model_config() on AsyncLLM; the tokenizer
|
||||
# is either already loaded on the engine or can be built from the
|
||||
# Model name directly.
|
||||
tokenizer = None
|
||||
if hasattr(self.llm, "get_tokenizer"):
|
||||
try:
|
||||
tokenizer = await self.llm.get_tokenizer()
|
||||
except TypeError:
|
||||
tokenizer = self.llm.get_tokenizer()
|
||||
except Exception:
|
||||
tokenizer = None
|
||||
if tokenizer is None and hasattr(self.llm, "tokenizer"):
|
||||
tokenizer = self.llm.tokenizer
|
||||
if tokenizer is None:
|
||||
tokenizer = get_tokenizer(
|
||||
request.Model,
|
||||
trust_remote_code=bool(request.TrustRemoteCode),
|
||||
truncation_side="left",
|
||||
)
|
||||
self.tokenizer = tokenizer
|
||||
except Exception as err:
|
||||
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
|
||||
|
||||
# Parse options for parser selection
|
||||
opts = self._parse_options(request.Options)
|
||||
|
||||
# Instantiate tool/reasoning parser classes (they'll be instantiated per-request with tokenizer)
|
||||
self.tool_parser_cls = None
|
||||
self.reasoning_parser_cls = None
|
||||
if HAS_TOOL_PARSERS and opts.get("tool_parser"):
|
||||
try:
|
||||
self.tool_parser_cls = ToolParserManager.get_tool_parser(opts["tool_parser"])
|
||||
print(f"Loaded tool_parser: {opts['tool_parser']}", file=sys.stderr)
|
||||
except Exception as e:
|
||||
print(f"Failed to load tool_parser {opts.get('tool_parser')}: {e}", file=sys.stderr)
|
||||
|
||||
if HAS_REASONING_PARSERS and opts.get("reasoning_parser"):
|
||||
try:
|
||||
self.reasoning_parser_cls = ReasoningParserManager.get_reasoning_parser(opts["reasoning_parser"])
|
||||
print(f"Loaded reasoning_parser: {opts['reasoning_parser']}", file=sys.stderr)
|
||||
except Exception as e:
|
||||
print(f"Failed to load reasoning_parser {opts.get('reasoning_parser')}: {e}", file=sys.stderr)
|
||||
|
||||
print("Model loaded successfully", file=sys.stderr)
|
||||
return backend_pb2.Result(message="Model loaded successfully", success=True)
|
||||
|
||||
@@ -197,6 +282,38 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
finally:
|
||||
await iterations.aclose()
|
||||
|
||||
async def TokenizeString(self, request, context):
    """Encode request.Prompt with the loaded tokenizer.

    Returns a TokenizationResponse holding the token list and its length.
    When no tokenizer is set, the gRPC status becomes FAILED_PRECONDITION;
    when encoding raises, it becomes INTERNAL with the error text as
    details. Both failure paths return an empty TokenizationResponse.
    """
    tokenizer = getattr(self, 'tokenizer', None)
    if tokenizer is None:
        context.set_code(grpc.StatusCode.FAILED_PRECONDITION)
        context.set_details("Model/tokenizer not loaded")
        return backend_pb2.TokenizationResponse()

    try:
        token_ids = tokenizer.encode(request.Prompt)
        return backend_pb2.TokenizationResponse(length=len(token_ids), tokens=token_ids)
    except Exception as err:
        context.set_code(grpc.StatusCode.INTERNAL)
        context.set_details(str(err))
        return backend_pb2.TokenizationResponse()
|
||||
|
||||
async def Free(self, request, context):
    """Drop the loaded engine and tokenizer and release cached memory.

    Deletes the ``llm`` and ``tokenizer`` attributes when present, clears
    both parser classes, runs the garbage collector and, if torch imports
    and CUDA reports as available, empties the CUDA cache. Returns a Result
    whose success flag is False (with the error text as message) if any
    step other than the torch import raised.
    """
    try:
        for attr in ('llm', 'tokenizer'):
            if hasattr(self, attr):
                delattr(self, attr)
        self.tool_parser_cls = None
        self.reasoning_parser_cls = None
        gc.collect()

        # Only a missing torch is tolerated here; other errors reach the
        # outer handler and are reported as a failed Result.
        try:
            import torch
            cuda_ready = torch.cuda.is_available()
            if cuda_ready:
                torch.cuda.empty_cache()
        except ImportError:
            pass

        return backend_pb2.Result(success=True, message="Model freed")
    except Exception as err:
        return backend_pb2.Result(success=False, message=str(err))
|
||||
|
||||
async def _predict(self, request, context, streaming=False):
|
||||
# Build the sampling parameters
|
||||
# NOTE: this must stay in sync with the vllm backend
|
||||
@@ -222,7 +339,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
"SkipSpecialTokens": "skip_special_tokens",
|
||||
"SpacesBetweenSpecialTokens": "spaces_between_special_tokens",
|
||||
"TruncatePromptTokens": "truncate_prompt_tokens",
|
||||
"GuidedDecoding": "guided_decoding",
|
||||
}
|
||||
|
||||
sampling_params = SamplingParams(top_p=0.9, max_tokens=200)
|
||||
@@ -233,6 +349,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
if value not in (None, 0, [], False):
|
||||
setattr(sampling_params, param_field, value)
|
||||
|
||||
# Guided decoding: use Grammar field to pass JSON schema or BNF
|
||||
if HAS_GUIDED_DECODING and request.Grammar:
|
||||
try:
|
||||
json.loads(request.Grammar) # valid JSON = JSON schema
|
||||
sampling_params.guided_decoding = GuidedDecodingParams(json=request.Grammar)
|
||||
except json.JSONDecodeError:
|
||||
sampling_params.guided_decoding = GuidedDecodingParams(grammar=request.Grammar)
|
||||
|
||||
# Extract image paths and process images
|
||||
prompt = request.Prompt
|
||||
|
||||
@@ -244,7 +368,27 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
# If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template
|
||||
if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
|
||||
prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
|
||||
messages_dicts = self._messages_to_dicts(request.Messages)
|
||||
template_kwargs = {"tokenize": False, "add_generation_prompt": True}
|
||||
|
||||
# Pass tools for tool calling
|
||||
if request.Tools:
|
||||
try:
|
||||
template_kwargs["tools"] = json.loads(request.Tools)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Enable thinking mode if requested
|
||||
if request.Metadata.get("enable_thinking", "").lower() == "true":
|
||||
template_kwargs["enable_thinking"] = True
|
||||
|
||||
try:
|
||||
prompt = self.tokenizer.apply_chat_template(messages_dicts, **template_kwargs)
|
||||
except TypeError:
|
||||
# Some tokenizers don't support tools/enable_thinking kwargs — retry without them
|
||||
prompt = self.tokenizer.apply_chat_template(
|
||||
messages_dicts, tokenize=False, add_generation_prompt=True
|
||||
)
|
||||
|
||||
# Generate text using the LLM engine
|
||||
request_id = random_uuid()
|
||||
@@ -265,25 +409,26 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
# Stream the results
|
||||
generated_text = ""
|
||||
last_output = None
|
||||
try:
|
||||
async for request_output in outputs:
|
||||
iteration_text = request_output.outputs[0].text
|
||||
last_output = request_output
|
||||
|
||||
if streaming:
|
||||
# Remove text already sent as vllm concatenates the text from previous yields
|
||||
delta_iteration_text = iteration_text.removeprefix(generated_text)
|
||||
# Send the partial result
|
||||
yield backend_pb2.Reply(message=bytes(delta_iteration_text, encoding='utf-8'))
|
||||
yield backend_pb2.Reply(
|
||||
message=bytes(delta_iteration_text, encoding='utf-8'),
|
||||
chat_deltas=[backend_pb2.ChatDelta(content=delta_iteration_text)],
|
||||
)
|
||||
|
||||
# Keep track of text generated
|
||||
generated_text = iteration_text
|
||||
finally:
|
||||
await outputs.aclose()
|
||||
|
||||
# If streaming, we already sent everything
|
||||
if streaming:
|
||||
return
|
||||
|
||||
# Remove the image files from /tmp folder
|
||||
for img_path in image_paths:
|
||||
try:
|
||||
@@ -291,8 +436,99 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
except Exception as e:
|
||||
print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
|
||||
|
||||
# Sending the final generated text
|
||||
yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
|
||||
# Parse reasoning and tool calls from final text using vLLM's native parsers
|
||||
content = generated_text
|
||||
reasoning_content = ""
|
||||
tool_calls_proto = []
|
||||
|
||||
if self.reasoning_parser_cls:
|
||||
try:
|
||||
rp = self.reasoning_parser_cls(self.tokenizer)
|
||||
r, c = rp.extract_reasoning(generated_text, request=None)
|
||||
reasoning_content = r or ""
|
||||
content = c if c is not None else generated_text
|
||||
except Exception as e:
|
||||
print(f"Reasoning parser error: {e}", file=sys.stderr)
|
||||
|
||||
if self.tool_parser_cls and request.Tools:
|
||||
try:
|
||||
tools = json.loads(request.Tools)
|
||||
# Some concrete parsers only accept the tokenizer; only the
|
||||
# abstract base declares the tools kwarg. Try with tools first,
|
||||
# fall back to tokenizer-only.
|
||||
try:
|
||||
tp = self.tool_parser_cls(self.tokenizer, tools=tools)
|
||||
except TypeError:
|
||||
tp = self.tool_parser_cls(self.tokenizer)
|
||||
info = tp.extract_tool_calls(content, request=None)
|
||||
if info.tools_called:
|
||||
content = info.content or ""
|
||||
for i, tc in enumerate(info.tool_calls):
|
||||
tool_calls_proto.append(backend_pb2.ToolCallDelta(
|
||||
index=i,
|
||||
id=tc.id,
|
||||
name=tc.function.name,
|
||||
arguments=tc.function.arguments,
|
||||
))
|
||||
except Exception as e:
|
||||
print(f"Tool parser error: {e}", file=sys.stderr)
|
||||
|
||||
# Extract token counts
|
||||
prompt_tokens = 0
|
||||
completion_tokens = 0
|
||||
if last_output is not None:
|
||||
try:
|
||||
prompt_tokens = len(last_output.prompt_token_ids or [])
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
completion_tokens = len(last_output.outputs[0].token_ids or [])
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Extract logprobs if requested
|
||||
logprobs_bytes = b""
|
||||
if last_output is not None and request.Logprobs > 0:
|
||||
try:
|
||||
lp = last_output.outputs[0].logprobs
|
||||
if lp:
|
||||
logprobs_data = {"content": []}
|
||||
for token_lp_dict in lp:
|
||||
if token_lp_dict:
|
||||
first_tok_id, first_lp = next(iter(token_lp_dict.items()))
|
||||
logprobs_data["content"].append({
|
||||
"token": getattr(first_lp, "decoded_token", str(first_tok_id)),
|
||||
"logprob": first_lp.logprob,
|
||||
})
|
||||
logprobs_bytes = json.dumps(logprobs_data).encode("utf-8")
|
||||
except Exception as e:
|
||||
print(f"Logprobs extraction error: {e}", file=sys.stderr)
|
||||
|
||||
chat_delta = backend_pb2.ChatDelta(
|
||||
content=content,
|
||||
reasoning_content=reasoning_content,
|
||||
tool_calls=tool_calls_proto,
|
||||
)
|
||||
|
||||
if streaming:
|
||||
# Final chunk with structured data
|
||||
yield backend_pb2.Reply(
|
||||
message=b"",
|
||||
prompt_tokens=prompt_tokens,
|
||||
tokens=completion_tokens,
|
||||
chat_deltas=[chat_delta],
|
||||
logprobs=logprobs_bytes,
|
||||
)
|
||||
return
|
||||
|
||||
# Non-streaming: single Reply with everything
|
||||
yield backend_pb2.Reply(
|
||||
message=bytes(content, encoding='utf-8'),
|
||||
prompt_tokens=prompt_tokens,
|
||||
tokens=completion_tokens,
|
||||
chat_deltas=[chat_delta],
|
||||
logprobs=logprobs_bytes,
|
||||
)
|
||||
|
||||
def load_image(self, image_path: str):
|
||||
"""
|
||||
|
||||
@@ -26,20 +26,43 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
|
||||
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
|
||||
fi
|
||||
|
||||
# We don't embed this into the images as it is a large dependency and not always needed.
|
||||
# Besides, inference speed is not actually usable in its current state for production use-cases.
|
||||
if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then
|
||||
ensureVenv
|
||||
# https://docs.vllm.ai/en/v0.6.1/getting_started/cpu-installation.html
|
||||
if [ ! -d vllm ]; then
|
||||
git clone https://github.com/vllm-project/vllm
|
||||
fi
|
||||
pushd vllm
|
||||
uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.68.1 protobuf bitsandbytes
|
||||
uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
VLLM_TARGET_DEVICE=cpu python setup.py install
|
||||
popd
|
||||
rm -rf vllm
|
||||
else
|
||||
installRequirements
|
||||
# CPU builds need unsafe-best-match to pull torch==2.10.0+cpu from the
|
||||
# pytorch test channel while still resolving transformers/vllm from pypi.
|
||||
if [ "x${BUILD_PROFILE}" == "xcpu" ]; then
|
||||
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
|
||||
fi
|
||||
|
||||
# FROM_SOURCE=true on a CPU build skips the prebuilt vllm wheel in
|
||||
# requirements-cpu-after.txt and compiles vllm locally against the host's
|
||||
# actual CPU. Not used by default because it takes ~30-40 minutes, but
|
||||
# kept here for hosts where the prebuilt wheel SIGILLs (CPU without the
|
||||
# required SIMD baseline, e.g. AVX-512 VNNI/BF16). Default CI uses a
|
||||
# bigger-runner with compatible hardware instead.
|
||||
if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then
|
||||
# Temporarily hide the prebuilt wheel so installRequirements doesn't
|
||||
# pull it — the rest of the requirements files (base deps, torch,
|
||||
# transformers) are still installed normally.
|
||||
_cpu_after="${backend_dir}/requirements-cpu-after.txt"
|
||||
_cpu_after_bak=""
|
||||
if [ -f "${_cpu_after}" ]; then
|
||||
_cpu_after_bak="${_cpu_after}.from-source.bak"
|
||||
mv "${_cpu_after}" "${_cpu_after_bak}"
|
||||
fi
|
||||
installRequirements
|
||||
if [ -n "${_cpu_after_bak}" ]; then
|
||||
mv "${_cpu_after_bak}" "${_cpu_after}"
|
||||
fi
|
||||
|
||||
# Build vllm from source against the installed torch.
|
||||
# https://docs.vllm.ai/en/latest/getting_started/installation/cpu/
|
||||
_vllm_src=$(mktemp -d)
|
||||
trap 'rm -rf "${_vllm_src}"' EXIT
|
||||
git clone --depth 1 https://github.com/vllm-project/vllm "${_vllm_src}/vllm"
|
||||
pushd "${_vllm_src}/vllm"
|
||||
uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm
|
||||
# Respect pre-installed torch version — skip vllm's own requirements-build.txt torch pin.
|
||||
VLLM_TARGET_DEVICE=cpu uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} --no-deps .
|
||||
popd
|
||||
else
|
||||
installRequirements
|
||||
fi
|
||||
|
||||
49
backend/python/vllm/package.sh
Executable file
49
backend/python/vllm/package.sh
Executable file
@@ -0,0 +1,49 @@
|
||||
#!/bin/bash
|
||||
# Script to package runtime shared libraries for the vllm backend.
|
||||
#
|
||||
# The final Dockerfile.python stage is FROM scratch, so system libraries
|
||||
# must be explicitly copied into ${BACKEND}/lib so the backend can run on
|
||||
# any host without installing them. libbackend.sh automatically adds that
|
||||
# directory to LD_LIBRARY_PATH at run time.
|
||||
#
|
||||
# vllm's CPU C++ extension (vllm._C) dlopens libnuma.so.1 at import time;
|
||||
# if it's missing, the _C_utils torch ops are never registered and the
|
||||
# engine crashes with AttributeError on init_cpu_threads_env. libgomp is
|
||||
# used by torch's CPU kernels; on some stripped-down hosts it's also
|
||||
# absent, so we bundle it too.
|
||||
|
||||
set -e
|
||||
|
||||
CURDIR=$(dirname "$(realpath "$0")")
|
||||
LIB_DIR="${CURDIR}/lib"
|
||||
mkdir -p "${LIB_DIR}"
|
||||
|
||||
# copy_with_symlinks SONAME
#
# Looks for SONAME in a fixed list of system library directories (first hit
# wins), resolves it to the real file, copies that file into ${LIB_DIR}, and
# recreates SONAME as a relative symlink next to it when the real file has a
# different name.
#
# A missing library only produces a warning on stderr and returns 0, so the
# caller keeps going under `set -e`.
copy_with_symlinks() {
    local soname="$1"
    local hit=""
    # Keep the loop variable local so it does not leak into the caller's scope.
    local dir
    for dir in /usr/lib/x86_64-linux-gnu /usr/lib/aarch64-linux-gnu /lib/x86_64-linux-gnu /lib/aarch64-linux-gnu /usr/lib /lib; do
        if [ -e "${dir}/${soname}" ]; then
            hit="${dir}/${soname}"
            break
        fi
    done
    if [ -z "${hit}" ]; then
        echo "warning: ${soname} not found in standard lib paths" >&2
        return 0
    fi
    # Follow the symlink to the real file, copy it, then recreate the symlink.
    local real
    real=$(readlink -f "${hit}")
    cp -v "${real}" "${LIB_DIR}/"
    local real_base
    real_base=$(basename "${real}")
    if [ "${real_base}" != "${soname}" ]; then
        ln -sf "${real_base}" "${LIB_DIR}/${soname}"
    fi
}
|
||||
|
||||
copy_with_symlinks libnuma.so.1
|
||||
copy_with_symlinks libgomp.so.1
|
||||
|
||||
echo "vllm packaging completed successfully"
|
||||
ls -liah "${LIB_DIR}/"
|
||||
@@ -1 +1,2 @@
|
||||
vllm
|
||||
# vllm is installed per-acceleration in requirements-{profile}-after.txt
|
||||
# (cublas12, hipblas, intel, cpu)
|
||||
|
||||
2
backend/python/vllm/requirements-cpu-after.txt
Normal file
2
backend/python/vllm/requirements-cpu-after.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
vllm @ https://github.com/vllm-project/vllm/releases/download/v0.14.1/vllm-0.14.1+cpu-cp38-abi3-manylinux_2_35_x86_64.whl ; platform_machine == "x86_64"
|
||||
vllm @ https://github.com/vllm-project/vllm/releases/download/v0.14.1/vllm-0.14.1+cpu-cp38-abi3-manylinux_2_35_aarch64.whl ; platform_machine == "aarch64"
|
||||
@@ -1,3 +1,6 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
accelerate
|
||||
torch==2.7.0
|
||||
transformers
|
||||
torch==2.9.1+cpu
|
||||
torchvision
|
||||
torchaudio
|
||||
transformers
|
||||
|
||||
@@ -1 +1,2 @@
|
||||
https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.7cxx11abiTRUE-cp310-cp310-linux_x86_64.whl
|
||||
vllm
|
||||
|
||||
1
backend/python/vllm/requirements-hipblas-after.txt
Normal file
1
backend/python/vllm/requirements-hipblas-after.txt
Normal file
@@ -0,0 +1 @@
|
||||
vllm
|
||||
1
backend/python/vllm/requirements-intel-after.txt
Normal file
1
backend/python/vllm/requirements-intel-after.txt
Normal file
@@ -0,0 +1 @@
|
||||
vllm
|
||||
@@ -122,6 +122,89 @@ class TestBackendServicer(unittest.TestCase):
|
||||
self.tearDown()
|
||||
|
||||
|
||||
def test_messages_to_dicts(self):
    """
    Checks that _messages_to_dicts turns a list of proto Messages into dicts,
    keeping reasoning_content, decoding tool_calls into a list, and carrying
    tool_call_id / name through for tool messages.
    """
    import sys, os

    # Put this file's directory on sys.path so `backend` can be imported.
    here = os.path.dirname(os.path.abspath(__file__))
    sys.path.insert(0, here)
    from backend import BackendServicer

    servicer = BackendServicer()

    user_msg = backend_pb2.Message(role="user", content="hello")
    assistant_msg = backend_pb2.Message(
        role="assistant",
        content="",
        tool_calls='[{"id":"call_1","type":"function","function":{"name":"foo","arguments":"{}"}}]',
        reasoning_content="thinking...",
    )
    tool_msg = backend_pb2.Message(role="tool", content="result", name="foo", tool_call_id="call_1")

    converted = servicer._messages_to_dicts([user_msg, assistant_msg, tool_msg])
    self.assertEqual(len(converted), 3)

    # Plain user message: only role and content are expected.
    self.assertEqual(converted[0], {"role": "user", "content": "hello"})

    # Assistant message: reasoning kept, tool_calls decoded from its JSON string.
    assistant = converted[1]
    self.assertEqual(assistant["reasoning_content"], "thinking...")
    self.assertIsInstance(assistant["tool_calls"], list)
    self.assertEqual(assistant["tool_calls"][0]["id"], "call_1")

    # Tool message: the call id and function name are preserved.
    tool = converted[2]
    self.assertEqual(tool["tool_call_id"], "call_1")
    self.assertEqual(tool["name"], "foo")
|
||||
|
||||
def test_parse_options(self):
    """
    Checks that _parse_options splits "key:value" entries on the first colon
    only and leaves out entries that contain no colon.
    """
    import sys, os

    # Put this file's directory on sys.path so `backend` can be imported.
    here = os.path.dirname(os.path.abspath(__file__))
    sys.path.insert(0, here)
    from backend import BackendServicer

    servicer = BackendServicer()
    raw_options = [
        "tool_parser:hermes",
        "reasoning_parser:deepseek_r1",
        "invalid_no_colon",
        "key_with_colons:a:b:c",
    ]
    parsed = servicer._parse_options(raw_options)

    self.assertEqual(parsed["tool_parser"], "hermes")
    self.assertEqual(parsed["reasoning_parser"], "deepseek_r1")
    # Everything after the first colon belongs to the value.
    self.assertEqual(parsed["key_with_colons"], "a:b:c")
    self.assertNotIn("invalid_no_colon", parsed)
|
||||
|
||||
def test_tokenize_string(self):
    """
    Loads a small model over gRPC and checks that TokenizeString returns a
    non-empty token list whose size matches the reported length.
    """
    try:
        self.setUp()
        with grpc.insecure_channel("localhost:50051") as channel:
            stub = backend_pb2_grpc.BackendStub(channel)

            load_result = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
            self.assertTrue(load_result.success)

            tokenized = stub.TokenizeString(backend_pb2.PredictOptions(Prompt="Hello world"))
            self.assertGreater(tokenized.length, 0)
            self.assertEqual(len(tokenized.tokens), tokenized.length)
    except Exception as err:
        # Any failure above (including assertion failures) is reported here.
        print(err)
        self.fail("TokenizeString service failed")
    finally:
        self.tearDown()
|
||||
|
||||
def test_free(self):
    """
    Loads a small model over gRPC and checks that the Free RPC reports success.
    """
    try:
        self.setUp()
        with grpc.insecure_channel("localhost:50051") as channel:
            stub = backend_pb2_grpc.BackendStub(channel)

            load_result = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
            self.assertTrue(load_result.success)

            free_result = stub.Free(backend_pb2.HealthMessage())
            self.assertTrue(free_result.success)
    except Exception as err:
        # Any failure above (including assertion failures) is reported here.
        print(err)
        self.fail("Free service failed")
    finally:
        self.tearDown()
|
||||
|
||||
def test_embedding(self):
|
||||
"""
|
||||
This method tests if the embeddings are generated successfully
|
||||
|
||||
30
core/config/backend_hooks.go
Normal file
30
core/config/backend_hooks.go
Normal file
@@ -0,0 +1,30 @@
|
||||
package config
|
||||
|
||||
// BackendDefaultsHook is called during Prepare() and can modify cfg.
|
||||
// Only fills in values that are not already set by the user.
|
||||
type BackendDefaultsHook func(cfg *ModelConfig, modelPath string)
|
||||
|
||||
var backendHooks = map[string][]BackendDefaultsHook{}
|
||||
|
||||
// RegisterBackendHook registers a hook for a backend name.
|
||||
// Special keys:
|
||||
// - "*" = global catch-all, runs for EVERY backend (before specific hooks)
|
||||
// - "" = runs only when cfg.Backend is empty (auto-detect case)
|
||||
// - "vllm", "llama-cpp" etc. = runs only for that specific backend
|
||||
//
|
||||
// Multiple hooks per key are supported; they run in registration order.
|
||||
func RegisterBackendHook(backend string, hook BackendDefaultsHook) {
|
||||
backendHooks[backend] = append(backendHooks[backend], hook)
|
||||
}
|
||||
|
||||
// runBackendHooks executes hooks in order:
|
||||
// 1. "*" (global) hooks for every backend
|
||||
// 2. Backend-specific hooks for cfg.Backend (includes "" when backend is empty)
|
||||
func runBackendHooks(cfg *ModelConfig, modelPath string) {
|
||||
for _, h := range backendHooks["*"] {
|
||||
h(cfg, modelPath)
|
||||
}
|
||||
for _, h := range backendHooks[cfg.Backend] {
|
||||
h(cfg, modelPath)
|
||||
}
|
||||
}
|
||||
@@ -1,46 +0,0 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
gguf "github.com/gpustack/gguf-parser-go"
|
||||
"github.com/mudler/xlog"
|
||||
)
|
||||
|
||||
func guessDefaultsFromFile(cfg *ModelConfig, modelPath string, defaultCtx int) {
|
||||
if os.Getenv("LOCALAI_DISABLE_GUESSING") == "true" {
|
||||
xlog.Debug("guessDefaultsFromFile: guessing disabled with LOCALAI_DISABLE_GUESSING")
|
||||
return
|
||||
}
|
||||
|
||||
if modelPath == "" {
|
||||
xlog.Debug("guessDefaultsFromFile: modelPath is empty")
|
||||
return
|
||||
}
|
||||
|
||||
// We try to guess only if we don't have a template defined already
|
||||
guessPath := filepath.Join(modelPath, cfg.ModelFileName())
|
||||
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
xlog.Error("guessDefaultsFromFile: panic while parsing gguf file")
|
||||
}
|
||||
}()
|
||||
|
||||
defer func() {
|
||||
if cfg.ContextSize == nil {
|
||||
if defaultCtx == 0 {
|
||||
defaultCtx = defaultContextSize
|
||||
}
|
||||
cfg.ContextSize = &defaultCtx
|
||||
}
|
||||
}()
|
||||
|
||||
// try to parse the gguf file
|
||||
f, err := gguf.ParseGGUFFile(guessPath)
|
||||
if err == nil {
|
||||
guessGGUFFromFile(cfg, f, defaultCtx)
|
||||
return
|
||||
}
|
||||
}
|
||||
46
core/config/hooks_llamacpp.go
Normal file
46
core/config/hooks_llamacpp.go
Normal file
@@ -0,0 +1,46 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
gguf "github.com/gpustack/gguf-parser-go"
|
||||
"github.com/mudler/xlog"
|
||||
)
|
||||
|
||||
func init() {
|
||||
// Register for both explicit llama-cpp and empty backend (auto-detect from GGUF file)
|
||||
RegisterBackendHook("llama-cpp", llamaCppDefaults)
|
||||
RegisterBackendHook("", llamaCppDefaults)
|
||||
}
|
||||
|
||||
func llamaCppDefaults(cfg *ModelConfig, modelPath string) {
|
||||
if os.Getenv("LOCALAI_DISABLE_GUESSING") == "true" {
|
||||
xlog.Debug("llamaCppDefaults: guessing disabled")
|
||||
return
|
||||
}
|
||||
if modelPath == "" {
|
||||
return
|
||||
}
|
||||
|
||||
guessPath := filepath.Join(modelPath, cfg.ModelFileName())
|
||||
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
xlog.Error("llamaCppDefaults: panic while parsing gguf file")
|
||||
}
|
||||
}()
|
||||
|
||||
// Default context size if not set, regardless of whether GGUF parsing succeeds
|
||||
defer func() {
|
||||
if cfg.ContextSize == nil {
|
||||
ctx := defaultContextSize
|
||||
cfg.ContextSize = &ctx
|
||||
}
|
||||
}()
|
||||
|
||||
f, err := gguf.ParseGGUFFile(guessPath)
|
||||
if err == nil {
|
||||
guessGGUFFromFile(cfg, f, 0)
|
||||
}
|
||||
}
|
||||
114
core/config/hooks_test.go
Normal file
114
core/config/hooks_test.go
Normal file
@@ -0,0 +1,114 @@
|
||||
package config_test
|
||||
|
||||
import (
|
||||
. "github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("Backend hooks and parser defaults", func() {
|
||||
Context("MatchParserDefaults", func() {
|
||||
It("matches Qwen3 family", func() {
|
||||
parsers := MatchParserDefaults("Qwen/Qwen3-8B")
|
||||
Expect(parsers).NotTo(BeNil())
|
||||
Expect(parsers["tool_parser"]).To(Equal("hermes"))
|
||||
Expect(parsers["reasoning_parser"]).To(Equal("qwen3"))
|
||||
})
|
||||
|
||||
It("matches Qwen3.5 with longest-prefix-first", func() {
|
||||
parsers := MatchParserDefaults("Qwen/Qwen3.5-9B")
|
||||
Expect(parsers).NotTo(BeNil())
|
||||
Expect(parsers["tool_parser"]).To(Equal("qwen3_xml"))
|
||||
})
|
||||
|
||||
It("matches Llama-3.3 not Llama-3.2", func() {
|
||||
parsers := MatchParserDefaults("meta/Llama-3.3-70B-Instruct")
|
||||
Expect(parsers).NotTo(BeNil())
|
||||
Expect(parsers["tool_parser"]).To(Equal("llama3_json"))
|
||||
})
|
||||
|
||||
It("matches deepseek-r1", func() {
|
||||
parsers := MatchParserDefaults("deepseek-ai/DeepSeek-R1")
|
||||
Expect(parsers).NotTo(BeNil())
|
||||
Expect(parsers["reasoning_parser"]).To(Equal("deepseek_r1"))
|
||||
Expect(parsers["tool_parser"]).To(Equal("deepseek_v3"))
|
||||
})
|
||||
|
||||
It("returns nil for unknown families", func() {
|
||||
Expect(MatchParserDefaults("acme/unknown-model-xyz")).To(BeNil())
|
||||
})
|
||||
})
|
||||
|
||||
Context("Backend hook registration and execution", func() {
|
||||
It("runs registered hook for a backend", func() {
|
||||
called := false
|
||||
RegisterBackendHook("test-backend-hook", func(cfg *ModelConfig, modelPath string) {
|
||||
called = true
|
||||
cfg.Description = "modified-by-hook"
|
||||
})
|
||||
|
||||
cfg := &ModelConfig{
|
||||
Backend: "test-backend-hook",
|
||||
}
|
||||
// Use the public Prepare path indirectly is heavy; instead exercise via vllmDefaults
|
||||
// path, but here just call RegisterBackendHook + we know runBackendHooks is internal.
|
||||
// Verify by leveraging Prepare on a fresh ModelConfig with no model path.
|
||||
cfg.PredictionOptions = schema.PredictionOptions{}
|
||||
|
||||
// Trigger via Prepare with empty options; this calls runBackendHooks internally.
|
||||
cfg.SetDefaults()
|
||||
Expect(called).To(BeTrue())
|
||||
Expect(cfg.Description).To(Equal("modified-by-hook"))
|
||||
})
|
||||
})
|
||||
|
||||
Context("vllmDefaults hook", func() {
|
||||
It("auto-sets parsers for known model families on vllm backend", func() {
|
||||
cfg := &ModelConfig{
|
||||
Backend: "vllm",
|
||||
PredictionOptions: schema.PredictionOptions{
|
||||
BasicModelRequest: schema.BasicModelRequest{
|
||||
Model: "Qwen/Qwen3-8B",
|
||||
},
|
||||
},
|
||||
}
|
||||
cfg.SetDefaults()
|
||||
|
||||
foundTool := false
|
||||
foundReasoning := false
|
||||
for _, opt := range cfg.Options {
|
||||
if opt == "tool_parser:hermes" {
|
||||
foundTool = true
|
||||
}
|
||||
if opt == "reasoning_parser:qwen3" {
|
||||
foundReasoning = true
|
||||
}
|
||||
}
|
||||
Expect(foundTool).To(BeTrue())
|
||||
Expect(foundReasoning).To(BeTrue())
|
||||
})
|
||||
|
||||
It("does not override user-set tool_parser", func() {
|
||||
cfg := &ModelConfig{
|
||||
Backend: "vllm",
|
||||
Options: []string{"tool_parser:custom"},
|
||||
PredictionOptions: schema.PredictionOptions{
|
||||
BasicModelRequest: schema.BasicModelRequest{
|
||||
Model: "Qwen/Qwen3-8B",
|
||||
},
|
||||
},
|
||||
}
|
||||
cfg.SetDefaults()
|
||||
|
||||
count := 0
|
||||
for _, opt := range cfg.Options {
|
||||
if len(opt) >= len("tool_parser:") && opt[:len("tool_parser:")] == "tool_parser:" {
|
||||
count++
|
||||
}
|
||||
}
|
||||
Expect(count).To(Equal(1))
|
||||
})
|
||||
})
|
||||
})
|
||||
85
core/config/hooks_vllm.go
Normal file
85
core/config/hooks_vllm.go
Normal file
@@ -0,0 +1,85 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
_ "embed"
|
||||
"encoding/json"
|
||||
"strings"
|
||||
|
||||
"github.com/mudler/xlog"
|
||||
)
|
||||
|
||||
//go:embed parser_defaults.json
|
||||
var parserDefaultsJSON []byte
|
||||
|
||||
// parserDefaultsData is the decoded form of the embedded parser_defaults.json.
type parserDefaultsData struct {
	// Families maps a family pattern to its parser settings, keyed by option
	// name (for example "tool_parser" or "reasoning_parser").
	Families map[string]map[string]string `json:"families"`

	// Patterns lists the family patterns in the order they are tried.
	Patterns []string `json:"patterns"`
}
|
||||
|
||||
var parsersData *parserDefaultsData
|
||||
|
||||
func init() {
|
||||
parsersData = &parserDefaultsData{}
|
||||
if err := json.Unmarshal(parserDefaultsJSON, parsersData); err != nil {
|
||||
xlog.Warn("failed to parse parser_defaults.json", "error", err)
|
||||
}
|
||||
|
||||
RegisterBackendHook("vllm", vllmDefaults)
|
||||
RegisterBackendHook("vllm-omni", vllmDefaults)
|
||||
}
|
||||
|
||||
// MatchParserDefaults returns parser defaults for the best-matching model family.
|
||||
// Returns nil if no family matches. Used both at load time (via hook) and at import time.
|
||||
func MatchParserDefaults(modelID string) map[string]string {
|
||||
if parsersData == nil || len(parsersData.Patterns) == 0 {
|
||||
return nil
|
||||
}
|
||||
normalized := normalizeModelID(modelID)
|
||||
for _, pattern := range parsersData.Patterns {
|
||||
if strings.Contains(normalized, pattern) {
|
||||
if family, ok := parsersData.Families[pattern]; ok {
|
||||
return family
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func vllmDefaults(cfg *ModelConfig, modelPath string) {
|
||||
// Check if user already set tool_parser or reasoning_parser in Options
|
||||
hasToolParser := false
|
||||
hasReasoningParser := false
|
||||
for _, opt := range cfg.Options {
|
||||
if strings.HasPrefix(opt, "tool_parser:") {
|
||||
hasToolParser = true
|
||||
}
|
||||
if strings.HasPrefix(opt, "reasoning_parser:") {
|
||||
hasReasoningParser = true
|
||||
}
|
||||
}
|
||||
if hasToolParser && hasReasoningParser {
|
||||
return
|
||||
}
|
||||
|
||||
// Try matching against Model field, then Name
|
||||
parsers := MatchParserDefaults(cfg.Model)
|
||||
if parsers == nil {
|
||||
parsers = MatchParserDefaults(cfg.Name)
|
||||
}
|
||||
if parsers == nil {
|
||||
return
|
||||
}
|
||||
|
||||
if !hasToolParser {
|
||||
if tp, ok := parsers["tool_parser"]; ok {
|
||||
cfg.Options = append(cfg.Options, "tool_parser:"+tp)
|
||||
xlog.Debug("[parser_defaults] auto-set tool_parser", "parser", tp, "model", cfg.Model)
|
||||
}
|
||||
}
|
||||
if !hasReasoningParser {
|
||||
if rp, ok := parsers["reasoning_parser"]; ok {
|
||||
cfg.Options = append(cfg.Options, "reasoning_parser:"+rp)
|
||||
xlog.Debug("[parser_defaults] auto-set reasoning_parser", "parser", rp, "model", cfg.Model)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -497,7 +497,12 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
||||
cfg.Debug = &trueV
|
||||
}
|
||||
|
||||
guessDefaultsFromFile(cfg, lo.modelPath, ctx)
|
||||
// If a context size was provided via LoadOptions, apply it before hooks so they
|
||||
// don't override it with their own defaults.
|
||||
if ctx != 0 && cfg.ContextSize == nil {
|
||||
cfg.ContextSize = &ctx
|
||||
}
|
||||
runBackendHooks(cfg, lo.modelPath)
|
||||
cfg.syncKnownUsecasesFromString()
|
||||
}
|
||||
|
||||
|
||||
33
core/config/parser_defaults.json
Normal file
33
core/config/parser_defaults.json
Normal file
@@ -0,0 +1,33 @@
|
||||
{
|
||||
"families": {
|
||||
"qwen3.5": {"tool_parser": "qwen3_xml", "reasoning_parser": "qwen3"},
|
||||
"qwen3-coder": {"tool_parser": "qwen3_xml", "reasoning_parser": "qwen3"},
|
||||
"qwen3": {"tool_parser": "hermes", "reasoning_parser": "qwen3"},
|
||||
"qwen2.5": {"tool_parser": "hermes"},
|
||||
"qwq": {"reasoning_parser": "deepseek_r1"},
|
||||
"llama-4": {"tool_parser": "llama4_pythonic"},
|
||||
"llama-3.3": {"tool_parser": "llama3_json"},
|
||||
"llama-3.2": {"tool_parser": "llama3_json"},
|
||||
"llama-3.1": {"tool_parser": "llama3_json"},
|
||||
"mistral-nemo": {"tool_parser": "mistral", "reasoning_parser": "mistral"},
|
||||
"mistral-small": {"tool_parser": "mistral", "reasoning_parser": "mistral"},
|
||||
"mistral-large": {"tool_parser": "mistral", "reasoning_parser": "mistral"},
|
||||
"magistral": {"tool_parser": "mistral", "reasoning_parser": "mistral"},
|
||||
"deepseek-r1": {"tool_parser": "deepseek_v3", "reasoning_parser": "deepseek_r1"},
|
||||
"deepseek-v3": {"tool_parser": "deepseek_v3", "reasoning_parser": "deepseek_v3"},
|
||||
"glm-5": {"tool_parser": "glm47"},
|
||||
"glm-4": {"tool_parser": "glm45", "reasoning_parser": "glm45"},
|
||||
"gemma-4": {"tool_parser": "gemma4", "reasoning_parser": "gemma4"},
|
||||
"granite-4": {"tool_parser": "granite4", "reasoning_parser": "granite"},
|
||||
"minimax-m2.5": {"tool_parser": "minimax_m2", "reasoning_parser": "minimax_m2"},
|
||||
"minimax": {"tool_parser": "minimax_m2", "reasoning_parser": "minimax_m2"},
|
||||
"kimi-k2": {"tool_parser": "kimi_k2", "reasoning_parser": "kimi_k2"},
|
||||
"nemotron": {"reasoning_parser": "nemotron_v3"},
|
||||
"olmo": {"tool_parser": "olmo3", "reasoning_parser": "olmo3"},
|
||||
"ernie": {"tool_parser": "ernie45", "reasoning_parser": "ernie45"},
|
||||
"phi-4": {"tool_parser": "phi4_mini_json"},
|
||||
"gpt-oss": {"tool_parser": "openai", "reasoning_parser": "openai_gptoss"},
|
||||
"hermes": {"tool_parser": "hermes"}
|
||||
},
|
||||
"patterns": ["qwen3.5","qwen3-coder","qwen3","qwen2.5","qwq","llama-4","llama-3.3","llama-3.2","llama-3.1","mistral-nemo","mistral-small","mistral-large","magistral","deepseek-r1","deepseek-v3","glm-5","glm-4","gemma-4","granite-4","minimax-m2.5","minimax","kimi-k2","nemotron","olmo","ernie","phi-4","gpt-oss","hermes"]
|
||||
}
|
||||
@@ -88,6 +88,18 @@ func (i *VLLMImporter) Import(details Details) (gallery.ModelConfig, error) {
|
||||
// Apply per-model-family inference parameter defaults
|
||||
config.ApplyInferenceDefaults(&modelConfig, details.URI)
|
||||
|
||||
// Auto-detect tool_parser and reasoning_parser for known model families.
|
||||
// Surfacing them in the generated YAML lets users see and edit the choices.
|
||||
parsers := config.MatchParserDefaults(details.URI)
|
||||
if parsers != nil {
|
||||
if tp, ok := parsers["tool_parser"]; ok {
|
||||
modelConfig.Options = append(modelConfig.Options, "tool_parser:"+tp)
|
||||
}
|
||||
if rp, ok := parsers["reasoning_parser"]; ok {
|
||||
modelConfig.Options = append(modelConfig.Options, "reasoning_parser:"+rp)
|
||||
}
|
||||
}
|
||||
|
||||
data, err := yaml.Marshal(modelConfig)
|
||||
if err != nil {
|
||||
return gallery.ModelConfig{}, err
|
||||
|
||||
@@ -83,8 +83,12 @@ func (messages Messages) ToProto() []*proto.Message {
|
||||
}
|
||||
}
|
||||
|
||||
// Note: tool_call_id is not in schema.Message yet
|
||||
// Reasoning field is now available in schema.Message but not yet in proto.Message
|
||||
if message.ToolCallID != "" {
|
||||
protoMessages[i].ToolCallId = message.ToolCallID
|
||||
}
|
||||
if message.Reasoning != nil {
|
||||
protoMessages[i].ReasoningContent = *message.Reasoning
|
||||
}
|
||||
}
|
||||
return protoMessages
|
||||
}
|
||||
|
||||
@@ -237,6 +237,24 @@ var _ = Describe("LLM tests", func() {
|
||||
Expect(protoMessages[0].Content).To(Equal(""))
|
||||
})
|
||||
|
||||
It("should serialize ToolCallID and Reasoning fields", func() {
|
||||
reasoning := "thinking..."
|
||||
messages := Messages{
|
||||
{
|
||||
Role: "tool",
|
||||
Content: "result",
|
||||
ToolCallID: "call_123",
|
||||
Reasoning: &reasoning,
|
||||
},
|
||||
}
|
||||
|
||||
protoMessages := messages.ToProto()
|
||||
|
||||
Expect(protoMessages).To(HaveLen(1))
|
||||
Expect(protoMessages[0].ToolCallId).To(Equal("call_123"))
|
||||
Expect(protoMessages[0].ReasoningContent).To(Equal("thinking..."))
|
||||
})
|
||||
|
||||
It("should handle message with array content containing non-text parts", func() {
|
||||
messages := Messages{
|
||||
{
|
||||
|
||||
@@ -29,18 +29,30 @@ import (
|
||||
//
|
||||
// BACKEND_TEST_MODEL_URL HTTP(S) URL of a model file to download before the test.
|
||||
// BACKEND_TEST_MODEL_FILE Path to an already-available model file (skips download).
|
||||
// BACKEND_TEST_MODEL_NAME HuggingFace model id (e.g. "Qwen/Qwen2.5-0.5B-Instruct").
|
||||
// Passed verbatim as ModelOptions.Model; backends like vllm
|
||||
// resolve it themselves and no local file is downloaded.
|
||||
//
|
||||
// Optional:
|
||||
//
|
||||
// BACKEND_TEST_CAPS Comma-separated list of capabilities to exercise.
|
||||
// Supported values: health, load, predict, stream, embeddings.
|
||||
// Supported values: health, load, predict, stream,
|
||||
// embeddings, tools.
|
||||
// Defaults to "health,load,predict,stream".
|
||||
// A backend that only does embeddings would set this to
|
||||
// "health,load,embeddings"; an image/TTS backend that cannot
|
||||
// be driven by a text prompt can set it to "health,load".
|
||||
// "tools" asks the backend to extract a tool call from the
|
||||
// model output into ChatDelta.tool_calls.
|
||||
// BACKEND_TEST_PROMPT Override the prompt used by predict/stream specs.
|
||||
// BACKEND_TEST_CTX_SIZE Override the context size passed to LoadModel (default 512).
|
||||
// BACKEND_TEST_THREADS Override Threads passed to LoadModel (default 4).
|
||||
// BACKEND_TEST_OPTIONS Comma-separated Options[] entries passed to LoadModel,
|
||||
// e.g. "tool_parser:hermes,reasoning_parser:qwen3".
|
||||
// BACKEND_TEST_TOOL_PROMPT Override the user prompt for the tools spec
|
||||
// (default: "What's the weather like in Paris, France?").
|
||||
// BACKEND_TEST_TOOL_NAME Override the function name expected in the tool call
|
||||
// (default: "get_weather").
|
||||
//
|
||||
// The suite is intentionally model-format-agnostic: it only ever passes the
|
||||
// file path to LoadModel, so GGUF, ONNX, safetensors, .bin etc. all work so
|
||||
@@ -51,9 +63,12 @@ const (
|
||||
capPredict = "predict"
|
||||
capStream = "stream"
|
||||
capEmbeddings = "embeddings"
|
||||
capTools = "tools"
|
||||
|
||||
defaultPrompt = "The capital of France is"
|
||||
streamPrompt = "Once upon a time"
|
||||
defaultPrompt = "The capital of France is"
|
||||
streamPrompt = "Once upon a time"
|
||||
defaultToolPrompt = "What's the weather like in Paris, France?"
|
||||
defaultToolName = "get_weather"
|
||||
)
|
||||
|
||||
func defaultCaps() map[string]bool {
|
||||
@@ -87,12 +102,14 @@ var _ = Describe("Backend container", Ordered, func() {
|
||||
caps map[string]bool
|
||||
workDir string
|
||||
binaryDir string
|
||||
modelFile string
|
||||
modelFile string // set when a local file is used
|
||||
modelName string // set when a HuggingFace model id is used
|
||||
addr string
|
||||
serverCmd *exec.Cmd
|
||||
conn *grpc.ClientConn
|
||||
client pb.BackendClient
|
||||
prompt string
|
||||
options []string
|
||||
)
|
||||
|
||||
BeforeAll(func() {
|
||||
@@ -101,8 +118,9 @@ var _ = Describe("Backend container", Ordered, func() {
|
||||
|
||||
modelURL := os.Getenv("BACKEND_TEST_MODEL_URL")
|
||||
modelFile = os.Getenv("BACKEND_TEST_MODEL_FILE")
|
||||
Expect(modelURL != "" || modelFile != "").To(BeTrue(),
|
||||
"one of BACKEND_TEST_MODEL_URL or BACKEND_TEST_MODEL_FILE must be set")
|
||||
modelName = os.Getenv("BACKEND_TEST_MODEL_NAME")
|
||||
Expect(modelURL != "" || modelFile != "" || modelName != "").To(BeTrue(),
|
||||
"one of BACKEND_TEST_MODEL_URL, BACKEND_TEST_MODEL_FILE, or BACKEND_TEST_MODEL_NAME must be set")
|
||||
|
||||
caps = parseCaps()
|
||||
GinkgoWriter.Printf("Testing image=%q with capabilities=%v\n", image, keys(caps))
|
||||
@@ -112,6 +130,15 @@ var _ = Describe("Backend container", Ordered, func() {
|
||||
prompt = defaultPrompt
|
||||
}
|
||||
|
||||
if raw := strings.TrimSpace(os.Getenv("BACKEND_TEST_OPTIONS")); raw != "" {
|
||||
for _, opt := range strings.Split(raw, ",") {
|
||||
opt = strings.TrimSpace(opt)
|
||||
if opt != "" {
|
||||
options = append(options, opt)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var err error
|
||||
workDir, err = os.MkdirTemp("", "backend-e2e-*")
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
@@ -122,8 +149,8 @@ var _ = Describe("Backend container", Ordered, func() {
|
||||
extractImage(image, binaryDir)
|
||||
Expect(filepath.Join(binaryDir, "run.sh")).To(BeAnExistingFile())
|
||||
|
||||
// Download the model once if not provided.
|
||||
if modelFile == "" {
|
||||
// Download the model once if not provided and no HF name given.
|
||||
if modelFile == "" && modelName == "" {
|
||||
modelFile = filepath.Join(workDir, "model.bin")
|
||||
downloadFile(modelURL, modelFile)
|
||||
}
|
||||
@@ -196,16 +223,27 @@ var _ = Describe("Backend container", Ordered, func() {
|
||||
ctxSize := envInt32("BACKEND_TEST_CTX_SIZE", 512)
|
||||
threads := envInt32("BACKEND_TEST_THREADS", 4)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
||||
// Prefer a HuggingFace model id when provided (e.g. for vllm);
|
||||
// otherwise fall back to a downloaded/local file path.
|
||||
modelRef := modelFile
|
||||
var modelPath string
|
||||
if modelName != "" {
|
||||
modelRef = modelName
|
||||
} else {
|
||||
modelPath = modelFile
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
|
||||
defer cancel()
|
||||
res, err := client.LoadModel(ctx, &pb.ModelOptions{
|
||||
Model: modelFile,
|
||||
ModelFile: modelFile,
|
||||
Model: modelRef,
|
||||
ModelFile: modelPath,
|
||||
ContextSize: ctxSize,
|
||||
Threads: threads,
|
||||
NGPULayers: 0,
|
||||
MMap: true,
|
||||
NBatch: 128,
|
||||
Options: options,
|
||||
})
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
Expect(res.GetSuccess()).To(BeTrue(), "LoadModel failed: %s", res.GetMessage())
|
||||
@@ -275,6 +313,78 @@ var _ = Describe("Backend container", Ordered, func() {
|
||||
Expect(res.GetEmbeddings()).NotTo(BeEmpty(), "Embedding returned empty vector")
|
||||
GinkgoWriter.Printf("Embedding: %d dims\n", len(res.GetEmbeddings()))
|
||||
})
|
||||
|
||||
It("extracts tool calls into ChatDelta", func() {
|
||||
if !caps[capTools] {
|
||||
Skip("tools capability not enabled")
|
||||
}
|
||||
|
||||
toolPrompt := os.Getenv("BACKEND_TEST_TOOL_PROMPT")
|
||||
if toolPrompt == "" {
|
||||
toolPrompt = defaultToolPrompt
|
||||
}
|
||||
toolName := os.Getenv("BACKEND_TEST_TOOL_NAME")
|
||||
if toolName == "" {
|
||||
toolName = defaultToolName
|
||||
}
|
||||
|
||||
toolsJSON := fmt.Sprintf(`[{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": %q,
|
||||
"description": "Get the current weather for a location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA"
|
||||
}
|
||||
},
|
||||
"required": ["location"]
|
||||
}
|
||||
}
|
||||
}]`, toolName)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
||||
defer cancel()
|
||||
res, err := client.Predict(ctx, &pb.PredictOptions{
|
||||
Messages: []*pb.Message{
|
||||
{Role: "system", Content: "You are a helpful assistant. Use the provided tool when the user asks about weather."},
|
||||
{Role: "user", Content: toolPrompt},
|
||||
},
|
||||
Tools: toolsJSON,
|
||||
ToolChoice: "auto",
|
||||
UseTokenizerTemplate: true,
|
||||
Tokens: 200,
|
||||
Temperature: 0.1,
|
||||
})
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
|
||||
// Collect tool calls from every delta — some backends emit a single
|
||||
// final delta, others stream incremental pieces in one Reply.
|
||||
var toolCalls []*pb.ToolCallDelta
|
||||
for _, delta := range res.GetChatDeltas() {
|
||||
toolCalls = append(toolCalls, delta.GetToolCalls()...)
|
||||
}
|
||||
|
||||
GinkgoWriter.Printf("Tool call: raw=%q deltas=%d tool_calls=%d\n",
|
||||
string(res.GetMessage()), len(res.GetChatDeltas()), len(toolCalls))
|
||||
|
||||
Expect(toolCalls).NotTo(BeEmpty(),
|
||||
"Predict did not return any ToolCallDelta. raw=%q", string(res.GetMessage()))
|
||||
|
||||
matched := false
|
||||
for _, tc := range toolCalls {
|
||||
GinkgoWriter.Printf(" - idx=%d id=%q name=%q args=%q\n",
|
||||
tc.GetIndex(), tc.GetId(), tc.GetName(), tc.GetArguments())
|
||||
if tc.GetName() == toolName {
|
||||
matched = true
|
||||
}
|
||||
}
|
||||
Expect(matched).To(BeTrue(),
|
||||
"Expected a tool call named %q in ChatDelta.tool_calls", toolName)
|
||||
})
|
||||
})
|
||||
|
||||
// extractImage runs `docker create` + `docker export` to materialise the image
|
||||
|
||||
Reference in New Issue
Block a user