diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index d89ee06bf..12dcc85f1 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -53,6 +53,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2204' + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-cpu-vllm' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'true' + backend: "vllm" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: '' cuda-major-version: "" cuda-minor-version: "" diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml index 6b590d156..afeebea82 100644 --- a/.github/workflows/test-extra.yml +++ b/.github/workflows/test-extra.yml @@ -31,6 +31,7 @@ jobs: llama-cpp-quantization: ${{ steps.detect.outputs.llama-cpp-quantization }} llama-cpp: ${{ steps.detect.outputs.llama-cpp }} ik-llama-cpp: ${{ steps.detect.outputs.ik-llama-cpp }} + vllm: ${{ steps.detect.outputs.vllm }} acestep-cpp: ${{ steps.detect.outputs.acestep-cpp }} qwen3-tts-cpp: ${{ steps.detect.outputs.qwen3-tts-cpp }} voxtral: ${{ steps.detect.outputs.voxtral }} @@ -501,6 +502,52 @@ jobs: - name: Build ik-llama-cpp backend image and run gRPC e2e tests run: | make test-extra-backend-ik-llama-cpp + # tests-vllm-grpc is currently disabled in CI. + # + # The prebuilt vllm CPU wheel is compiled with AVX-512 VNNI/BF16 + # instructions, and neither ubuntu-latest nor the bigger-runner pool + # offers a stable CPU baseline that supports them — runners come + # back with different hardware between runs and SIGILL on import of + # vllm.model_executor.models.registry. Compiling vllm from source + # via FROM_SOURCE=true works on any CPU but takes 30-50 minutes per + # run, which is too slow for a smoke test. 
+ # + # The test itself (tests/e2e-backends + make test-extra-backend-vllm) + # is fully working and validated locally on a host with the right + # SIMD baseline. Run it manually with: + # + # make test-extra-backend-vllm + # + # Re-enable this job once we have a self-hosted runner label with + # guaranteed AVX-512 VNNI/BF16 support, or once the vllm project + # publishes a CPU wheel with a wider baseline. + # + # tests-vllm-grpc: + # needs: detect-changes + # if: needs.detect-changes.outputs.vllm == 'true' || needs.detect-changes.outputs.run-all == 'true' + # runs-on: bigger-runner + # timeout-minutes: 90 + # steps: + # - name: Clone + # uses: actions/checkout@v6 + # with: + # submodules: true + # - name: Dependencies + # run: | + # sudo apt-get update + # sudo apt-get install -y --no-install-recommends \ + # make build-essential curl unzip ca-certificates git tar + # - name: Setup Go + # uses: actions/setup-go@v5 + # with: + # go-version: '1.25.4' + # - name: Free disk space + # run: | + # sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /opt/hostedtoolcache/CodeQL || true + # df -h + # - name: Build vllm (cpu) backend image and run gRPC e2e tests + # run: | + # make test-extra-backend-vllm tests-acestep-cpp: needs: detect-changes if: needs.detect-changes.outputs.acestep-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true' diff --git a/Makefile b/Makefile index 6dce83efd..7e2e35052 100644 --- a/Makefile +++ b/Makefile @@ -466,8 +466,14 @@ test-extra: prepare-test-extra ## BACKEND_IMAGE Required. Docker image to test, e.g. local-ai-backend:llama-cpp. ## BACKEND_TEST_MODEL_URL URL of a model file to download and load. ## BACKEND_TEST_MODEL_FILE Path to an already-downloaded model (skips download). +## BACKEND_TEST_MODEL_NAME HuggingFace repo id (e.g. Qwen/Qwen2.5-0.5B-Instruct). +## Use this instead of MODEL_URL for backends that +## resolve HF model ids natively (vllm, vllm-omni). 
## BACKEND_TEST_CAPS Comma-separated capabilities, default "health,load,predict,stream". +## Add "tools" to exercise ChatDelta tool call extraction. ## BACKEND_TEST_PROMPT Override the prompt used in predict/stream specs. +## BACKEND_TEST_OPTIONS Comma-separated Options[] entries forwarded to LoadModel, +## e.g. "tool_parser:hermes,reasoning_parser:qwen3". ## ## Direct usage (image already built, no docker-build-* dependency): ## @@ -486,9 +492,13 @@ test-extra-backend: protogen-go BACKEND_IMAGE="$$BACKEND_IMAGE" \ BACKEND_TEST_MODEL_URL="$${BACKEND_TEST_MODEL_URL:-$(BACKEND_TEST_MODEL_URL)}" \ BACKEND_TEST_MODEL_FILE="$$BACKEND_TEST_MODEL_FILE" \ + BACKEND_TEST_MODEL_NAME="$$BACKEND_TEST_MODEL_NAME" \ BACKEND_TEST_CAPS="$$BACKEND_TEST_CAPS" \ BACKEND_TEST_PROMPT="$$BACKEND_TEST_PROMPT" \ - go test -v -timeout 15m ./tests/e2e-backends/... + BACKEND_TEST_OPTIONS="$$BACKEND_TEST_OPTIONS" \ + BACKEND_TEST_TOOL_PROMPT="$$BACKEND_TEST_TOOL_PROMPT" \ + BACKEND_TEST_TOOL_NAME="$$BACKEND_TEST_TOOL_NAME" \ + go test -v -timeout 30m ./tests/e2e-backends/... ## Convenience wrappers: build the image, then exercise it. test-extra-backend-llama-cpp: docker-build-llama-cpp @@ -497,6 +507,18 @@ test-extra-backend-llama-cpp: docker-build-llama-cpp test-extra-backend-ik-llama-cpp: docker-build-ik-llama-cpp BACKEND_IMAGE=local-ai-backend:ik-llama-cpp $(MAKE) test-extra-backend +## vllm is resolved from a HuggingFace model id (no file download) and +## exercises Predict + streaming + tool-call extraction via the hermes parser. +## Requires a host CPU with the SIMD instructions the prebuilt vllm CPU +## wheel was compiled against (AVX-512 VNNI/BF16); older CPUs will SIGILL +## on import. Hosted CI runners do not guarantee that baseline, so the CI job is disabled. 
+test-extra-backend-vllm: docker-build-vllm + BACKEND_IMAGE=local-ai-backend:vllm \ + BACKEND_TEST_MODEL_NAME=Qwen/Qwen2.5-0.5B-Instruct \ + BACKEND_TEST_CAPS=health,load,predict,stream,tools \ + BACKEND_TEST_OPTIONS=tool_parser:hermes \ + $(MAKE) test-extra-backend + DOCKER_IMAGE?=local-ai IMAGE_TYPE?=core BASE_IMAGE?=ubuntu:24.04 @@ -650,6 +672,7 @@ define docker-build-backend --build-arg CUDA_MINOR_VERSION=$(CUDA_MINOR_VERSION) \ --build-arg UBUNTU_VERSION=$(UBUNTU_VERSION) \ --build-arg UBUNTU_CODENAME=$(UBUNTU_CODENAME) \ + $(if $(FROM_SOURCE),--build-arg FROM_SOURCE=$(FROM_SOURCE)) \ $(if $(filter true,$(5)),--build-arg BACKEND=$(1)) \ -t local-ai-backend:$(1) -f backend/Dockerfile.$(2) $(3) endef diff --git a/backend/Dockerfile.python b/backend/Dockerfile.python index 5d2e6171e..f3bcf8d34 100644 --- a/backend/Dockerfile.python +++ b/backend/Dockerfile.python @@ -29,6 +29,7 @@ RUN apt-get update && \ curl python3-pip \ python-is-python3 \ python3-dev llvm \ + libnuma1 libgomp1 \ python3-venv make cmake && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -195,6 +196,12 @@ COPY backend/backend.proto /${BACKEND}/backend.proto COPY backend/python/common/ /${BACKEND}/common COPY scripts/build/package-gpu-libs.sh /package-gpu-libs.sh +# Optional per-backend source build toggle (e.g. vllm on CPU can set +# FROM_SOURCE=true to compile against the build host SIMD instead of +# pulling a prebuilt wheel). Default empty — most backends ignore it. 
+ARG FROM_SOURCE="" +ENV FROM_SOURCE=${FROM_SOURCE} + RUN cd /${BACKEND} && PORTABLE_PYTHON=true make # Package GPU libraries into the backend's lib directory diff --git a/backend/index.yaml b/backend/index.yaml index a1f5688a8..d0f75a4ca 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -197,6 +197,7 @@ amd: "rocm-vllm" intel: "intel-vllm" nvidia-cuda-12: "cuda12-vllm" + cpu: "cpu-vllm" - &vllm-omni name: "vllm-omni" license: apache-2.0 @@ -1563,6 +1564,7 @@ nvidia: "cuda12-vllm-development" amd: "rocm-vllm-development" intel: "intel-vllm-development" + cpu: "cpu-vllm-development" - !!merge <<: *vllm name: "cuda12-vllm" uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-vllm" @@ -1578,6 +1580,11 @@ uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-vllm" mirrors: - localai/localai-backends:latest-gpu-intel-vllm +- !!merge <<: *vllm + name: "cpu-vllm" + uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-vllm" + mirrors: + - localai/localai-backends:latest-cpu-vllm - !!merge <<: *vllm name: "cuda12-vllm-development" uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-vllm" @@ -1593,6 +1600,11 @@ uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-vllm" mirrors: - localai/localai-backends:master-gpu-intel-vllm +- !!merge <<: *vllm + name: "cpu-vllm-development" + uri: "quay.io/go-skynet/local-ai-backends:master-cpu-vllm" + mirrors: + - localai/localai-backends:master-cpu-vllm # vllm-omni - !!merge <<: *vllm-omni name: "vllm-omni-development" diff --git a/backend/python/common/vllm_utils.py b/backend/python/common/vllm_utils.py new file mode 100644 index 000000000..bc0518663 --- /dev/null +++ b/backend/python/common/vllm_utils.py @@ -0,0 +1,84 @@ +"""Shared utilities for vLLM-based backends.""" +import json +import sys + + +def parse_options(options_list): + """Parse Options[] list of 'key:value' strings into a dict. + + Supports type inference for common cases (bool, int, float). 
+ Used by LoadModel to extract backend-specific options. + """ + opts = {} + for opt in options_list: + if ":" not in opt: + continue + key, value = opt.split(":", 1) + key = key.strip() + value = value.strip() + # Try type conversion + if value.lower() in ("true", "false"): + opts[key] = value.lower() == "true" + else: + try: + opts[key] = int(value) + except ValueError: + try: + opts[key] = float(value) + except ValueError: + opts[key] = value + return opts + + +def messages_to_dicts(proto_messages): + """Convert proto Message objects to list of dicts for apply_chat_template(). + + Handles: role, content, name, tool_call_id, reasoning_content, tool_calls (JSON string -> list). + """ + result = [] + for msg in proto_messages: + d = {"role": msg.role, "content": msg.content or ""} + if msg.name: + d["name"] = msg.name + if msg.tool_call_id: + d["tool_call_id"] = msg.tool_call_id + if msg.reasoning_content: + d["reasoning_content"] = msg.reasoning_content + if msg.tool_calls: + try: + d["tool_calls"] = json.loads(msg.tool_calls) + except json.JSONDecodeError: + pass + result.append(d) + return result + + +def setup_parsers(opts): + """Return (tool_parser_cls, reasoning_parser_cls) tuple from opts dict. + + Uses vLLM's native ToolParserManager and ReasoningParserManager. + Returns (None, None) if vLLM is not installed or parsers not available. 
+ """ + tool_parser_cls = None + reasoning_parser_cls = None + + tool_parser_name = opts.get("tool_parser") + reasoning_parser_name = opts.get("reasoning_parser") + + if tool_parser_name: + try: + from vllm.tool_parsers import ToolParserManager + tool_parser_cls = ToolParserManager.get_tool_parser(tool_parser_name) + print(f"[vllm_utils] Loaded tool_parser: {tool_parser_name}", file=sys.stderr) + except Exception as e: + print(f"[vllm_utils] Failed to load tool_parser {tool_parser_name}: {e}", file=sys.stderr) + + if reasoning_parser_name: + try: + from vllm.reasoning import ReasoningParserManager + reasoning_parser_cls = ReasoningParserManager.get_reasoning_parser(reasoning_parser_name) + print(f"[vllm_utils] Loaded reasoning_parser: {reasoning_parser_name}", file=sys.stderr) + except Exception as e: + print(f"[vllm_utils] Failed to load reasoning_parser {reasoning_parser_name}: {e}", file=sys.stderr) + + return tool_parser_cls, reasoning_parser_cls diff --git a/backend/python/vllm-omni/backend.py b/backend/python/vllm-omni/backend.py index 96eb8a111..646af2a2e 100644 --- a/backend/python/vllm-omni/backend.py +++ b/backend/python/vllm-omni/backend.py @@ -17,6 +17,8 @@ import time import os import base64 import io +import json +import gc from PIL import Image import torch @@ -30,6 +32,7 @@ import grpc sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common')) sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common')) from grpc_auth import get_auth_interceptors +from vllm_utils import parse_options, messages_to_dicts, setup_parsers from vllm_omni.entrypoints.omni import Omni @@ -148,23 +151,20 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): def LoadModel(self, request, context): try: + # CPU detection: if no CUDA, default vLLM target device to CPU. 
+ try: + if not torch.cuda.is_available(): + os.environ.setdefault("VLLM_TARGET_DEVICE", "cpu") + os.environ.setdefault("VLLM_CPU_KVCACHE_SPACE", "4") + except Exception: + pass + print(f"Loading model {request.Model}...", file=sys.stderr) print(f"Request {request}", file=sys.stderr) - # Parse options from request.Options (key:value pairs) - self.options = {} - for opt in request.Options: - if ":" not in opt: - continue - key, value = opt.split(":", 1) - # Convert value to appropriate type - if is_float(value): - value = float(value) - elif is_int(value): - value = int(value) - elif value.lower() in ["true", "false"]: - value = value.lower() == "true" - self.options[key] = value + # Parse options from request.Options using shared helper + self.options = parse_options(request.Options) + opts = self.options print(f"Options: {self.options}", file=sys.stderr) @@ -244,6 +244,24 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): omni_kwargs["max_model_len"] = request.MaxModelLen self.omni = Omni(**omni_kwargs) + + # Load tokenizer for LLM/TTS so chat templates work + if self.model_type in ("llm", "tts"): + try: + from vllm.transformers_utils.tokenizer import get_tokenizer + self.tokenizer = get_tokenizer( + request.Model, + trust_remote_code=opts.get("trust_remote_code", False), + ) + except Exception as e: + print(f"Failed to load tokenizer: {e}", file=sys.stderr) + self.tokenizer = None + else: + self.tokenizer = None + + # Setup optional tool / reasoning parsers + self.tool_parser_cls, self.reasoning_parser_cls = setup_parsers(opts) + print("Model loaded successfully", file=sys.stderr) return backend_pb2.Result(message="Model loaded successfully", success=True) @@ -466,14 +484,32 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): # Extract prompt if request.Prompt: prompt = request.Prompt - elif request.Messages and request.UseTokenizerTemplate: - # Build prompt from messages (simplified - would need tokenizer for full template) - prompt = "" - for 
msg in request.Messages: - role = msg.role - content = msg.content - prompt += f"<|im_start|>{role}\n{content}<|im_end|>\n" - prompt += "<|im_start|>assistant\n" + elif request.Messages: + if getattr(self, "tokenizer", None) is not None: + messages_dicts = messages_to_dicts(request.Messages) + template_kwargs = {"tokenize": False, "add_generation_prompt": True} + if request.Tools: + try: + template_kwargs["tools"] = json.loads(request.Tools) + except json.JSONDecodeError: + pass + try: + if request.Metadata.get("enable_thinking", "").lower() == "true": + template_kwargs["enable_thinking"] = True + except Exception: + pass + try: + prompt = self.tokenizer.apply_chat_template(messages_dicts, **template_kwargs) + except TypeError: + prompt = self.tokenizer.apply_chat_template( + messages_dicts, tokenize=False, add_generation_prompt=True + ) + else: + # Fallback: basic template + prompt = "" + for msg in request.Messages: + prompt += f"<|im_start|>{msg.role}\n{msg.content}<|im_end|>\n" + prompt += "<|im_start|>assistant\n" else: yield backend_pb2.Reply(message=bytes("", 'utf-8')) return @@ -539,20 +575,79 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): # Call omni.generate() (returns generator for LLM mode) omni_generator = self.omni.generate([inputs], sampling_params_list) - # Extract text from outputs + # Extract text from outputs and track token usage generated_text = "" + prompt_tokens = 0 + completion_tokens = 0 for stage_outputs in omni_generator: if stage_outputs.final_output_type == "text": for output in stage_outputs.request_output: - text_output = output.outputs[0].text + completion = output.outputs[0] + text_output = completion.text + # Track tokens when available + try: + if getattr(output, "prompt_token_ids", None) is not None: + prompt_tokens = len(output.prompt_token_ids) + if getattr(completion, "token_ids", None) is not None: + completion_tokens = len(completion.token_ids) + except Exception: + pass if streaming: # Remove already sent text 
(vllm concatenates) delta_text = text_output.removeprefix(generated_text) - yield backend_pb2.Reply(message=bytes(delta_text, encoding='utf-8')) + yield backend_pb2.Reply( + message=bytes(delta_text, encoding='utf-8'), + tokens=completion_tokens, + prompt_tokens=prompt_tokens, + ) generated_text = text_output if not streaming: - yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8')) + # Build optional ChatDelta with parsed reasoning / tool calls + chat_deltas = [] + content_text = generated_text + reasoning_text = "" + tool_call_deltas = [] + + if self.reasoning_parser_cls is not None: + try: + parser = self.reasoning_parser_cls(self.tokenizer) if self.tokenizer else self.reasoning_parser_cls() + reasoning_text, content_text = (getattr(parser, "extract_reasoning", None) or parser.extract_reasoning_content)(content_text, request=None) # prefer extract_reasoning (used by the vllm backend); fall back to the older method name + reasoning_text = reasoning_text or "" + content_text = content_text or "" + except Exception as e: + print(f"reasoning_parser failed: {e}", file=sys.stderr) + + if self.tool_parser_cls is not None: + try: + parser = self.tool_parser_cls(self.tokenizer) if self.tokenizer else self.tool_parser_cls() + tool_info = parser.extract_tool_calls(content_text, request=None) + if getattr(tool_info, "tools_called", False): + content_text = tool_info.content or "" + for i, tc in enumerate(tool_info.tool_calls or []): # positional index so multiple tool calls do not all collapse onto index 0 + fn = getattr(tc, "function", None) + tool_call_deltas.append(backend_pb2.ToolCallDelta( + index=i, + id=getattr(tc, "id", "") or "", + name=getattr(fn, "name", "") if fn else "", + arguments=getattr(fn, "arguments", "") if fn else "", + )) + except Exception as e: + print(f"tool_parser failed: {e}", file=sys.stderr) + + if self.tool_parser_cls is not None or self.reasoning_parser_cls is not None: + chat_deltas.append(backend_pb2.ChatDelta( + content=content_text, + reasoning_content=reasoning_text, + tool_calls=tool_call_deltas, + )) + + yield backend_pb2.Reply( + message=bytes(generated_text, encoding='utf-8'), + tokens=completion_tokens, + 
prompt_tokens=prompt_tokens, + chat_deltas=chat_deltas, + ) except Exception as err: print(f"Error in Predict: {err}", file=sys.stderr) @@ -647,6 +742,37 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): traceback.print_exc() return backend_pb2.Result(success=False, message=f"Error generating TTS: {err}") + def TokenizeString(self, request, context): + if not hasattr(self, 'tokenizer') or self.tokenizer is None: + context.set_code(grpc.StatusCode.FAILED_PRECONDITION) + context.set_details("Model/tokenizer not loaded") + return backend_pb2.TokenizationResponse() + try: + tokens = self.tokenizer.encode(request.Prompt) + return backend_pb2.TokenizationResponse(length=len(tokens), tokens=tokens) + except Exception as e: + context.set_code(grpc.StatusCode.INTERNAL) + context.set_details(str(e)) + return backend_pb2.TokenizationResponse() + + def Free(self, request, context): + try: + if hasattr(self, 'omni'): + del self.omni + if hasattr(self, 'tokenizer'): + del self.tokenizer + self.tool_parser_cls = None + self.reasoning_parser_cls = None + gc.collect() + try: + if torch.cuda.is_available(): + torch.cuda.empty_cache() + except Exception: + pass + return backend_pb2.Result(success=True, message="Model freed") + except Exception as e: + return backend_pb2.Result(success=False, message=str(e)) + def serve(address): server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py index 07323c424..95ae95a9d 100644 --- a/backend/python/vllm/backend.py +++ b/backend/python/vllm/backend.py @@ -5,6 +5,9 @@ import argparse import signal import sys import os +import json +import time +import gc from typing import List from PIL import Image @@ -26,6 +29,25 @@ from vllm.assets.video import VideoAsset import base64 import io +# Version-compat imports — wrap in try/except for older vLLM versions +try: + from vllm.tool_parsers import ToolParserManager + HAS_TOOL_PARSERS = True +except 
ImportError: + HAS_TOOL_PARSERS = False + +try: + from vllm.reasoning import ReasoningParserManager + HAS_REASONING_PARSERS = True +except ImportError: + HAS_REASONING_PARSERS = False + +try: + from vllm.sampling_params import GuidedDecodingParams + HAS_GUIDED_DECODING = True +except ImportError: + HAS_GUIDED_DECODING = False + _ONE_DAY_IN_SECONDS = 60 * 60 * 24 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1 @@ -69,6 +91,35 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): break return decoded_text + def _parse_options(self, options_list): + """Parse Options[] key:value string list into a dict.""" + opts = {} + for opt in options_list: + if ":" not in opt: + continue + key, value = opt.split(":", 1) + opts[key.strip()] = value.strip() + return opts + + def _messages_to_dicts(self, messages): + """Convert proto Messages to list of dicts suitable for apply_chat_template().""" + result = [] + for msg in messages: + d = {"role": msg.role, "content": msg.content or ""} + if msg.name: + d["name"] = msg.name + if msg.tool_call_id: + d["tool_call_id"] = msg.tool_call_id + if msg.reasoning_content: + d["reasoning_content"] = msg.reasoning_content + if msg.tool_calls: + try: + d["tool_calls"] = json.loads(msg.tool_calls) + except json.JSONDecodeError: + pass + result.append(d) + return result + def Health(self, request, context): """ Returns a health check message. 
@@ -132,15 +183,49 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") try: - engine_model_config = await self.llm.get_model_config() - self.tokenizer = get_tokenizer( - engine_model_config.tokenizer, - tokenizer_mode=engine_model_config.tokenizer_mode, - trust_remote_code=engine_model_config.trust_remote_code, - truncation_side="left", - ) + # vLLM >= 0.14 removed get_model_config() on AsyncLLM; the tokenizer + # is either already loaded on the engine or can be built from the + # Model name directly. + tokenizer = None + if hasattr(self.llm, "get_tokenizer"): + try: + tokenizer = await self.llm.get_tokenizer() + except TypeError: + tokenizer = self.llm.get_tokenizer() + except Exception: + tokenizer = None + if tokenizer is None and hasattr(self.llm, "tokenizer"): + tokenizer = self.llm.tokenizer + if tokenizer is None: + tokenizer = get_tokenizer( + request.Model, + trust_remote_code=bool(request.TrustRemoteCode), + truncation_side="left", + ) + self.tokenizer = tokenizer except Exception as err: return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") + + # Parse options for parser selection + opts = self._parse_options(request.Options) + + # Resolve tool/reasoning parser classes (instances are created per-request with the tokenizer) + self.tool_parser_cls = None + self.reasoning_parser_cls = None + if HAS_TOOL_PARSERS and opts.get("tool_parser"): + try: + self.tool_parser_cls = ToolParserManager.get_tool_parser(opts["tool_parser"]) + print(f"Loaded tool_parser: {opts['tool_parser']}", file=sys.stderr) + except Exception as e: + print(f"Failed to load tool_parser {opts.get('tool_parser')}: {e}", file=sys.stderr) + + if HAS_REASONING_PARSERS and opts.get("reasoning_parser"): + try: + self.reasoning_parser_cls = ReasoningParserManager.get_reasoning_parser(opts["reasoning_parser"]) + print(f"Loaded reasoning_parser: {opts['reasoning_parser']}", 
file=sys.stderr) + except Exception as e: + print(f"Failed to load reasoning_parser {opts.get('reasoning_parser')}: {e}", file=sys.stderr) + print("Model loaded successfully", file=sys.stderr) return backend_pb2.Result(message="Model loaded successfully", success=True) @@ -197,6 +282,38 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): finally: await iterations.aclose() + async def TokenizeString(self, request, context): + if not hasattr(self, 'tokenizer') or self.tokenizer is None: + context.set_code(grpc.StatusCode.FAILED_PRECONDITION) + context.set_details("Model/tokenizer not loaded") + return backend_pb2.TokenizationResponse() + try: + tokens = self.tokenizer.encode(request.Prompt) + return backend_pb2.TokenizationResponse(length=len(tokens), tokens=tokens) + except Exception as e: + context.set_code(grpc.StatusCode.INTERNAL) + context.set_details(str(e)) + return backend_pb2.TokenizationResponse() + + async def Free(self, request, context): + try: + if hasattr(self, 'llm'): + del self.llm + if hasattr(self, 'tokenizer'): + del self.tokenizer + self.tool_parser_cls = None + self.reasoning_parser_cls = None + gc.collect() + try: + import torch + if torch.cuda.is_available(): + torch.cuda.empty_cache() + except ImportError: + pass + return backend_pb2.Result(success=True, message="Model freed") + except Exception as e: + return backend_pb2.Result(success=False, message=str(e)) + async def _predict(self, request, context, streaming=False): # Build the sampling parameters # NOTE: this must stay in sync with the vllm backend @@ -222,7 +339,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): "SkipSpecialTokens": "skip_special_tokens", "SpacesBetweenSpecialTokens": "spaces_between_special_tokens", "TruncatePromptTokens": "truncate_prompt_tokens", - "GuidedDecoding": "guided_decoding", } sampling_params = SamplingParams(top_p=0.9, max_tokens=200) @@ -233,6 +349,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): if value not in (None, 0, 
[], False): setattr(sampling_params, param_field, value) + # Guided decoding: use Grammar field to pass JSON schema or BNF + if HAS_GUIDED_DECODING and request.Grammar: + try: + json.loads(request.Grammar) # valid JSON = JSON schema + sampling_params.guided_decoding = GuidedDecodingParams(json=request.Grammar) + except json.JSONDecodeError: + sampling_params.guided_decoding = GuidedDecodingParams(grammar=request.Grammar) + # Extract image paths and process images prompt = request.Prompt @@ -244,7 +368,27 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template if not request.Prompt and request.UseTokenizerTemplate and request.Messages: - prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True) + messages_dicts = self._messages_to_dicts(request.Messages) + template_kwargs = {"tokenize": False, "add_generation_prompt": True} + + # Pass tools for tool calling + if request.Tools: + try: + template_kwargs["tools"] = json.loads(request.Tools) + except json.JSONDecodeError: + pass + + # Enable thinking mode if requested + if request.Metadata.get("enable_thinking", "").lower() == "true": + template_kwargs["enable_thinking"] = True + + try: + prompt = self.tokenizer.apply_chat_template(messages_dicts, **template_kwargs) + except TypeError: + # Some tokenizers don't support tools/enable_thinking kwargs — retry without them + prompt = self.tokenizer.apply_chat_template( + messages_dicts, tokenize=False, add_generation_prompt=True + ) # Generate text using the LLM engine request_id = random_uuid() @@ -265,25 +409,26 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): # Stream the results generated_text = "" + last_output = None try: async for request_output in outputs: iteration_text = request_output.outputs[0].text + last_output = request_output if streaming: # Remove text already sent as vllm concatenates the 
text from previous yields delta_iteration_text = iteration_text.removeprefix(generated_text) # Send the partial result - yield backend_pb2.Reply(message=bytes(delta_iteration_text, encoding='utf-8')) + yield backend_pb2.Reply( + message=bytes(delta_iteration_text, encoding='utf-8'), + chat_deltas=[backend_pb2.ChatDelta(content=delta_iteration_text)], + ) # Keep track of text generated generated_text = iteration_text finally: await outputs.aclose() - # If streaming, we already sent everything - if streaming: - return - # Remove the image files from /tmp folder for img_path in image_paths: try: @@ -291,8 +436,99 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): except Exception as e: print(f"Error removing image file: {img_path}, {e}", file=sys.stderr) - # Sending the final generated text - yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8')) + # Parse reasoning and tool calls from final text using vLLM's native parsers + content = generated_text + reasoning_content = "" + tool_calls_proto = [] + + if self.reasoning_parser_cls: + try: + rp = self.reasoning_parser_cls(self.tokenizer) + r, c = rp.extract_reasoning(generated_text, request=None) + reasoning_content = r or "" + content = c if c is not None else generated_text + except Exception as e: + print(f"Reasoning parser error: {e}", file=sys.stderr) + + if self.tool_parser_cls and request.Tools: + try: + tools = json.loads(request.Tools) + # Some concrete parsers only accept the tokenizer; only the + # abstract base declares the tools kwarg. Try with tools first, + # fall back to tokenizer-only. 
+ try: + tp = self.tool_parser_cls(self.tokenizer, tools=tools) + except TypeError: + tp = self.tool_parser_cls(self.tokenizer) + info = tp.extract_tool_calls(content, request=None) + if info.tools_called: + content = info.content or "" + for i, tc in enumerate(info.tool_calls): + tool_calls_proto.append(backend_pb2.ToolCallDelta( + index=i, + id=tc.id, + name=tc.function.name, + arguments=tc.function.arguments, + )) + except Exception as e: + print(f"Tool parser error: {e}", file=sys.stderr) + + # Extract token counts + prompt_tokens = 0 + completion_tokens = 0 + if last_output is not None: + try: + prompt_tokens = len(last_output.prompt_token_ids or []) + except Exception: + pass + try: + completion_tokens = len(last_output.outputs[0].token_ids or []) + except Exception: + pass + + # Extract logprobs if requested + logprobs_bytes = b"" + if last_output is not None and request.Logprobs > 0: + try: + lp = last_output.outputs[0].logprobs + if lp: + logprobs_data = {"content": []} + for token_lp_dict in lp: + if token_lp_dict: + first_tok_id, first_lp = next(iter(token_lp_dict.items())) + logprobs_data["content"].append({ + "token": getattr(first_lp, "decoded_token", str(first_tok_id)), + "logprob": first_lp.logprob, + }) + logprobs_bytes = json.dumps(logprobs_data).encode("utf-8") + except Exception as e: + print(f"Logprobs extraction error: {e}", file=sys.stderr) + + chat_delta = backend_pb2.ChatDelta( + content=content, + reasoning_content=reasoning_content, + tool_calls=tool_calls_proto, + ) + + if streaming: + # Final chunk with structured data + yield backend_pb2.Reply( + message=b"", + prompt_tokens=prompt_tokens, + tokens=completion_tokens, + chat_deltas=[chat_delta], + logprobs=logprobs_bytes, + ) + return + + # Non-streaming: single Reply with everything + yield backend_pb2.Reply( + message=bytes(content, encoding='utf-8'), + prompt_tokens=prompt_tokens, + tokens=completion_tokens, + chat_deltas=[chat_delta], + logprobs=logprobs_bytes, + ) def 
load_image(self, image_path: str): """ diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh index 7dcd29db4..cf6fa7efe 100755 --- a/backend/python/vllm/install.sh +++ b/backend/python/vllm/install.sh @@ -26,20 +26,43 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match" fi -# We don't embed this into the images as it is a large dependency and not always needed. -# Besides, the speed inference are not actually usable in the current state for production use-cases. -if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then - ensureVenv - # https://docs.vllm.ai/en/v0.6.1/getting_started/cpu-installation.html - if [ ! -d vllm ]; then - git clone https://github.com/vllm-project/vllm - fi - pushd vllm - uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.68.1 protobuf bitsandbytes - uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu - VLLM_TARGET_DEVICE=cpu python setup.py install - popd - rm -rf vllm - else - installRequirements +# CPU builds need unsafe-best-match to pull torch==2.10.0+cpu from the +# pytorch test channel while still resolving transformers/vllm from pypi. +if [ "x${BUILD_PROFILE}" == "xcpu" ]; then + EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match" +fi + +# FROM_SOURCE=true on a CPU build skips the prebuilt vllm wheel in +# requirements-cpu-after.txt and compiles vllm locally against the host's +# actual CPU. Not used by default because it takes ~30-40 minutes, but +# kept here for hosts where the prebuilt wheel SIGILLs (CPU without the +# required SIMD baseline, e.g. AVX-512 VNNI/BF16). The CI e2e job is +# disabled because no hosted runner guarantees that baseline. 
+if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then + # Temporarily hide the prebuilt wheel so installRequirements doesn't + # pull it — the rest of the requirements files (base deps, torch, + # transformers) are still installed normally. + _cpu_after="${backend_dir}/requirements-cpu-after.txt" + _cpu_after_bak="" + if [ -f "${_cpu_after}" ]; then + _cpu_after_bak="${_cpu_after}.from-source.bak" + mv "${_cpu_after}" "${_cpu_after_bak}" + fi + installRequirements + if [ -n "${_cpu_after_bak}" ]; then + mv "${_cpu_after_bak}" "${_cpu_after}" + fi + + # Build vllm from source against the installed torch. + # https://docs.vllm.ai/en/latest/getting_started/installation/cpu/ + _vllm_src=$(mktemp -d) + trap 'rm -rf "${_vllm_src}"' EXIT + git clone --depth 1 https://github.com/vllm-project/vllm "${_vllm_src}/vllm" + pushd "${_vllm_src}/vllm" + uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm + # Respect pre-installed torch version — skip vllm's own requirements-build.txt torch pin. + VLLM_TARGET_DEVICE=cpu uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} --no-deps . + popd +else + installRequirements fi diff --git a/backend/python/vllm/package.sh b/backend/python/vllm/package.sh new file mode 100755 index 000000000..3c4ba8c19 --- /dev/null +++ b/backend/python/vllm/package.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Script to package runtime shared libraries for the vllm backend. +# +# The final Dockerfile.python stage is FROM scratch, so system libraries +# must be explicitly copied into ${BACKEND}/lib so the backend can run on +# any host without installing them. libbackend.sh automatically adds that +# directory to LD_LIBRARY_PATH at run time. +# +# vllm's CPU C++ extension (vllm._C) dlopens libnuma.so.1 at import time; +# if it's missing, the _C_utils torch ops are never registered and the +# engine crashes with AttributeError on init_cpu_threads_env. 
libgomp is +# used by torch's CPU kernels; on some stripped-down hosts it's also +# absent, so we bundle it too. + +set -e + +CURDIR=$(dirname "$(realpath "$0")") +LIB_DIR="${CURDIR}/lib" +mkdir -p "${LIB_DIR}" + +copy_with_symlinks() { + local soname="$1" + local hit="" + for dir in /usr/lib/x86_64-linux-gnu /usr/lib/aarch64-linux-gnu /lib/x86_64-linux-gnu /lib/aarch64-linux-gnu /usr/lib /lib; do + if [ -e "${dir}/${soname}" ]; then + hit="${dir}/${soname}" + break + fi + done + if [ -z "${hit}" ]; then + echo "warning: ${soname} not found in standard lib paths" >&2 + return 0 + fi + # Follow the symlink to the real file, copy it, then recreate the symlink. + local real + real=$(readlink -f "${hit}") + cp -v "${real}" "${LIB_DIR}/" + local real_base + real_base=$(basename "${real}") + if [ "${real_base}" != "${soname}" ]; then + ln -sf "${real_base}" "${LIB_DIR}/${soname}" + fi +} + +copy_with_symlinks libnuma.so.1 +copy_with_symlinks libgomp.so.1 + +echo "vllm packaging completed successfully" +ls -liah "${LIB_DIR}/" diff --git a/backend/python/vllm/requirements-after.txt b/backend/python/vllm/requirements-after.txt index 76f11f154..b5000e6ca 100644 --- a/backend/python/vllm/requirements-after.txt +++ b/backend/python/vllm/requirements-after.txt @@ -1 +1,2 @@ -vllm \ No newline at end of file +# vllm is installed per-acceleration in requirements-{profile}-after.txt +# (cublas12, hipblas, intel, cpu) diff --git a/backend/python/vllm/requirements-cpu-after.txt b/backend/python/vllm/requirements-cpu-after.txt new file mode 100644 index 000000000..e5e4908f7 --- /dev/null +++ b/backend/python/vllm/requirements-cpu-after.txt @@ -0,0 +1,2 @@ +vllm @ https://github.com/vllm-project/vllm/releases/download/v0.14.1/vllm-0.14.1+cpu-cp38-abi3-manylinux_2_35_x86_64.whl ; platform_machine == "x86_64" +vllm @ https://github.com/vllm-project/vllm/releases/download/v0.14.1/vllm-0.14.1+cpu-cp38-abi3-manylinux_2_35_aarch64.whl ; platform_machine == "aarch64" diff --git 
a/backend/python/vllm/requirements-cpu.txt b/backend/python/vllm/requirements-cpu.txt index 16c7cbac5..5eeb8a708 100644 --- a/backend/python/vllm/requirements-cpu.txt +++ b/backend/python/vllm/requirements-cpu.txt @@ -1,3 +1,6 @@ +--extra-index-url https://download.pytorch.org/whl/cpu accelerate -torch==2.7.0 -transformers \ No newline at end of file +torch==2.9.1+cpu +torchvision +torchaudio +transformers diff --git a/backend/python/vllm/requirements-cublas12-after.txt b/backend/python/vllm/requirements-cublas12-after.txt index 9251ba608..cab27c888 100644 --- a/backend/python/vllm/requirements-cublas12-after.txt +++ b/backend/python/vllm/requirements-cublas12-after.txt @@ -1 +1,2 @@ https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.7cxx11abiTRUE-cp310-cp310-linux_x86_64.whl +vllm diff --git a/backend/python/vllm/requirements-hipblas-after.txt b/backend/python/vllm/requirements-hipblas-after.txt new file mode 100644 index 000000000..e7a6c7781 --- /dev/null +++ b/backend/python/vllm/requirements-hipblas-after.txt @@ -0,0 +1 @@ +vllm diff --git a/backend/python/vllm/requirements-intel-after.txt b/backend/python/vllm/requirements-intel-after.txt new file mode 100644 index 000000000..e7a6c7781 --- /dev/null +++ b/backend/python/vllm/requirements-intel-after.txt @@ -0,0 +1 @@ +vllm diff --git a/backend/python/vllm/test.py b/backend/python/vllm/test.py index 827aa71a3..21aaf4cf7 100644 --- a/backend/python/vllm/test.py +++ b/backend/python/vllm/test.py @@ -122,6 +122,89 @@ class TestBackendServicer(unittest.TestCase): self.tearDown() + def test_messages_to_dicts(self): + """ + Tests _messages_to_dicts conversion of proto Messages to dicts. 
+ """ + import sys, os + sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + from backend import BackendServicer + servicer = BackendServicer() + msgs = [ + backend_pb2.Message(role="user", content="hello"), + backend_pb2.Message( + role="assistant", + content="", + tool_calls='[{"id":"call_1","type":"function","function":{"name":"foo","arguments":"{}"}}]', + reasoning_content="thinking...", + ), + backend_pb2.Message(role="tool", content="result", name="foo", tool_call_id="call_1"), + ] + result = servicer._messages_to_dicts(msgs) + self.assertEqual(len(result), 3) + self.assertEqual(result[0], {"role": "user", "content": "hello"}) + self.assertEqual(result[1]["reasoning_content"], "thinking...") + self.assertIsInstance(result[1]["tool_calls"], list) + self.assertEqual(result[1]["tool_calls"][0]["id"], "call_1") + self.assertEqual(result[2]["tool_call_id"], "call_1") + self.assertEqual(result[2]["name"], "foo") + + def test_parse_options(self): + """ + Tests _parse_options correctly parses key:value strings. + """ + import sys, os + sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + from backend import BackendServicer + servicer = BackendServicer() + opts = servicer._parse_options([ + "tool_parser:hermes", + "reasoning_parser:deepseek_r1", + "invalid_no_colon", + "key_with_colons:a:b:c", + ]) + self.assertEqual(opts["tool_parser"], "hermes") + self.assertEqual(opts["reasoning_parser"], "deepseek_r1") + self.assertEqual(opts["key_with_colons"], "a:b:c") + self.assertNotIn("invalid_no_colon", opts) + + def test_tokenize_string(self): + """ + Tests the TokenizeString RPC returns valid tokens. 
+ """ + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m")) + self.assertTrue(response.success) + resp = stub.TokenizeString(backend_pb2.PredictOptions(Prompt="Hello world")) + self.assertGreater(resp.length, 0) + self.assertEqual(len(resp.tokens), resp.length) + except Exception as err: + print(err) + self.fail("TokenizeString service failed") + finally: + self.tearDown() + + def test_free(self): + """ + Tests the Free RPC doesn't crash. + """ + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m")) + self.assertTrue(response.success) + free_resp = stub.Free(backend_pb2.HealthMessage()) + self.assertTrue(free_resp.success) + except Exception as err: + print(err) + self.fail("Free service failed") + finally: + self.tearDown() + def test_embedding(self): """ This method tests if the embeddings are generated successfully diff --git a/core/config/backend_hooks.go b/core/config/backend_hooks.go new file mode 100644 index 000000000..8b2403cbb --- /dev/null +++ b/core/config/backend_hooks.go @@ -0,0 +1,30 @@ +package config + +// BackendDefaultsHook is called from SetDefaults() and may modify cfg. +// Hooks are expected to fill in only values the user has not already set. +type BackendDefaultsHook func(cfg *ModelConfig, modelPath string) + +var backendHooks = map[string][]BackendDefaultsHook{} + +// RegisterBackendHook registers a hook for a backend name. +// Special keys: +// - "*" = global catch-all, runs for EVERY backend (before specific hooks) +// - "" = runs only when cfg.Backend is empty (auto-detect case) +// - "vllm", "llama-cpp" etc. = runs only for that specific backend +// +// Multiple hooks per key are supported; they run in registration order. 
+func RegisterBackendHook(backend string, hook BackendDefaultsHook) { + backendHooks[backend] = append(backendHooks[backend], hook) +} + +// runBackendHooks executes hooks in order: +// 1. "*" (global) hooks for every backend +// 2. Backend-specific hooks for cfg.Backend (includes "" when backend is empty) +func runBackendHooks(cfg *ModelConfig, modelPath string) { + for _, h := range backendHooks["*"] { + h(cfg, modelPath) + } + for _, h := range backendHooks[cfg.Backend] { + h(cfg, modelPath) + } +} diff --git a/core/config/guesser.go b/core/config/guesser.go deleted file mode 100644 index e4ca5b141..000000000 --- a/core/config/guesser.go +++ /dev/null @@ -1,46 +0,0 @@ -package config - -import ( - "os" - "path/filepath" - - gguf "github.com/gpustack/gguf-parser-go" - "github.com/mudler/xlog" -) - -func guessDefaultsFromFile(cfg *ModelConfig, modelPath string, defaultCtx int) { - if os.Getenv("LOCALAI_DISABLE_GUESSING") == "true" { - xlog.Debug("guessDefaultsFromFile: guessing disabled with LOCALAI_DISABLE_GUESSING") - return - } - - if modelPath == "" { - xlog.Debug("guessDefaultsFromFile: modelPath is empty") - return - } - - // We try to guess only if we don't have a template defined already - guessPath := filepath.Join(modelPath, cfg.ModelFileName()) - - defer func() { - if r := recover(); r != nil { - xlog.Error("guessDefaultsFromFile: panic while parsing gguf file") - } - }() - - defer func() { - if cfg.ContextSize == nil { - if defaultCtx == 0 { - defaultCtx = defaultContextSize - } - cfg.ContextSize = &defaultCtx - } - }() - - // try to parse the gguf file - f, err := gguf.ParseGGUFFile(guessPath) - if err == nil { - guessGGUFFromFile(cfg, f, defaultCtx) - return - } -} diff --git a/core/config/hooks_llamacpp.go b/core/config/hooks_llamacpp.go new file mode 100644 index 000000000..7c2640cee --- /dev/null +++ b/core/config/hooks_llamacpp.go @@ -0,0 +1,46 @@ +package config + +import ( + "os" + "path/filepath" + + gguf "github.com/gpustack/gguf-parser-go" + 
"github.com/mudler/xlog" +) + +func init() { + // Register for both explicit llama-cpp and empty backend (auto-detect from GGUF file) + RegisterBackendHook("llama-cpp", llamaCppDefaults) + RegisterBackendHook("", llamaCppDefaults) +} + +func llamaCppDefaults(cfg *ModelConfig, modelPath string) { + if os.Getenv("LOCALAI_DISABLE_GUESSING") == "true" { + xlog.Debug("llamaCppDefaults: guessing disabled") + return + } + if modelPath == "" { + return + } + + guessPath := filepath.Join(modelPath, cfg.ModelFileName()) + + defer func() { + if r := recover(); r != nil { + xlog.Error("llamaCppDefaults: panic while parsing gguf file") + } + }() + + // Default context size if not set, regardless of whether GGUF parsing succeeds + defer func() { + if cfg.ContextSize == nil { + ctx := defaultContextSize + cfg.ContextSize = &ctx + } + }() + + f, err := gguf.ParseGGUFFile(guessPath) + if err == nil { + guessGGUFFromFile(cfg, f, 0) + } +} diff --git a/core/config/hooks_test.go b/core/config/hooks_test.go new file mode 100644 index 000000000..b97077564 --- /dev/null +++ b/core/config/hooks_test.go @@ -0,0 +1,114 @@ +package config_test + +import ( + . "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/schema" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("Backend hooks and parser defaults", func() { + Context("MatchParserDefaults", func() { + It("matches Qwen3 family", func() { + parsers := MatchParserDefaults("Qwen/Qwen3-8B") + Expect(parsers).NotTo(BeNil()) + Expect(parsers["tool_parser"]).To(Equal("hermes")) + Expect(parsers["reasoning_parser"]).To(Equal("qwen3")) + }) + + It("matches Qwen3.5 with longest-prefix-first", func() { + parsers := MatchParserDefaults("Qwen/Qwen3.5-9B") + Expect(parsers).NotTo(BeNil()) + Expect(parsers["tool_parser"]).To(Equal("qwen3_xml")) + }) + + It("matches Llama-3.3 not Llama-3.2", func() { + parsers := MatchParserDefaults("meta/Llama-3.3-70B-Instruct") + Expect(parsers).NotTo(BeNil()) + Expect(parsers["tool_parser"]).To(Equal("llama3_json")) + }) + + It("matches deepseek-r1", func() { + parsers := MatchParserDefaults("deepseek-ai/DeepSeek-R1") + Expect(parsers).NotTo(BeNil()) + Expect(parsers["reasoning_parser"]).To(Equal("deepseek_r1")) + Expect(parsers["tool_parser"]).To(Equal("deepseek_v3")) + }) + + It("returns nil for unknown families", func() { + Expect(MatchParserDefaults("acme/unknown-model-xyz")).To(BeNil()) + }) + }) + + Context("Backend hook registration and execution", func() { + It("runs registered hook for a backend", func() { + called := false + RegisterBackendHook("test-backend-hook", func(cfg *ModelConfig, modelPath string) { + called = true + cfg.Description = "modified-by-hook" + }) + + cfg := &ModelConfig{ + Backend: "test-backend-hook", + } + // runBackendHooks is unexported, so the hook is exercised through the + // public SetDefaults entry point, which runs the hooks registered for + // cfg.Backend. No model path or load options are supplied. + cfg.PredictionOptions = schema.PredictionOptions{} + + // Trigger via SetDefaults with no options; this calls runBackendHooks internally. 
+ cfg.SetDefaults() + Expect(called).To(BeTrue()) + Expect(cfg.Description).To(Equal("modified-by-hook")) + }) + }) + + Context("vllmDefaults hook", func() { + It("auto-sets parsers for known model families on vllm backend", func() { + cfg := &ModelConfig{ + Backend: "vllm", + PredictionOptions: schema.PredictionOptions{ + BasicModelRequest: schema.BasicModelRequest{ + Model: "Qwen/Qwen3-8B", + }, + }, + } + cfg.SetDefaults() + + foundTool := false + foundReasoning := false + for _, opt := range cfg.Options { + if opt == "tool_parser:hermes" { + foundTool = true + } + if opt == "reasoning_parser:qwen3" { + foundReasoning = true + } + } + Expect(foundTool).To(BeTrue()) + Expect(foundReasoning).To(BeTrue()) + }) + + It("does not override user-set tool_parser", func() { + cfg := &ModelConfig{ + Backend: "vllm", + Options: []string{"tool_parser:custom"}, + PredictionOptions: schema.PredictionOptions{ + BasicModelRequest: schema.BasicModelRequest{ + Model: "Qwen/Qwen3-8B", + }, + }, + } + cfg.SetDefaults() + + count := 0 + for _, opt := range cfg.Options { + if len(opt) >= len("tool_parser:") && opt[:len("tool_parser:")] == "tool_parser:" { + count++ + } + } + Expect(count).To(Equal(1)) + }) + }) +}) diff --git a/core/config/hooks_vllm.go b/core/config/hooks_vllm.go new file mode 100644 index 000000000..3f7abd9b3 --- /dev/null +++ b/core/config/hooks_vllm.go @@ -0,0 +1,85 @@ +package config + +import ( + _ "embed" + "encoding/json" + "strings" + + "github.com/mudler/xlog" +) + +//go:embed parser_defaults.json +var parserDefaultsJSON []byte + +type parserDefaultsData struct { + Families map[string]map[string]string `json:"families"` + Patterns []string `json:"patterns"` +} + +var parsersData *parserDefaultsData + +func init() { + parsersData = &parserDefaultsData{} + if err := json.Unmarshal(parserDefaultsJSON, parsersData); err != nil { + xlog.Warn("failed to parse parser_defaults.json", "error", err) + } + + RegisterBackendHook("vllm", vllmDefaults) + 
RegisterBackendHook("vllm-omni", vllmDefaults) +} + +// MatchParserDefaults returns parser defaults for the best-matching model family. +// Returns nil if no family matches. Used both at load time (via hook) and at import time. +func MatchParserDefaults(modelID string) map[string]string { + if parsersData == nil || len(parsersData.Patterns) == 0 { + return nil + } + normalized := normalizeModelID(modelID) + for _, pattern := range parsersData.Patterns { + if strings.Contains(normalized, pattern) { + if family, ok := parsersData.Families[pattern]; ok { + return family + } + } + } + return nil +} + +func vllmDefaults(cfg *ModelConfig, modelPath string) { + // Check if user already set tool_parser or reasoning_parser in Options + hasToolParser := false + hasReasoningParser := false + for _, opt := range cfg.Options { + if strings.HasPrefix(opt, "tool_parser:") { + hasToolParser = true + } + if strings.HasPrefix(opt, "reasoning_parser:") { + hasReasoningParser = true + } + } + if hasToolParser && hasReasoningParser { + return + } + + // Try matching against Model field, then Name + parsers := MatchParserDefaults(cfg.Model) + if parsers == nil { + parsers = MatchParserDefaults(cfg.Name) + } + if parsers == nil { + return + } + + if !hasToolParser { + if tp, ok := parsers["tool_parser"]; ok { + cfg.Options = append(cfg.Options, "tool_parser:"+tp) + xlog.Debug("[parser_defaults] auto-set tool_parser", "parser", tp, "model", cfg.Model) + } + } + if !hasReasoningParser { + if rp, ok := parsers["reasoning_parser"]; ok { + cfg.Options = append(cfg.Options, "reasoning_parser:"+rp) + xlog.Debug("[parser_defaults] auto-set reasoning_parser", "parser", rp, "model", cfg.Model) + } + } +} diff --git a/core/config/model_config.go b/core/config/model_config.go index 5f1780b76..4185d4f3f 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -497,7 +497,12 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) { cfg.Debug = &trueV } - 
guessDefaultsFromFile(cfg, lo.modelPath, ctx) + // If a context size was provided via LoadOptions, apply it before hooks so they + // don't override it with their own defaults. + if ctx != 0 && cfg.ContextSize == nil { + cfg.ContextSize = &ctx + } + runBackendHooks(cfg, lo.modelPath) cfg.syncKnownUsecasesFromString() } diff --git a/core/config/parser_defaults.json b/core/config/parser_defaults.json new file mode 100644 index 000000000..614e6df1e --- /dev/null +++ b/core/config/parser_defaults.json @@ -0,0 +1,33 @@ +{ + "families": { + "qwen3.5": {"tool_parser": "qwen3_xml", "reasoning_parser": "qwen3"}, + "qwen3-coder": {"tool_parser": "qwen3_xml", "reasoning_parser": "qwen3"}, + "qwen3": {"tool_parser": "hermes", "reasoning_parser": "qwen3"}, + "qwen2.5": {"tool_parser": "hermes"}, + "qwq": {"reasoning_parser": "deepseek_r1"}, + "llama-4": {"tool_parser": "llama4_pythonic"}, + "llama-3.3": {"tool_parser": "llama3_json"}, + "llama-3.2": {"tool_parser": "llama3_json"}, + "llama-3.1": {"tool_parser": "llama3_json"}, + "mistral-nemo": {"tool_parser": "mistral", "reasoning_parser": "mistral"}, + "mistral-small": {"tool_parser": "mistral", "reasoning_parser": "mistral"}, + "mistral-large": {"tool_parser": "mistral", "reasoning_parser": "mistral"}, + "magistral": {"tool_parser": "mistral", "reasoning_parser": "mistral"}, + "deepseek-r1": {"tool_parser": "deepseek_v3", "reasoning_parser": "deepseek_r1"}, + "deepseek-v3": {"tool_parser": "deepseek_v3", "reasoning_parser": "deepseek_v3"}, + "glm-5": {"tool_parser": "glm47"}, + "glm-4": {"tool_parser": "glm45", "reasoning_parser": "glm45"}, + "gemma-4": {"tool_parser": "gemma4", "reasoning_parser": "gemma4"}, + "granite-4": {"tool_parser": "granite4", "reasoning_parser": "granite"}, + "minimax-m2.5": {"tool_parser": "minimax_m2", "reasoning_parser": "minimax_m2"}, + "minimax": {"tool_parser": "minimax_m2", "reasoning_parser": "minimax_m2"}, + "kimi-k2": {"tool_parser": "kimi_k2", "reasoning_parser": "kimi_k2"}, + "nemotron": 
{"reasoning_parser": "nemotron_v3"}, + "olmo": {"tool_parser": "olmo3", "reasoning_parser": "olmo3"}, + "ernie": {"tool_parser": "ernie45", "reasoning_parser": "ernie45"}, + "phi-4": {"tool_parser": "phi4_mini_json"}, + "gpt-oss": {"tool_parser": "openai", "reasoning_parser": "openai_gptoss"}, + "hermes": {"tool_parser": "hermes"} + }, + "patterns": ["qwen3.5","qwen3-coder","qwen3","qwen2.5","qwq","llama-4","llama-3.3","llama-3.2","llama-3.1","mistral-nemo","mistral-small","mistral-large","magistral","deepseek-r1","deepseek-v3","glm-5","glm-4","gemma-4","granite-4","minimax-m2.5","minimax","kimi-k2","nemotron","olmo","ernie","phi-4","gpt-oss","hermes"] +} diff --git a/core/gallery/importers/vllm.go b/core/gallery/importers/vllm.go index 88baef1fe..886405169 100644 --- a/core/gallery/importers/vllm.go +++ b/core/gallery/importers/vllm.go @@ -88,6 +88,18 @@ func (i *VLLMImporter) Import(details Details) (gallery.ModelConfig, error) { // Apply per-model-family inference parameter defaults config.ApplyInferenceDefaults(&modelConfig, details.URI) + // Auto-detect tool_parser and reasoning_parser for known model families. + // Surfacing them in the generated YAML lets users see and edit the choices. 
+ parsers := config.MatchParserDefaults(details.URI) + if parsers != nil { + if tp, ok := parsers["tool_parser"]; ok { + modelConfig.Options = append(modelConfig.Options, "tool_parser:"+tp) + } + if rp, ok := parsers["reasoning_parser"]; ok { + modelConfig.Options = append(modelConfig.Options, "reasoning_parser:"+rp) + } + } + data, err := yaml.Marshal(modelConfig) if err != nil { return gallery.ModelConfig{}, err diff --git a/core/schema/message.go b/core/schema/message.go index 79a30352e..24407165e 100644 --- a/core/schema/message.go +++ b/core/schema/message.go @@ -83,8 +83,12 @@ func (messages Messages) ToProto() []*proto.Message { } } - // Note: tool_call_id is not in schema.Message yet - // Reasoning field is now available in schema.Message but not yet in proto.Message + if message.ToolCallID != "" { + protoMessages[i].ToolCallId = message.ToolCallID + } + if message.Reasoning != nil { + protoMessages[i].ReasoningContent = *message.Reasoning + } } return protoMessages } diff --git a/core/schema/message_test.go b/core/schema/message_test.go index cd6f514e2..8ebf3fa05 100644 --- a/core/schema/message_test.go +++ b/core/schema/message_test.go @@ -237,6 +237,24 @@ var _ = Describe("LLM tests", func() { Expect(protoMessages[0].Content).To(Equal("")) }) + It("should serialize ToolCallID and Reasoning fields", func() { + reasoning := "thinking..." 
+ messages := Messages{ + { + Role: "tool", + Content: "result", + ToolCallID: "call_123", + Reasoning: &reasoning, + }, + } + + protoMessages := messages.ToProto() + + Expect(protoMessages).To(HaveLen(1)) + Expect(protoMessages[0].ToolCallId).To(Equal("call_123")) + Expect(protoMessages[0].ReasoningContent).To(Equal("thinking...")) + }) + It("should handle message with array content containing non-text parts", func() { messages := Messages{ { diff --git a/tests/e2e-backends/backend_test.go b/tests/e2e-backends/backend_test.go index a800a7ab5..b6f59fd28 100644 --- a/tests/e2e-backends/backend_test.go +++ b/tests/e2e-backends/backend_test.go @@ -29,18 +29,30 @@ import ( // // BACKEND_TEST_MODEL_URL HTTP(S) URL of a model file to download before the test. // BACKEND_TEST_MODEL_FILE Path to an already-available model file (skips download). +// BACKEND_TEST_MODEL_NAME HuggingFace model id (e.g. "Qwen/Qwen2.5-0.5B-Instruct"). +// Passed verbatim as ModelOptions.Model; backends like vllm +// resolve it themselves and no local file is downloaded. // // Optional: // // BACKEND_TEST_CAPS Comma-separated list of capabilities to exercise. -// Supported values: health, load, predict, stream, embeddings. +// Supported values: health, load, predict, stream, +// embeddings, tools. // Defaults to "health,load,predict,stream". // A backend that only does embeddings would set this to // "health,load,embeddings"; an image/TTS backend that cannot // be driven by a text prompt can set it to "health,load". +// "tools" asks the backend to extract a tool call from the +// model output into ChatDelta.tool_calls. // BACKEND_TEST_PROMPT Override the prompt used by predict/stream specs. // BACKEND_TEST_CTX_SIZE Override the context size passed to LoadModel (default 512). // BACKEND_TEST_THREADS Override Threads passed to LoadModel (default 4). +// BACKEND_TEST_OPTIONS Comma-separated Options[] entries passed to LoadModel, +// e.g. "tool_parser:hermes,reasoning_parser:qwen3". 
+// BACKEND_TEST_TOOL_PROMPT Override the user prompt for the tools spec +// (default: "What's the weather like in Paris, France?"). +// BACKEND_TEST_TOOL_NAME Override the function name expected in the tool call +// (default: "get_weather"). // // The suite is intentionally model-format-agnostic: it only ever passes the // file path to LoadModel, so GGUF, ONNX, safetensors, .bin etc. all work so @@ -51,9 +63,12 @@ const ( capPredict = "predict" capStream = "stream" capEmbeddings = "embeddings" + capTools = "tools" - defaultPrompt = "The capital of France is" - streamPrompt = "Once upon a time" + defaultPrompt = "The capital of France is" + streamPrompt = "Once upon a time" + defaultToolPrompt = "What's the weather like in Paris, France?" + defaultToolName = "get_weather" ) func defaultCaps() map[string]bool { @@ -87,12 +102,14 @@ var _ = Describe("Backend container", Ordered, func() { caps map[string]bool workDir string binaryDir string - modelFile string + modelFile string // set when a local file is used + modelName string // set when a HuggingFace model id is used addr string serverCmd *exec.Cmd conn *grpc.ClientConn client pb.BackendClient prompt string + options []string ) BeforeAll(func() { @@ -101,8 +118,9 @@ var _ = Describe("Backend container", Ordered, func() { modelURL := os.Getenv("BACKEND_TEST_MODEL_URL") modelFile = os.Getenv("BACKEND_TEST_MODEL_FILE") - Expect(modelURL != "" || modelFile != "").To(BeTrue(), - "one of BACKEND_TEST_MODEL_URL or BACKEND_TEST_MODEL_FILE must be set") + modelName = os.Getenv("BACKEND_TEST_MODEL_NAME") + Expect(modelURL != "" || modelFile != "" || modelName != "").To(BeTrue(), + "one of BACKEND_TEST_MODEL_URL, BACKEND_TEST_MODEL_FILE, or BACKEND_TEST_MODEL_NAME must be set") caps = parseCaps() GinkgoWriter.Printf("Testing image=%q with capabilities=%v\n", image, keys(caps)) @@ -112,6 +130,15 @@ var _ = Describe("Backend container", Ordered, func() { prompt = defaultPrompt } + if raw := 
strings.TrimSpace(os.Getenv("BACKEND_TEST_OPTIONS")); raw != "" { + for _, opt := range strings.Split(raw, ",") { + opt = strings.TrimSpace(opt) + if opt != "" { + options = append(options, opt) + } + } + } + var err error workDir, err = os.MkdirTemp("", "backend-e2e-*") Expect(err).NotTo(HaveOccurred()) @@ -122,8 +149,8 @@ var _ = Describe("Backend container", Ordered, func() { extractImage(image, binaryDir) Expect(filepath.Join(binaryDir, "run.sh")).To(BeAnExistingFile()) - // Download the model once if not provided. - if modelFile == "" { + // Download the model once if not provided and no HF name given. + if modelFile == "" && modelName == "" { modelFile = filepath.Join(workDir, "model.bin") downloadFile(modelURL, modelFile) } @@ -196,16 +223,27 @@ var _ = Describe("Backend container", Ordered, func() { ctxSize := envInt32("BACKEND_TEST_CTX_SIZE", 512) threads := envInt32("BACKEND_TEST_THREADS", 4) - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + // Prefer a HuggingFace model id when provided (e.g. for vllm); + // otherwise fall back to a downloaded/local file path. 
+ modelRef := modelFile + var modelPath string + if modelName != "" { + modelRef = modelName + } else { + modelPath = modelFile + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) defer cancel() res, err := client.LoadModel(ctx, &pb.ModelOptions{ - Model: modelFile, - ModelFile: modelFile, + Model: modelRef, + ModelFile: modelPath, ContextSize: ctxSize, Threads: threads, NGPULayers: 0, MMap: true, NBatch: 128, + Options: options, }) Expect(err).NotTo(HaveOccurred()) Expect(res.GetSuccess()).To(BeTrue(), "LoadModel failed: %s", res.GetMessage()) @@ -275,6 +313,78 @@ var _ = Describe("Backend container", Ordered, func() { Expect(res.GetEmbeddings()).NotTo(BeEmpty(), "Embedding returned empty vector") GinkgoWriter.Printf("Embedding: %d dims\n", len(res.GetEmbeddings())) }) + + It("extracts tool calls into ChatDelta", func() { + if !caps[capTools] { + Skip("tools capability not enabled") + } + + toolPrompt := os.Getenv("BACKEND_TEST_TOOL_PROMPT") + if toolPrompt == "" { + toolPrompt = defaultToolPrompt + } + toolName := os.Getenv("BACKEND_TEST_TOOL_NAME") + if toolName == "" { + toolName = defaultToolName + } + + toolsJSON := fmt.Sprintf(`[{ + "type": "function", + "function": { + "name": %q, + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA" + } + }, + "required": ["location"] + } + } + }]`, toolName) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + res, err := client.Predict(ctx, &pb.PredictOptions{ + Messages: []*pb.Message{ + {Role: "system", Content: "You are a helpful assistant. 
Use the provided tool when the user asks about weather."}, + {Role: "user", Content: toolPrompt}, + }, + Tools: toolsJSON, + ToolChoice: "auto", + UseTokenizerTemplate: true, + Tokens: 200, + Temperature: 0.1, + }) + Expect(err).NotTo(HaveOccurred()) + + // Collect tool calls from every delta — some backends emit a single + // final delta, others stream incremental pieces in one Reply. + var toolCalls []*pb.ToolCallDelta + for _, delta := range res.GetChatDeltas() { + toolCalls = append(toolCalls, delta.GetToolCalls()...) + } + + GinkgoWriter.Printf("Tool call: raw=%q deltas=%d tool_calls=%d\n", + string(res.GetMessage()), len(res.GetChatDeltas()), len(toolCalls)) + + Expect(toolCalls).NotTo(BeEmpty(), + "Predict did not return any ToolCallDelta. raw=%q", string(res.GetMessage())) + + matched := false + for _, tc := range toolCalls { + GinkgoWriter.Printf(" - idx=%d id=%q name=%q args=%q\n", + tc.GetIndex(), tc.GetId(), tc.GetName(), tc.GetArguments()) + if tc.GetName() == toolName { + matched = true + } + } + Expect(matched).To(BeTrue(), + "Expected a tool call named %q in ChatDelta.tool_calls", toolName) + }) }) // extractImage runs `docker create` + `docker export` to materialise the image