From 3447b28bbd7ab9b073d8fb25961831b272fc921f Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Wed, 24 Jun 2026 17:17:50 +0000
Subject: [PATCH] feat(vllm): macOS/Metal support via vllm-metal (MLX)

Add an additive Apple-Silicon path to the existing vllm Python backend so
vLLM runs on macOS via vllm-metal (github.com/vllm-project/vllm-metal).

Spike outcome (proven on a real M4 / macOS 26.5, Qwen3-0.6B):
- vllm-metal registers through vLLM's platform-plugin entry point
  (metal -> vllm_metal:register); MetalPlatform activates and runs on the
  GPU through MLX.
- LocalAI's backend.py is UNCHANGED: AsyncEngineArgs(...) ->
  AsyncLLMEngine.from_engine_args transparently resolves to vLLM 0.23's v1
  AsyncLLM MLX engine, and async generate produced correct output.
- backend.py is NOT touched: its only empty_cache() call is CUDA-only
  (guarded by torch.cuda.is_available()), so the benign shutdown-only
  "Allocator for mps is not a DeviceAllocator" noise comes from vLLM's
  internal EngineCore teardown, not from our code.

Changes (all gated behind a darwin condition; Linux/CUDA/ROCm/Intel paths
are byte-for-byte unchanged):
- install.sh: darwin branch forces PYTHON_VERSION=3.12 (vllm-metal
  requirement), creates/activates LocalAI's managed venv via ensureVenv,
  then reproduces vllm-metal's installer INTO that venv (build vLLM 0.23.0
  from the release source tarball against requirements/cpu.txt, then
  install the prebuilt vllm-metal wheel from its latest GitHub release),
  and runs runProtogen. installRequirements is skipped on darwin.
- backend-matrix.yml: add a vllm includeDarwin entry (mps, python).
- index.yaml: add metal capability + concrete metal-vllm /
  metal-vllm-development child entries mirroring the metal-kitten-tts
  template.

Version coupling: vllm-metal pins vLLM 0.23.0, equal to LocalAI's current
vllm pin. Bumping vllm must be coordinated with a supporting vllm-metal
release; documented in install.sh and requirements-cublas13-after.txt.

Signed-off-by: Ettore Di Giacinto
Assisted-by: Claude:opus-4.8 [Claude Code]
---
 .github/backend-matrix.yml                    |   7 ++
 backend/index.yaml                            |  12 +++
 backend/python/vllm/install.sh                | 110 +++++++++++++++++-
 .../vllm/requirements-cublas13-after.txt      |   3 +
 4 files changed, 131 insertions(+), 1 deletion(-)

diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml
index 593e44cde..1087b9030 100644
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -4974,6 +4974,13 @@ includeDarwin:
   - backend: "kitten-tts"
     tag-suffix: "-metal-darwin-arm64-kitten-tts"
     build-type: "mps"
+  # vLLM on Apple Silicon via vllm-metal (MLX). The install is custom
+  # (backend/python/vllm/install.sh has a darwin branch); lang stays python so
+  # backend_build_darwin.yml drives it through build-darwin-python-backend ->
+  # scripts/build/python-darwin.sh, which runs the backend's install.sh.
+  - backend: "vllm"
+    tag-suffix: "-metal-darwin-arm64-vllm"
+    build-type: "mps"
   - backend: "piper"
     tag-suffix: "-metal-darwin-arm64-piper"
     build-type: "metal"
diff --git a/backend/index.yaml b/backend/index.yaml
index 3f61f7b4e..38d443e16 100644
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -645,6 +645,7 @@
     nvidia-cuda-13: "cuda13-vllm"
     nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vllm"
     cpu: "cpu-vllm"
+    metal: "metal-vllm"
 - &sglang
   name: "sglang"
   license: apache-2.0
@@ -2927,6 +2928,17 @@
     nvidia-cuda-13: "cuda13-vllm-development"
     nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vllm-development"
     cpu: "cpu-vllm-development"
+    metal: "metal-vllm-development"
+- !!merge <<: *vllm
+  name: "metal-vllm"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-vllm"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-vllm
+- !!merge <<: *vllm
+  name: "metal-vllm-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-vllm"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-vllm
 - !!merge <<: *vllm
   name: "cuda12-vllm"
   uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-vllm"
diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh
index 320ef6772..2b2e74c36 100755
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -43,6 +43,24 @@ if [ "x${BUILD_PROFILE}" == "xcublas13" ]; then
     EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
 fi
 
+# Apple Silicon (Metal/MLX) via vllm-metal.
+# vllm-metal (github.com/vllm-project/vllm-metal) brings vLLM to macOS on Apple
+# Silicon: it registers through vLLM's platform-plugin entry point
+# (metal -> vllm_metal:register), MetalPlatform activates, and the vLLM v1
+# AsyncLLM engine runs on the GPU through MLX. LocalAI's backend.py is UNCHANGED
+# on darwin — AsyncEngineArgs(...) -> AsyncLLMEngine.from_engine_args transparently
+# resolves to the MLX engine (proven on a real M4 / macOS 26.5 against Qwen3-0.6B).
+#
+# vllm-metal REQUIRES Python 3.12, so force the portable CPython before the venv
+# is created (ensureVenv reads PYTHON_VERSION/PYTHON_PATCH/PY_STANDALONE_TAG).
+# The patch + standalone tag mirror the l4t13 cp312 pin — a known-good
+# python-build-standalone release that also ships an aarch64-apple-darwin asset.
+if [ "$(uname -s)" = "Darwin" ]; then
+    PYTHON_VERSION="3.12"
+    PYTHON_PATCH="12"
+    PY_STANDALONE_TAG="20251120"
+fi
+
 # JetPack 7 / L4T arm64 vllm + torch wheels come straight from PyPI now
 # (torch 2.11+ ships aarch64 + cu130 manylinux wheels and vllm 0.20+ ships
 # an aarch64 wheel pinned to that torch). They're cp312-only, so bump the
@@ -57,11 +75,101 @@ if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
     PY_STANDALONE_TAG="20251120"
 fi
 
+# ===================== Apple Silicon (Metal/MLX) =====================
+# Reproduce vllm-metal's upstream installer
+# (curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm-metal/main/install.sh)
+# but INTO LocalAI's managed venv (ensureVenv) instead of a throwaway
+# ~/.venv-vllm-metal, so the backend integrates with LocalAI's venv lifecycle
+# (portable CPython, _makeVenvPortable relocation, runtime activation). The
+# normal CUDA/CPU installRequirements is skipped on darwin — there is no
+# macOS/arm64 vLLM wheel on PyPI; vLLM is built from source and the MLX engine
+# is layered on by the vllm-metal wheel.
+if [ "$(uname -s)" = "Darwin" ]; then
+    # Create/activate the portable 3.12 venv. On darwin USE_PIP=true and
+    # PORTABLE_PYTHON=true (set by scripts/build/python-darwin.sh), so this is a
+    # `python -m venv` based, relocatable venv.
+    ensureVenv
+
+    # vllm-metal's installer drives everything through `uv`: building vLLM from
+    # the CPU requirements needs `--index-strategy unsafe-best-match` (mixes the
+    # pytorch CPU channel with PyPI), a flag plain pip does not have. The darwin
+    # venv is pip-based, so bootstrap uv into it. uv honours $VIRTUAL_ENV (set by
+    # libbackend's _activateVenv) and installs into THIS venv — same pattern the
+    # intel branch below relies on.
+    pip install uv
+
+    # VERSION COUPLING (read before bumping vLLM!): vllm-metal pins this exact
+    # vLLM version and builds against its source tarball. It equals LocalAI's
+    # current vllm pin (see requirements-cublas13-after.txt: vllm==0.23.0). A
+    # vLLM bump on Linux MUST be coordinated with a vllm-metal release that
+    # supports the new version, or darwin builds will break.
+    VLLM_VERSION="0.23.0"
+
+    _vllm_src=$(mktemp -d)
+    trap 'rm -rf "${_vllm_src}"' EXIT
+    pushd "${_vllm_src}"
+        # 1) Build vLLM ${VLLM_VERSION} from the release source tarball against
+        #    the CPU requirements. vllm-metal layers its MLX platform plugin on
+        #    top of this exact build.
+        curl -fsSL -o "vllm-${VLLM_VERSION}.tar.gz" \
+            "https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}.tar.gz"
+        tar -xzf "vllm-${VLLM_VERSION}.tar.gz"
+        pushd "vllm-${VLLM_VERSION}"
+            uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
+            # -Wno-parentheses: clang on macOS treats one of vLLM's C++ warnings
+            # as an error without it (matches the upstream installer's CXXFLAGS).
+            CXXFLAGS="-Wno-parentheses" uv pip install .
+        popd
+    popd
+
+    # 2) Install the prebuilt vllm-metal wheel from its GitHub release.
+    #    It pulls mlx / mlx-metal as deps and registers the `metal` platform
+    #    plugin that backend.py resolves to at engine-init time.
+    #    VLLM_METAL_RELEASE is the GitHub REST API release selector: "latest"
+    #    (default) or "tags/<tag>" to pin a release known to support
+    #    vLLM ${VLLM_VERSION}, so rebuilds stay reproducible.
+    VLLM_METAL_RELEASE="${VLLM_METAL_RELEASE:-latest}"
+    # Parse the release JSON with python from the activated venv (stdlib json)
+    # rather than grep/sed, which depends on how the API response is formatted.
+    # A missing or invalid response yields an empty URL, rejected just below.
+    _metal_wheel_url=$(curl -fsSL "https://api.github.com/repos/vllm-project/vllm-metal/releases/${VLLM_METAL_RELEASE}" \
+        | python -c '
+import json, sys
+try:
+    assets = json.load(sys.stdin).get("assets", [])
+except (ValueError, AttributeError):
+    assets = []
+urls = [a.get("browser_download_url", "") for a in assets]
+print(next((u for u in urls if u.endswith(".whl")), ""))
+')
+    if [ -z "${_metal_wheel_url}" ]; then
+        echo "ERROR: could not resolve a vllm-metal wheel URL from the '${VLLM_METAL_RELEASE}' GitHub release" >&2
+        exit 1
+    fi
+    echo "Installing vllm-metal wheel: ${_metal_wheel_url}"
+    uv pip install "${_metal_wheel_url}"
+
+    # Guard the version coupling: resolving vllm-metal's dependencies must not
+    # have replaced the source-built vLLM (a local "+suffix" build tag is fine).
+    _installed_vllm=$(python -c 'from importlib.metadata import version; print(version("vllm"))')
+    case "${_installed_vllm}" in
+        "${VLLM_VERSION}" | "${VLLM_VERSION}+"*) ;;
+        *)
+            echo "ERROR: vllm-metal install left vLLM ${_installed_vllm}, expected ${VLLM_VERSION}" >&2
+            exit 1
+            ;;
+    esac
+
+    # Generate the gRPC stubs (backend_pb2*). installRequirements normally does
+    # this via runProtogen at the end; we skipped installRequirements on darwin,
+    # so call it explicitly here.
+    runProtogen
+
 # Intel XPU has no upstream-published vllm wheels, so we always build vllm
 # from source against torch-xpu and replace the default triton with
 # triton-xpu (matching torch 2.11). Mirrors the upstream procedure:
 # https://github.com/vllm-project/vllm/blob/main/docs/getting_started/installation/gpu.xpu.inc.md
-if [ "x${BUILD_TYPE}" == "xintel" ]; then
+elif [ "x${BUILD_TYPE}" == "xintel" ]; then
     # Hide requirements-intel-after.txt so installRequirements doesn't
     # try `pip install vllm` (would either fail or grab a non-XPU wheel).
     _intel_after="${backend_dir}/requirements-intel-after.txt"
diff --git a/backend/python/vllm/requirements-cublas13-after.txt b/backend/python/vllm/requirements-cublas13-after.txt
index 62c486139..c04a25ab1 100644
--- a/backend/python/vllm/requirements-cublas13-after.txt
+++ b/backend/python/vllm/requirements-cublas13-after.txt
@@ -4,4 +4,7 @@
 # instead — the cublas13 case in install.sh adds --index-strategy=unsafe-best-match
 # so uv consults this index alongside PyPI.
 --extra-index-url https://wheels.vllm.ai/0.23.0/cu130
+# VERSION COUPLING: darwin/Apple-Silicon builds use vllm-metal (see install.sh),
+# which pins this exact vLLM version. Bumping vllm here means coordinating with a
+# vllm-metal release that supports the new version, or macOS/Metal builds break.
 vllm==0.23.0