diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml index f34921db9..5ad6d9e16 100644 --- a/.github/backend-matrix.yml +++ b/.github/backend-matrix.yml @@ -4974,6 +4974,13 @@ includeDarwin: - backend: "kitten-tts" tag-suffix: "-metal-darwin-arm64-kitten-tts" build-type: "mps" + # vLLM on Apple Silicon via vllm-metal (MLX). The install is custom + # (backend/python/vllm/install.sh has a darwin branch); lang stays python so + # backend_build_darwin.yml drives it through build-darwin-python-backend -> + # scripts/build/python-darwin.sh, which runs the backend's install.sh. + - backend: "vllm" + tag-suffix: "-metal-darwin-arm64-vllm" + build-type: "mps" - backend: "trl" tag-suffix: "-metal-darwin-arm64-trl" build-type: "mps" diff --git a/.github/bump_vllm_metal.sh b/.github/bump_vllm_metal.sh new file mode 100755 index 000000000..f842680d5 --- /dev/null +++ b/.github/bump_vllm_metal.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Bump the single vllm-metal pin (VLLM_METAL_VERSION) in the vLLM backend's +# darwin (Apple Silicon) install path. The macOS/Metal build +# (backend/python/vllm/install.sh, Darwin branch) installs vllm-metal, which is +# version-locked to a specific vLLM source release. install.sh derives that vLLM +# version at build time from vllm-metal's own installer (`vllm_v=`) at the pinned +# tag, so there is only ONE value to bump here -- mirroring bump_vllm_wheel.sh, +# which bumps the Linux cu130 wheel pin. +# +# This deliberately tracks vllm-project/vllm-metal, NOT vllm-project/vllm: the +# darwin build can only use the exact vLLM version vllm-metal supports, so it may +# lag the Linux pin (requirements-cublas13-after.txt) until vllm-metal catches up. 
+set -xe +REPO=$1 # vllm-project/vllm-metal +FILE=$2 # backend/python/vllm/install.sh +VAR=$3 # VLLM_METAL_VERSION (used for the workflow's output file names) + +if [ -z "$FILE" ] || [ -z "$REPO" ] || [ -z "$VAR" ]; then + echo "usage: $0 <repo> <file> <var>" >&2 + exit 1 +fi + +# vllm-metal ships frequent dev releases, all flagged as non-prerelease, so +# /releases/latest returns the newest one (with its cp312 wheel asset). +LATEST_TAG=$(curl -sS -H "Accept: application/vnd.github+json" \ + "https://api.github.com/repos/$REPO/releases/latest" \ + | python3 -c "import json,sys; print(json.load(sys.stdin)['tag_name'])") + +# The coupled vLLM source version lives in vllm-metal's installer at that tag. +NEW_VLLM_VERSION=$(curl -fsSL \ + "https://raw.githubusercontent.com/$REPO/$LATEST_TAG/install.sh" \ + | grep -oE 'vllm_v="[0-9]+\.[0-9]+\.[0-9]+"' | head -1 | cut -d'"' -f2) + +if [ -z "$LATEST_TAG" ] || [ -z "$NEW_VLLM_VERSION" ]; then + echo "Could not resolve vllm-metal tag ($LATEST_TAG) or its vllm_v ($NEW_VLLM_VERSION)." >&2 + exit 1 +fi + +set +e +CURRENT_TAG=$(grep -oE 'VLLM_METAL_VERSION="[^"]*"' "$FILE" | head -1 | cut -d'"' -f2) +set -e + +# Rewrite the single pin. install.sh derives VLLM_VERSION from this tag at build +# time, so there is nothing else to touch. peter-evans/create-pull-request opens +# no PR on a clean tree, so a no-op rewrite (already current) is safe. +sed -i "$FILE" \ + -e "s|VLLM_METAL_VERSION=\"[^\"]*\"|VLLM_METAL_VERSION=\"$LATEST_TAG\"|" + +if [ -z "$CURRENT_TAG" ]; then + echo "Could not find VLLM_METAL_VERSION=\"...\" in $FILE." 
>&2 + exit 0 +fi + +echo "vllm-metal ${CURRENT_TAG} -> ${LATEST_TAG} (builds vLLM ${NEW_VLLM_VERSION}): https://github.com/$REPO/releases/tag/${LATEST_TAG}" >> "${VAR}_message.txt" +echo "${LATEST_TAG}" >> "${VAR}_commit.txt" diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml index aa4b21af7..a2c37881f 100644 --- a/.github/workflows/bump_deps.yaml +++ b/.github/workflows/bump_deps.yaml @@ -154,3 +154,39 @@ jobs: branch: "update/VLLM_VERSION" body: ${{ steps.bump.outputs.message }} signoff: true + + bump-vllm-metal: + # The darwin (Apple Silicon) vLLM build installs vllm-metal, which is locked + # to a specific vLLM source release. install.sh pins only VLLM_METAL_VERSION + # and derives the matching vLLM version from that tag at build time, so this + # job tracks vllm-project/vllm-metal and rewrites that single pin. Separate from + # bump-vllm-wheel because darwin follows vllm-metal, not vllm/vllm latest. + if: github.repository == 'mudler/LocalAI' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v7 + - name: Bump vllm-metal pin 🔧 + id: bump + run: | + bash .github/bump_vllm_metal.sh vllm-project/vllm-metal backend/python/vllm/install.sh VLLM_METAL_VERSION + { + echo 'message<<EOF' + cat "VLLM_METAL_VERSION_message.txt" + echo EOF + } >> "$GITHUB_OUTPUT" + { + echo 'commit<<EOF' + cat "VLLM_METAL_VERSION_commit.txt" + echo EOF + } >> "$GITHUB_OUTPUT" + rm -rfv VLLM_METAL_VERSION_message.txt VLLM_METAL_VERSION_commit.txt + - name: Create Pull Request + uses: peter-evans/create-pull-request@v8 + with: + token: ${{ secrets.UPDATE_BOT_TOKEN }} + push-to-fork: ci-forks/LocalAI + commit-message: ':arrow_up: Update vllm-project/vllm-metal (darwin)' + title: 'chore: :arrow_up: Update vllm-metal (darwin) to `${{ steps.bump.outputs.commit }}`' + branch: "update/VLLM_METAL_VERSION" + body: ${{ steps.bump.outputs.message }} + signoff: true diff --git a/backend/index.yaml b/backend/index.yaml index 381aa073b..4a7a07d82 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -645,6 +645,7 @@ nvidia-cuda-13: "cuda13-vllm" nvidia-l4t-cuda-13: 
"cuda13-nvidia-l4t-arm64-vllm" cpu: "cpu-vllm" + metal: "metal-vllm" - &sglang name: "sglang" license: apache-2.0 @@ -2929,6 +2930,17 @@ nvidia-cuda-13: "cuda13-vllm-development" nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vllm-development" cpu: "cpu-vllm-development" + metal: "metal-vllm-development" +- !!merge <<: *vllm + name: "metal-vllm" + uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-vllm" + mirrors: + - localai/localai-backends:latest-metal-darwin-arm64-vllm +- !!merge <<: *vllm + name: "metal-vllm-development" + uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-vllm" + mirrors: + - localai/localai-backends:master-metal-darwin-arm64-vllm - !!merge <<: *vllm name: "cuda12-vllm" uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-vllm" diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py index a38849137..1e93f26e2 100644 --- a/backend/python/vllm/backend.py +++ b/backend/python/vllm/backend.py @@ -457,9 +457,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): except Exception: pass - if last_output is None or not getattr(last_output, "prompt_logprobs", None): - context.set_code(grpc.StatusCode.INTERNAL) - context.set_details("vLLM did not return prompt_logprobs") + _pl = getattr(last_output, "prompt_logprobs", None) if last_output is not None else None + # Some engines accept the prompt_logprobs request but return a + # list of all-None entries instead of computing them (observed + # with vllm-metal's MLX backend on macOS). Treat that as + # unsupported rather than silently scoring every candidate as 0. + if not _pl or all(e is None for e in _pl): + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("This backend did not return prompt_logprobs; scoring is unsupported on this engine (e.g. 
vllm-metal / MLX on macOS).") return backend_pb2.ScoreResponse() prompt_logprobs = last_output.prompt_logprobs diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh index 320ef6772..85c1e97b0 100755 --- a/backend/python/vllm/install.sh +++ b/backend/python/vllm/install.sh @@ -43,6 +43,24 @@ if [ "x${BUILD_PROFILE}" == "xcublas13" ]; then EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match" fi +# Apple Silicon (Metal/MLX) via vllm-metal. +# vllm-metal (github.com/vllm-project/vllm-metal) brings vLLM to macOS on Apple +# Silicon: it registers through vLLM's platform-plugin entry point +# (metal -> vllm_metal:register), MetalPlatform activates, and the vLLM v1 +# AsyncLLM engine runs on the GPU through MLX. LocalAI's backend.py is UNCHANGED +# on darwin — AsyncEngineArgs(...) -> AsyncLLMEngine.from_engine_args transparently +# resolves to the MLX engine (proven on a real M4 / macOS 26.5 against Qwen3-0.6B). +# +# vllm-metal REQUIRES Python 3.12, so force the portable CPython before the venv +# is created (ensureVenv reads PYTHON_VERSION/PYTHON_PATCH/PY_STANDALONE_TAG). +# The patch + standalone tag mirror the l4t13 cp312 pin — a known-good +# python-build-standalone release that also ships an aarch64-apple-darwin asset. +if [ "$(uname -s)" = "Darwin" ]; then + PYTHON_VERSION="3.12" + PYTHON_PATCH="12" + PY_STANDALONE_TAG="20251120" +fi + # JetPack 7 / L4T arm64 vllm + torch wheels come straight from PyPI now # (torch 2.11+ ships aarch64 + cu130 manylinux wheels and vllm 0.20+ ships # an aarch64 wheel pinned to that torch). 
They're cp312-only, so bump the @@ -57,11 +75,87 @@ if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then PY_STANDALONE_TAG="20251120" fi +# ===================== Apple Silicon (Metal/MLX) ===================== +# Reproduce vllm-metal's upstream installer +# (curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm-metal/main/install.sh) +# but INTO LocalAI's managed venv (ensureVenv) instead of a throwaway +# ~/.venv-vllm-metal, so the backend integrates with LocalAI's venv lifecycle +# (portable CPython, _makeVenvPortable relocation, runtime activation). The +# normal CUDA/CPU installRequirements is skipped on darwin — there is no +# macOS/arm64 vLLM wheel on PyPI; vLLM is built from source and the MLX engine +# is layered on by the vllm-metal wheel. +if [ "$(uname -s)" = "Darwin" ]; then + # Create/activate the portable 3.12 venv. On darwin USE_PIP=true and + # PORTABLE_PYTHON=true (set by scripts/build/python-darwin.sh), so this is a + # `python -m venv` based, relocatable venv. + ensureVenv + + # vllm-metal's installer drives everything through `uv`: building vLLM from + # the CPU requirements needs `--index-strategy unsafe-best-match` (mixes the + # pytorch CPU channel with PyPI), a flag plain pip does not have. The darwin + # venv is pip-based, so bootstrap uv into it. uv honours $VIRTUAL_ENV (set by + # libbackend's _activateVenv) and installs into THIS venv — same pattern the + # intel branch below relies on. + pip install uv + + # The ONLY darwin version pin -- AUTO-BUMPED by .github/bump_vllm_metal.sh, + # which tracks vllm-project/vllm-metal releases (NOT vllm/vllm latest). Keep + # it as a plain double-quoted assignment on its own line so the bumper's sed + # can rewrite it. Darwin therefore follows vllm-metal and can lag the Linux + # vllm pin (requirements-cublas13-after.txt, bumped independently against + # vllm/vllm) until vllm-metal supports a newer vLLM. 
+ VLLM_METAL_VERSION="v0.3.0.dev20260622062346" + + # The coupled vLLM source version is whatever this vllm-metal release builds + # against -- it declares it in its own installer as `vllm_v=`. Derive it from + # the PINNED tag rather than hardcoding a second value that could drift. The + # tag is immutable, so this stays reproducible across rebuilds. + VLLM_VERSION=$(curl -fsSL "https://raw.githubusercontent.com/vllm-project/vllm-metal/${VLLM_METAL_VERSION}/install.sh" \ + | grep -oE 'vllm_v="[0-9]+\.[0-9]+\.[0-9]+"' | head -n1 | cut -d'"' -f2) + if [ -z "${VLLM_VERSION}" ]; then + echo "ERROR: could not derive the vLLM version from vllm-metal ${VLLM_METAL_VERSION}" >&2 + exit 1 + fi + echo "vllm-metal ${VLLM_METAL_VERSION} builds against vLLM ${VLLM_VERSION}" + + _vllm_src=$(mktemp -d) + trap 'rm -rf "${_vllm_src}"' EXIT + pushd "${_vllm_src}" + # 1) Build vLLM ${VLLM_VERSION} from the release source tarball against + # the CPU requirements. vllm-metal layers its MLX platform plugin on + # top of this exact build. + curl -fsSL -o "vllm-${VLLM_VERSION}.tar.gz" \ + "https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}.tar.gz" + tar -xzf "vllm-${VLLM_VERSION}.tar.gz" + pushd "vllm-${VLLM_VERSION}" + uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match + # -Wno-parentheses: clang on macOS treats one of vLLM's C++ warnings + # as an error without it (matches the upstream installer's CXXFLAGS). + CXXFLAGS="-Wno-parentheses" uv pip install . + popd + popd + + # 2) Install the prebuilt vllm-metal wheel for the PINNED release. It pulls + # mlx / mlx-metal as deps and registers the `metal` platform plugin that + # backend.py resolves to at engine-init time. Build the release-asset URL + # deterministically (tag + the cp312/arm64 wheel name) rather than querying + # api.github.com, whose unauthenticated rate limit (60/hr per IP) 403s on + # shared CI runners. The wheel version is the tag without its leading 'v'. 
+ _metal_wheel="vllm_metal-${VLLM_METAL_VERSION#v}-cp312-cp312-macosx_11_0_arm64.whl" + _metal_wheel_url="https://github.com/vllm-project/vllm-metal/releases/download/${VLLM_METAL_VERSION}/${_metal_wheel}" + echo "Installing vllm-metal wheel: ${_metal_wheel_url}" + uv pip install "${_metal_wheel_url}" + + # Generate the gRPC stubs (backend_pb2*). installRequirements normally does + # this via runProtogen at the end; we skipped installRequirements on darwin, + # so call it explicitly here. + runProtogen + # Intel XPU has no upstream-published vllm wheels, so we always build vllm # from source against torch-xpu and replace the default triton with # triton-xpu (matching torch 2.11). Mirrors the upstream procedure: # https://github.com/vllm-project/vllm/blob/main/docs/getting_started/installation/gpu.xpu.inc.md -if [ "x${BUILD_TYPE}" == "xintel" ]; then +elif [ "x${BUILD_TYPE}" == "xintel" ]; then # Hide requirements-intel-after.txt so installRequirements doesn't # try `pip install vllm` (would either fail or grab a non-XPU wheel). _intel_after="${backend_dir}/requirements-intel-after.txt" diff --git a/backend/python/vllm/requirements-cublas13-after.txt b/backend/python/vllm/requirements-cublas13-after.txt index 62c486139..c04a25ab1 100644 --- a/backend/python/vllm/requirements-cublas13-after.txt +++ b/backend/python/vllm/requirements-cublas13-after.txt @@ -4,4 +4,7 @@ # instead — the cublas13 case in install.sh adds --index-strategy=unsafe-best-match # so uv consults this index alongside PyPI. --extra-index-url https://wheels.vllm.ai/0.23.0/cu130 +# VERSION NOTE: darwin/Apple-Silicon builds do not consume this pin. They build +# the vLLM release that the pinned vllm-metal tag declares (VLLM_METAL_VERSION in +# install.sh), so macOS/Metal may lag this version until vllm-metal catches up. vllm==0.23.0