Compare commits

..

2 Commits

Author SHA1 Message Date
Ettore Di Giacinto
40daa857c7 fix(trl): guard uv-only --index-strategy for the pip/darwin path
The darwin/MPS build installs with pip (USE_PIP=true), which rejects the
uv-only --index-strategy flag and failed the darwin backend build. Add it
only on the uv path; Linux/CUDA resolution is unchanged.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:opus-4.8 [Claude Code]
2026-06-24 19:55:32 +00:00
Ettore Di Giacinto
c0efc28968 feat(backends): add darwin/metal (MPS) build for trl
Authors backend/python/trl/requirements-mps.txt and wires trl into the
darwin CI matrix and gallery so the MPS training path can be built and
validated on Apple Silicon. The MPS variant installs plain PyPI torch
wheels (MPS-capable on macOS arm64) and the trl training stack; bitsandbytes
is omitted as it is a CUDA-only dependency with poor Apple Silicon support.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:opus-4.8 [Claude Code]
2026-06-24 17:11:34 +00:00
8 changed files with 33 additions and 213 deletions

View File

@@ -4974,12 +4974,8 @@ includeDarwin:
- backend: "kitten-tts"
tag-suffix: "-metal-darwin-arm64-kitten-tts"
build-type: "mps"
# vLLM on Apple Silicon via vllm-metal (MLX). The install is custom
# (backend/python/vllm/install.sh has a darwin branch); lang stays python so
# backend_build_darwin.yml drives it through build-darwin-python-backend ->
# scripts/build/python-darwin.sh, which runs the backend's install.sh.
- backend: "vllm"
tag-suffix: "-metal-darwin-arm64-vllm"
- backend: "trl"
tag-suffix: "-metal-darwin-arm64-trl"
build-type: "mps"
- backend: "piper"
tag-suffix: "-metal-darwin-arm64-piper"

View File

@@ -1,55 +0,0 @@
#!/bin/bash
# Advance VLLM_METAL_VERSION -- the one version pin in the Darwin (Apple
# Silicon) branch of backend/python/vllm/install.sh -- to the newest vllm-metal
# release.
#
# Every vllm-metal release is tied to one vLLM source release. install.sh reads
# that vLLM version at build time from the `vllm_v=` line of vllm-metal's own
# installer at the pinned tag, so a single value is rewritten here. This is the
# counterpart of bump_vllm_wheel.sh, which bumps the Linux cu130 wheel pin.
#
# The repository followed is vllm-project/vllm-metal, NOT vllm-project/vllm, on
# purpose: the darwin build can only use the vLLM version vllm-metal supports,
# so it may trail the Linux pin (requirements-cublas13-after.txt) until
# vllm-metal catches up.
#
# Usage: bump_vllm_metal.sh <repo> <install-file> <var-name>
#   <repo>          GitHub "owner/name", e.g. vllm-project/vllm-metal
#   <install-file>  file holding the pin, e.g. backend/python/vllm/install.sh
#   <var-name>      prefix for the output files read by the workflow
#
# Output: appends a summary line to <var-name>_message.txt and the new tag to
# <var-name>_commit.txt in the current directory.
set -xe

REPO=$1 # vllm-project/vllm-metal
FILE=$2 # backend/python/vllm/install.sh
VAR=$3  # VLLM_METAL_VERSION (used for the workflow's output file names)

# All three positional arguments are mandatory.
for required in "$REPO" "$FILE" "$VAR"; do
  if [ -z "$required" ]; then
    echo "usage: $0 <repo> <install-file> <var-name>" >&2
    exit 1
  fi
done

# Newest release tag. vllm-metal publishes frequent dev releases, all flagged as
# non-prerelease, so /releases/latest yields the most recent one (with its
# cp312 wheel asset).
release_api="https://api.github.com/repos/$REPO/releases/latest"
newest_tag=$(
  curl -sS -H "Accept: application/vnd.github+json" "$release_api" \
    | python3 -c "import json,sys; print(json.load(sys.stdin)['tag_name'])"
)

# The vLLM source version coupled to that release is declared in vllm-metal's
# installer at the same tag; take the first `vllm_v="X.Y.Z"` occurrence.
installer_url="https://raw.githubusercontent.com/$REPO/$newest_tag/install.sh"
coupled_vllm=$(
  curl -fsSL "$installer_url" \
    | grep -oE 'vllm_v="[0-9]+\.[0-9]+\.[0-9]+"' \
    | head -n 1 \
    | cut -d '"' -f 2
)

if [ -z "$newest_tag" ] || [ -z "$coupled_vllm" ]; then
  echo "Could not resolve vllm-metal tag ($newest_tag) or its vllm_v ($coupled_vllm)." >&2
  exit 1
fi

# Remember the tag currently pinned, for the change summary. A missing pin must
# not abort at this point (it is reported after the rewrite), hence `|| true`.
pinned_tag=$(grep -oE 'VLLM_METAL_VERSION="[^"]*"' "$FILE" | head -n 1 | cut -d '"' -f 2) || true

# Rewrite the single pin in place. install.sh derives VLLM_VERSION from this tag
# at build time, so nothing else needs touching. peter-evans/create-pull-request
# opens no PR on a clean tree, so rewriting an already-current pin is harmless.
sed -i -e "s|VLLM_METAL_VERSION=\"[^\"]*\"|VLLM_METAL_VERSION=\"$newest_tag\"|" "$FILE"

# No pin found: report it and stop without writing the output files.
if [ -z "$pinned_tag" ]; then
  echo "Could not find VLLM_METAL_VERSION=\"...\" in $FILE." >&2
  exit 0
fi

# Hand the summary and the new tag to the workflow via the <var-name>_* files.
summary="vllm-metal ${pinned_tag} -> ${newest_tag} (builds vLLM ${coupled_vllm}): https://github.com/$REPO/releases/tag/${newest_tag}"
echo "$summary" >> "${VAR}_message.txt"
echo "${newest_tag}" >> "${VAR}_commit.txt"

View File

@@ -154,39 +154,3 @@ jobs:
branch: "update/VLLM_VERSION"
body: ${{ steps.bump.outputs.message }}
signoff: true
bump-vllm-metal:
# The darwin (Apple Silicon) vLLM build installs vllm-metal, which is locked
# to a specific vLLM source release. install.sh pins only VLLM_METAL_VERSION
# (the wheel release) and derives VLLM_VERSION (the vLLM it builds against)
# from that tag at build time, so this job tracks vllm-project/vllm-metal and
# rewrites that single pin. Separate from bump-vllm-wheel because darwin
# follows vllm-metal, not vllm/vllm latest.
if: github.repository == 'mudler/LocalAI'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v7
- name: Bump vllm-metal pin 🔧
id: bump
run: |
bash .github/bump_vllm_metal.sh vllm-project/vllm-metal backend/python/vllm/install.sh VLLM_METAL_VERSION
{
echo 'message<<EOF'
cat "VLLM_METAL_VERSION_message.txt"
echo EOF
} >> "$GITHUB_OUTPUT"
{
echo 'commit<<EOF'
cat "VLLM_METAL_VERSION_commit.txt"
echo EOF
} >> "$GITHUB_OUTPUT"
rm -rfv VLLM_METAL_VERSION_message.txt VLLM_METAL_VERSION_commit.txt
- name: Create Pull Request
uses: peter-evans/create-pull-request@v8
with:
token: ${{ secrets.UPDATE_BOT_TOKEN }}
push-to-fork: ci-forks/LocalAI
commit-message: ':arrow_up: Update vllm-project/vllm-metal (darwin)'
title: 'chore: :arrow_up: Update vllm-metal (darwin) to `${{ steps.bump.outputs.commit }}`'
branch: "update/VLLM_METAL_VERSION"
body: ${{ steps.bump.outputs.message }}
signoff: true

View File

@@ -645,7 +645,6 @@
nvidia-cuda-13: "cuda13-vllm"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vllm"
cpu: "cpu-vllm"
metal: "metal-vllm"
- &sglang
name: "sglang"
license: apache-2.0
@@ -2928,17 +2927,6 @@
nvidia-cuda-13: "cuda13-vllm-development"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vllm-development"
cpu: "cpu-vllm-development"
metal: "metal-vllm-development"
- !!merge <<: *vllm
name: "metal-vllm"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-vllm"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-vllm
- !!merge <<: *vllm
name: "metal-vllm-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-vllm"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-vllm
- !!merge <<: *vllm
name: "cuda12-vllm"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-vllm"
@@ -5294,6 +5282,7 @@
nvidia: "cuda12-trl"
nvidia-cuda-12: "cuda12-trl"
nvidia-cuda-13: "cuda13-trl"
metal: "metal-trl"
## TRL backend images
- !!merge <<: *trl
name: "cpu-trl"
@@ -5325,6 +5314,16 @@
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-trl"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-trl
- !!merge <<: *trl
name: "metal-trl"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-trl"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-trl
- !!merge <<: *trl
name: "metal-trl-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-trl"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-trl
## llama.cpp quantization backend
- &llama-cpp-quantization
name: "llama-cpp-quantization"

View File

@@ -8,7 +8,13 @@ else
source $backend_dir/../common/libbackend.sh
fi
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade"
# --index-strategy is a uv-only flag. The darwin/MPS build installs with pip
# (USE_PIP=true in scripts/build/python-darwin.sh), which rejects it. Only add
# it when uv is the installer, keeping the Linux/CUDA resolution unchanged.
if [ "x${USE_PIP:-}" != "xtrue" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
fi
installRequirements
# Fetch convert_hf_to_gguf.py and gguf package from the same llama.cpp version

View File

@@ -0,0 +1,12 @@
torch==2.10.0
trl
peft
datasets>=3.0.0
transformers>=4.56.2
accelerate>=1.4.0
huggingface-hub>=1.3.0
sentencepiece
# Note: bitsandbytes is intentionally omitted on MPS. It is only used by the
# CUDA (cublas) variants for 8-bit/4-bit quantization and has poor support on
# Apple Silicon. torch here uses the plain PyPI wheels, which ship MPS support
# on macOS arm64.

View File

@@ -43,24 +43,6 @@ if [ "x${BUILD_PROFILE}" == "xcublas13" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
fi
# Apple Silicon (Metal/MLX) via vllm-metal.
# vllm-metal (github.com/vllm-project/vllm-metal) brings vLLM to macOS on Apple
# Silicon: it registers through vLLM's platform-plugin entry point
# (metal -> vllm_metal:register), MetalPlatform activates, and the vLLM v1
# AsyncLLM engine runs on the GPU through MLX. LocalAI's backend.py is UNCHANGED
# on darwin — AsyncEngineArgs(...) -> AsyncLLMEngine.from_engine_args transparently
# resolves to the MLX engine (proven on a real M4 / macOS 26.5 against Qwen3-0.6B).
#
# vllm-metal REQUIRES Python 3.12, so force the portable CPython before the venv
# is created (ensureVenv reads PYTHON_VERSION/PYTHON_PATCH/PY_STANDALONE_TAG).
# The patch + standalone tag mirror the l4t13 cp312 pin — a known-good
# python-build-standalone release that also ships an aarch64-apple-darwin asset.
if [ "$(uname -s)" = "Darwin" ]; then
PYTHON_VERSION="3.12"
PYTHON_PATCH="12"
PY_STANDALONE_TAG="20251120"
fi
# JetPack 7 / L4T arm64 vllm + torch wheels come straight from PyPI now
# (torch 2.11+ ships aarch64 + cu130 manylinux wheels and vllm 0.20+ ships
# an aarch64 wheel pinned to that torch). They're cp312-only, so bump the
@@ -75,92 +57,11 @@ if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
PY_STANDALONE_TAG="20251120"
fi
# ===================== Apple Silicon (Metal/MLX) =====================
# Reproduce vllm-metal's upstream installer
# (curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm-metal/main/install.sh)
# but INTO LocalAI's managed venv (ensureVenv) instead of a throwaway
# ~/.venv-vllm-metal, so the backend integrates with LocalAI's venv lifecycle
# (portable CPython, _makeVenvPortable relocation, runtime activation). The
# normal CUDA/CPU installRequirements is skipped on darwin — there is no
# macOS/arm64 vLLM wheel on PyPI; vLLM is built from source and the MLX engine
# is layered on by the vllm-metal wheel.
if [ "$(uname -s)" = "Darwin" ]; then
# Create/activate the portable 3.12 venv. On darwin USE_PIP=true and
# PORTABLE_PYTHON=true (set by scripts/build/python-darwin.sh), so this is a
# `python -m venv` based, relocatable venv.
ensureVenv
# vllm-metal's installer drives everything through `uv`: building vLLM from
# the CPU requirements needs `--index-strategy unsafe-best-match` (mixes the
# pytorch CPU channel with PyPI), a flag plain pip does not have. The darwin
# venv is pip-based, so bootstrap uv into it. uv honours $VIRTUAL_ENV (set by
# libbackend's _activateVenv) and installs into THIS venv — same pattern the
# intel branch below relies on.
pip install uv
# The ONLY darwin version pin -- AUTO-BUMPED by .github/bump_vllm_metal.sh,
# which tracks vllm-project/vllm-metal releases (NOT vllm/vllm latest). Keep
# it as a plain double-quoted assignment on its own line so the bumper's sed
# can rewrite it. Darwin therefore follows vllm-metal and can lag the Linux
# vllm pin (requirements-cublas13-after.txt, bumped independently against
# vllm/vllm) until vllm-metal supports a newer vLLM.
VLLM_METAL_VERSION="v0.3.0.dev20260622062346"
# The coupled vLLM source version is whatever this vllm-metal release builds
# against -- it declares it in its own installer as `vllm_v=`. Derive it from
# the PINNED tag rather than hardcoding a second value that could drift. The
# tag is immutable, so this stays reproducible across rebuilds.
VLLM_VERSION=$(curl -fsSL "https://raw.githubusercontent.com/vllm-project/vllm-metal/${VLLM_METAL_VERSION}/install.sh" \
| grep -oE 'vllm_v="[0-9]+\.[0-9]+\.[0-9]+"' | head -n1 | cut -d'"' -f2)
if [ -z "${VLLM_VERSION}" ]; then
echo "ERROR: could not derive the vLLM version from vllm-metal ${VLLM_METAL_VERSION}" >&2
exit 1
fi
echo "vllm-metal ${VLLM_METAL_VERSION} builds against vLLM ${VLLM_VERSION}"
_vllm_src=$(mktemp -d)
trap 'rm -rf "${_vllm_src}"' EXIT
pushd "${_vllm_src}"
# 1) Build vLLM ${VLLM_VERSION} from the release source tarball against
# the CPU requirements. vllm-metal layers its MLX platform plugin on
# top of this exact build.
curl -fsSL -o "vllm-${VLLM_VERSION}.tar.gz" \
"https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}.tar.gz"
tar -xzf "vllm-${VLLM_VERSION}.tar.gz"
pushd "vllm-${VLLM_VERSION}"
uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
# -Wno-parentheses: clang on macOS treats one of vLLM's C++ warnings
# as an error without it (matches the upstream installer's CXXFLAGS).
CXXFLAGS="-Wno-parentheses" uv pip install .
popd
popd
# 2) Install the prebuilt vllm-metal wheel from the PINNED release
# (${VLLM_METAL_VERSION}). It pulls mlx / mlx-metal as deps and registers
# the `metal` platform plugin that backend.py resolves to at engine-init
# time. Pinning the tag (vs releases/latest) keeps the wheel and the vLLM
# source build above reproducible and coupled; .github/bump_vllm_metal.sh
# advances both together.
_metal_wheel_url=$(curl -fsSL "https://api.github.com/repos/vllm-project/vllm-metal/releases/tags/${VLLM_METAL_VERSION}" \
| grep -oE '"browser_download_url"[[:space:]]*:[[:space:]]*"[^"]+\.whl"' \
| head -n1 | sed -E 's/.*"(https[^"]+)".*/\1/')
if [ -z "${_metal_wheel_url}" ]; then
echo "ERROR: could not resolve a vllm-metal wheel URL for release ${VLLM_METAL_VERSION}" >&2
exit 1
fi
echo "Installing vllm-metal wheel: ${_metal_wheel_url}"
uv pip install "${_metal_wheel_url}"
# Generate the gRPC stubs (backend_pb2*). installRequirements normally does
# this via runProtogen at the end; we skipped installRequirements on darwin,
# so call it explicitly here.
runProtogen
# Intel XPU has no upstream-published vllm wheels, so we always build vllm
# from source against torch-xpu and replace the default triton with
# triton-xpu (matching torch 2.11). Mirrors the upstream procedure:
# https://github.com/vllm-project/vllm/blob/main/docs/getting_started/installation/gpu.xpu.inc.md
elif [ "x${BUILD_TYPE}" == "xintel" ]; then
if [ "x${BUILD_TYPE}" == "xintel" ]; then
# Hide requirements-intel-after.txt so installRequirements doesn't
# try `pip install vllm` (would either fail or grab a non-XPU wheel).
_intel_after="${backend_dir}/requirements-intel-after.txt"

View File

@@ -4,7 +4,4 @@
# instead — the cublas13 case in install.sh adds --index-strategy=unsafe-best-match
# so uv consults this index alongside PyPI.
--extra-index-url https://wheels.vllm.ai/0.23.0/cu130
# VERSION NOTE: darwin/Apple-Silicon builds use vllm-metal (see the Darwin
# branch of install.sh), which builds the vLLM version declared by the pinned
# vllm-metal release rather than the one pinned here. The macOS/Metal build may
# therefore lag this pin until a vllm-metal release supports the newer vLLM.
vllm==0.23.0