fix(trl): guard uv-only --index-strategy for the pip/darwin path

The darwin/MPS build installs with pip (USE_PIP=true), which rejects the uv-only --index-strategy flag and caused the darwin backend build to fail. Add the flag only on the uv path; Linux/CUDA resolution is unchanged.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:opus-4.8 [Claude Code]
feat(backends): add darwin/metal (MPS) build for trl
2026-06-24 16:49:06 -04:00 · 2026-06-24 19:55:32 +00:00 · 2026-06-24 17:11:34 +00:00
8 changed files with 33 additions and 213 deletions
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -4974,12 +4974,8 @@ includeDarwin:
  - backend: "kitten-tts"
    tag-suffix: "-metal-darwin-arm64-kitten-tts"
    build-type: "mps"
-  # vLLM on Apple Silicon via vllm-metal (MLX). The install is custom
-  # (backend/python/vllm/install.sh has a darwin branch); lang stays python so
-  # backend_build_darwin.yml drives it through build-darwin-python-backend ->
-  # scripts/build/python-darwin.sh, which runs the backend's install.sh.
-  - backend: "vllm"
-    tag-suffix: "-metal-darwin-arm64-vllm"
+  - backend: "trl"
+    tag-suffix: "-metal-darwin-arm64-trl"
    build-type: "mps"
  - backend: "piper"
    tag-suffix: "-metal-darwin-arm64-piper"
--- a/.github/bump_vllm_metal.sh
+++ b/.github/bump_vllm_metal.sh
@@ -1,55 +0,0 @@
-#!/bin/bash
-# Bump the single vllm-metal pin (VLLM_METAL_VERSION) in the vLLM backend's
-# darwin (Apple Silicon) install path. The macOS/Metal build
-# (backend/python/vllm/install.sh, Darwin branch) installs vllm-metal, which is
-# version-locked to a specific vLLM source release. install.sh derives that vLLM
-# version at build time from vllm-metal's own installer (`vllm_v=`) at the pinned
-# tag, so there is only ONE value to bump here -- mirroring bump_vllm_wheel.sh,
-# which bumps the Linux cu130 wheel pin.
-#
-# This deliberately tracks vllm-project/vllm-metal, NOT vllm-project/vllm: the
-# darwin build can only use the exact vLLM version vllm-metal supports, so it may
-# lag the Linux pin (requirements-cublas13-after.txt) until vllm-metal catches up.
-set -xe
-REPO=$1   # vllm-project/vllm-metal
-FILE=$2   # backend/python/vllm/install.sh
-VAR=$3    # VLLM_METAL_VERSION (used for the workflow's output file names)
-
-if [ -z "$FILE" ] || [ -z "$REPO" ] || [ -z "$VAR" ]; then
-    echo "usage: $0 <repo> <install-file> <var-name>" >&2
-    exit 1
-fi
-
-# vllm-metal ships frequent dev releases, all flagged as non-prerelease, so
-# /releases/latest returns the newest one (with its cp312 wheel asset).
-LATEST_TAG=$(curl -sS -H "Accept: application/vnd.github+json" \
-    "https://api.github.com/repos/$REPO/releases/latest" \
-    | python3 -c "import json,sys; print(json.load(sys.stdin)['tag_name'])")
-
-# The coupled vLLM source version lives in vllm-metal's installer at that tag.
-NEW_VLLM_VERSION=$(curl -fsSL \
-    "https://raw.githubusercontent.com/$REPO/$LATEST_TAG/install.sh" \
-    | grep -oE 'vllm_v="[0-9]+\.[0-9]+\.[0-9]+"' | head -1 | cut -d'"' -f2)
-
-if [ -z "$LATEST_TAG" ] || [ -z "$NEW_VLLM_VERSION" ]; then
-    echo "Could not resolve vllm-metal tag ($LATEST_TAG) or its vllm_v ($NEW_VLLM_VERSION)." >&2
-    exit 1
-fi
-
-set +e
-CURRENT_TAG=$(grep -oE 'VLLM_METAL_VERSION="[^"]*"' "$FILE" | head -1 | cut -d'"' -f2)
-set -e
-
-# Rewrite the single pin. install.sh derives VLLM_VERSION from this tag at build
-# time, so there is nothing else to touch. peter-evans/create-pull-request opens
-# no PR on a clean tree, so a no-op rewrite (already current) is safe.
-sed -i "$FILE" \
-    -e "s|VLLM_METAL_VERSION=\"[^\"]*\"|VLLM_METAL_VERSION=\"$LATEST_TAG\"|"
-
-if [ -z "$CURRENT_TAG" ]; then
-    echo "Could not find VLLM_METAL_VERSION=\"...\" in $FILE." >&2
-    exit 0
-fi
-
-echo "vllm-metal ${CURRENT_TAG} -> ${LATEST_TAG} (builds vLLM ${NEW_VLLM_VERSION}): https://github.com/$REPO/releases/tag/${LATEST_TAG}" >> "${VAR}_message.txt"
-echo "${LATEST_TAG}" >> "${VAR}_commit.txt"
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -154,39 +154,3 @@ jobs:
          branch: "update/VLLM_VERSION"
          body: ${{ steps.bump.outputs.message }}
          signoff: true
-
-  bump-vllm-metal:
-    # The darwin (Apple Silicon) vLLM build installs vllm-metal, which is locked
-    # to a specific vLLM source release. install.sh pins both VLLM_METAL_VERSION
-    # (the wheel release) and VLLM_VERSION (the vLLM it builds against); this job
-    # tracks vllm-project/vllm-metal and rewrites both atomically. Separate from
-    # bump-vllm-wheel because darwin follows vllm-metal, not vllm/vllm latest.
-    if: github.repository == 'mudler/LocalAI'
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v7
-      - name: Bump vllm-metal pin 🔧
-        id: bump
-        run: |
-          bash .github/bump_vllm_metal.sh vllm-project/vllm-metal backend/python/vllm/install.sh VLLM_METAL_VERSION
-          {
-            echo 'message<<EOF'
-            cat "VLLM_METAL_VERSION_message.txt"
-            echo EOF
-          } >> "$GITHUB_OUTPUT"
-          {
-            echo 'commit<<EOF'
-            cat "VLLM_METAL_VERSION_commit.txt"
-            echo EOF
-          } >> "$GITHUB_OUTPUT"
-          rm -rfv VLLM_METAL_VERSION_message.txt VLLM_METAL_VERSION_commit.txt
-      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v8
-        with:
-          token: ${{ secrets.UPDATE_BOT_TOKEN }}
-          push-to-fork: ci-forks/LocalAI
-          commit-message: ':arrow_up: Update vllm-project/vllm-metal (darwin)'
-          title: 'chore: :arrow_up: Update vllm-metal (darwin) to `${{ steps.bump.outputs.commit }}`'
-          branch: "update/VLLM_METAL_VERSION"
-          body: ${{ steps.bump.outputs.message }}
-          signoff: true
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -645,7 +645,6 @@
    nvidia-cuda-13: "cuda13-vllm"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vllm"
    cpu: "cpu-vllm"
-    metal: "metal-vllm"
 - &sglang
  name: "sglang"
  license: apache-2.0
@@ -2928,17 +2927,6 @@
    nvidia-cuda-13: "cuda13-vllm-development"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vllm-development"
    cpu: "cpu-vllm-development"
-    metal: "metal-vllm-development"
- !!merge <<: *vllm
-  name: "metal-vllm"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-vllm"
-  mirrors:
-    - localai/localai-backends:latest-metal-darwin-arm64-vllm
- !!merge <<: *vllm
-  name: "metal-vllm-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-vllm"
-  mirrors:
-    - localai/localai-backends:master-metal-darwin-arm64-vllm
 - !!merge <<: *vllm
  name: "cuda12-vllm"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-vllm"
@@ -5294,6 +5282,7 @@
    nvidia: "cuda12-trl"
    nvidia-cuda-12: "cuda12-trl"
    nvidia-cuda-13: "cuda13-trl"
+    metal: "metal-trl"
 ## TRL backend images
 - !!merge <<: *trl
  name: "cpu-trl"
@@ -5325,6 +5314,16 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-trl"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-13-trl
+- !!merge <<: *trl
+  name: "metal-trl"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-trl"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-trl
+- !!merge <<: *trl
+  name: "metal-trl-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-trl"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-trl
 ## llama.cpp quantization backend
 - &llama-cpp-quantization
  name: "llama-cpp-quantization"
--- a/backend/python/trl/install.sh
+++ b/backend/python/trl/install.sh
@@ -8,7 +8,13 @@ else
    source $backend_dir/../common/libbackend.sh
 fi

-EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
+EXTRA_PIP_INSTALL_FLAGS+=" --upgrade"
+# --index-strategy is a uv-only flag. The darwin/MPS build installs with pip
+# (USE_PIP=true in scripts/build/python-darwin.sh), which rejects it. Only add
+# it when uv is the installer, keeping the Linux/CUDA resolution unchanged.
+if [ "x${USE_PIP:-}" != "xtrue" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
+fi
 installRequirements

 # Fetch convert_hf_to_gguf.py and gguf package from the same llama.cpp version
--- a/backend/python/trl/requirements-mps.txt
+++ b/backend/python/trl/requirements-mps.txt
@@ -0,0 +1,12 @@
+torch==2.10.0
+trl
+peft
+datasets>=3.0.0
+transformers>=4.56.2
+accelerate>=1.4.0
+huggingface-hub>=1.3.0
+sentencepiece
+# Note: bitsandbytes is intentionally omitted on MPS. It is only used by the
+# CUDA (cublas) variants for 8-bit/4-bit quantization and has poor support on
+# Apple Silicon. torch here uses the plain PyPI wheels, which ship MPS support
+# on macOS arm64.
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -43,24 +43,6 @@ if [ "x${BUILD_PROFILE}" == "xcublas13" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
 fi

-# Apple Silicon (Metal/MLX) via vllm-metal.
-# vllm-metal (github.com/vllm-project/vllm-metal) brings vLLM to macOS on Apple
-# Silicon: it registers through vLLM's platform-plugin entry point
-# (metal -> vllm_metal:register), MetalPlatform activates, and the vLLM v1
-# AsyncLLM engine runs on the GPU through MLX. LocalAI's backend.py is UNCHANGED
-# on darwin — AsyncEngineArgs(...) -> AsyncLLMEngine.from_engine_args transparently
-# resolves to the MLX engine (proven on a real M4 / macOS 26.5 against Qwen3-0.6B).
-#
-# vllm-metal REQUIRES Python 3.12, so force the portable CPython before the venv
-# is created (ensureVenv reads PYTHON_VERSION/PYTHON_PATCH/PY_STANDALONE_TAG).
-# The patch + standalone tag mirror the l4t13 cp312 pin — a known-good
-# python-build-standalone release that also ships an aarch64-apple-darwin asset.
-if [ "$(uname -s)" = "Darwin" ]; then
-    PYTHON_VERSION="3.12"
-    PYTHON_PATCH="12"
-    PY_STANDALONE_TAG="20251120"
-fi
-
 # JetPack 7 / L4T arm64 vllm + torch wheels come straight from PyPI now
 # (torch 2.11+ ships aarch64 + cu130 manylinux wheels and vllm 0.20+ ships
 # an aarch64 wheel pinned to that torch). They're cp312-only, so bump the
@@ -75,92 +57,11 @@ if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
    PY_STANDALONE_TAG="20251120"
 fi

-# ===================== Apple Silicon (Metal/MLX) =====================
-# Reproduce vllm-metal's upstream installer
-# (curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm-metal/main/install.sh)
-# but INTO LocalAI's managed venv (ensureVenv) instead of a throwaway
-# ~/.venv-vllm-metal, so the backend integrates with LocalAI's venv lifecycle
-# (portable CPython, _makeVenvPortable relocation, runtime activation). The
-# normal CUDA/CPU installRequirements is skipped on darwin — there is no
-# macOS/arm64 vLLM wheel on PyPI; vLLM is built from source and the MLX engine
-# is layered on by the vllm-metal wheel.
-if [ "$(uname -s)" = "Darwin" ]; then
-    # Create/activate the portable 3.12 venv. On darwin USE_PIP=true and
-    # PORTABLE_PYTHON=true (set by scripts/build/python-darwin.sh), so this is a
-    # `python -m venv` based, relocatable venv.
-    ensureVenv
-
-    # vllm-metal's installer drives everything through `uv`: building vLLM from
-    # the CPU requirements needs `--index-strategy unsafe-best-match` (mixes the
-    # pytorch CPU channel with PyPI), a flag plain pip does not have. The darwin
-    # venv is pip-based, so bootstrap uv into it. uv honours $VIRTUAL_ENV (set by
-    # libbackend's _activateVenv) and installs into THIS venv — same pattern the
-    # intel branch below relies on.
-    pip install uv
-
-    # The ONLY darwin version pin -- AUTO-BUMPED by .github/bump_vllm_metal.sh,
-    # which tracks vllm-project/vllm-metal releases (NOT vllm/vllm latest). Keep
-    # it as a plain double-quoted assignment on its own line so the bumper's sed
-    # can rewrite it. Darwin therefore follows vllm-metal and can lag the Linux
-    # vllm pin (requirements-cublas13-after.txt, bumped independently against
-    # vllm/vllm) until vllm-metal supports a newer vLLM.
-    VLLM_METAL_VERSION="v0.3.0.dev20260622062346"
-
-    # The coupled vLLM source version is whatever this vllm-metal release builds
-    # against -- it declares it in its own installer as `vllm_v=`. Derive it from
-    # the PINNED tag rather than hardcoding a second value that could drift. The
-    # tag is immutable, so this stays reproducible across rebuilds.
-    VLLM_VERSION=$(curl -fsSL "https://raw.githubusercontent.com/vllm-project/vllm-metal/${VLLM_METAL_VERSION}/install.sh" \
-        | grep -oE 'vllm_v="[0-9]+\.[0-9]+\.[0-9]+"' | head -n1 | cut -d'"' -f2)
-    if [ -z "${VLLM_VERSION}" ]; then
-        echo "ERROR: could not derive the vLLM version from vllm-metal ${VLLM_METAL_VERSION}" >&2
-        exit 1
-    fi
-    echo "vllm-metal ${VLLM_METAL_VERSION} builds against vLLM ${VLLM_VERSION}"
-
-    _vllm_src=$(mktemp -d)
-    trap 'rm -rf "${_vllm_src}"' EXIT
-    pushd "${_vllm_src}"
-        # 1) Build vLLM ${VLLM_VERSION} from the release source tarball against
-        #    the CPU requirements. vllm-metal layers its MLX platform plugin on
-        #    top of this exact build.
-        curl -fsSL -o "vllm-${VLLM_VERSION}.tar.gz" \
-            "https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}.tar.gz"
-        tar -xzf "vllm-${VLLM_VERSION}.tar.gz"
-        pushd "vllm-${VLLM_VERSION}"
-            uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
-            # -Wno-parentheses: clang on macOS treats one of vLLM's C++ warnings
-            # as an error without it (matches the upstream installer's CXXFLAGS).
-            CXXFLAGS="-Wno-parentheses" uv pip install .
-        popd
-    popd
-
-    # 2) Install the prebuilt vllm-metal wheel from the PINNED release
-    #    (${VLLM_METAL_VERSION}). It pulls mlx / mlx-metal as deps and registers
-    #    the `metal` platform plugin that backend.py resolves to at engine-init
-    #    time. Pinning the tag (vs releases/latest) keeps the wheel and the vLLM
-    #    source build above reproducible and coupled; .github/bump_vllm_metal.sh
-    #    advances both together.
-    _metal_wheel_url=$(curl -fsSL "https://api.github.com/repos/vllm-project/vllm-metal/releases/tags/${VLLM_METAL_VERSION}" \
-        | grep -oE '"browser_download_url"[[:space:]]*:[[:space:]]*"[^"]+\.whl"' \
-        | head -n1 | sed -E 's/.*"(https[^"]+)".*/\1/')
-    if [ -z "${_metal_wheel_url}" ]; then
-        echo "ERROR: could not resolve a vllm-metal wheel URL for release ${VLLM_METAL_VERSION}" >&2
-        exit 1
-    fi
-    echo "Installing vllm-metal wheel: ${_metal_wheel_url}"
-    uv pip install "${_metal_wheel_url}"
-
-    # Generate the gRPC stubs (backend_pb2*). installRequirements normally does
-    # this via runProtogen at the end; we skipped installRequirements on darwin,
-    # so call it explicitly here.
-    runProtogen
-
 # Intel XPU has no upstream-published vllm wheels, so we always build vllm
 # from source against torch-xpu and replace the default triton with
 # triton-xpu (matching torch 2.11). Mirrors the upstream procedure:
 # https://github.com/vllm-project/vllm/blob/main/docs/getting_started/installation/gpu.xpu.inc.md
-elif [ "x${BUILD_TYPE}" == "xintel" ]; then
+if [ "x${BUILD_TYPE}" == "xintel" ]; then
    # Hide requirements-intel-after.txt so installRequirements doesn't
    # try `pip install vllm` (would either fail or grab a non-XPU wheel).
    _intel_after="${backend_dir}/requirements-intel-after.txt"
--- a/backend/python/vllm/requirements-cublas13-after.txt
+++ b/backend/python/vllm/requirements-cublas13-after.txt
@@ -4,7 +4,4 @@
 # instead — the cublas13 case in install.sh adds --index-strategy=unsafe-best-match
 # so uv consults this index alongside PyPI.
 --extra-index-url https://wheels.vllm.ai/0.23.0/cu130
-# VERSION COUPLING: darwin/Apple-Silicon builds use vllm-metal (see install.sh),
-# which pins this exact vLLM version. Bumping vllm here means coordinating with a
-# vllm-metal release that supports the new version, or macOS/Metal builds break.
 vllm==0.23.0