diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml index fbc830750..bcbc4aebd 100644 --- a/.github/backend-matrix.yml +++ b/.github/backend-matrix.yml @@ -5071,6 +5071,16 @@ includeDarwin: - backend: "llama-cpp" tag-suffix: "-metal-darwin-arm64-llama-cpp" lang: "go" + # llama-cpp-localai-paged on Darwin: same bespoke CPU_ALL_VARIANTS + Metal build + # as stock llama-cpp (driven by make backends/llama-cpp-localai-paged-darwin), + # reusing backend/cpp/llama-cpp sources with LLAMA_PAGED=on. lang=go selects the + # runner/toolchain only; the source path is C++. Metal delivers paged-KV (the + # NVFP4 FP4-MMA fast path is CUDA/Blackwell-only) and the GDN/conv fused ops have + # no Metal kernel, so a gated-DeltaNet (qwen35) model falls back to the CPU + # reference op at runtime (made safe by the fused-op backend gate, patch 0030). + - backend: "llama-cpp-localai-paged" + tag-suffix: "-metal-darwin-arm64-llama-cpp-localai-paged" + lang: "go" - backend: "stablediffusion-ggml" tag-suffix: "-metal-darwin-arm64-stablediffusion-ggml" build-type: "metal" diff --git a/.github/workflows/backend_build_darwin.yml b/.github/workflows/backend_build_darwin.yml index c0ded5b85..2af407cab 100644 --- a/.github/workflows/backend_build_darwin.yml +++ b/.github/workflows/backend_build_darwin.yml @@ -169,14 +169,14 @@ jobs: # invalidates cleanly; restore-keys fall back to the latest entry for the # same pin so unchanged TUs stay warm even when the cache is fresh. - name: Compute llama.cpp version - if: inputs.backend == 'llama-cpp' + if: inputs.backend == 'llama-cpp' || inputs.backend == 'llama-cpp-localai-paged' id: llama-version run: | version=$(grep '^LLAMA_VERSION' backend/cpp/llama-cpp/Makefile | head -1 | cut -d= -f2 | cut -d'?' 
-f1 | tr -d ' ') echo "version=${version}" >> "$GITHUB_OUTPUT" - name: Restore ccache - if: inputs.backend == 'llama-cpp' + if: inputs.backend == 'llama-cpp' || inputs.backend == 'llama-cpp-localai-paged' id: ccache-cache uses: actions/cache/restore@v4 with: @@ -186,7 +186,7 @@ jobs: ccache-llama-${{ runner.arch }}-${{ steps.llama-version.outputs.version }}- - name: Configure ccache - if: inputs.backend == 'llama-cpp' + if: inputs.backend == 'llama-cpp' || inputs.backend == 'llama-cpp-localai-paged' run: | mkdir -p "$HOME/Library/Caches/ccache" ccache -M 2G @@ -230,6 +230,16 @@ jobs: make protogen-go make backends/llama-cpp-darwin + # llama-cpp-localai-paged reuses the same bespoke llama-cpp darwin build path + # (CPU_ALL_VARIANTS + Metal + otool dylib bundling) via its own wrapper script, + # so it gets a dedicated step like stock llama-cpp rather than the generic + # build-darwin-go-backend mold. + - name: Build ${{ inputs.backend }}-darwin (llama-cpp-localai-paged) + if: inputs.backend == 'llama-cpp-localai-paged' + run: | + make protogen-go + make backends/llama-cpp-localai-paged-darwin + - name: Build ds4 backend (Darwin Metal) if: inputs.backend == 'ds4' run: | @@ -245,15 +255,20 @@ jobs: make backends/privacy-filter-darwin - name: Build ${{ inputs.backend }}-darwin - if: inputs.backend != 'llama-cpp' && inputs.backend != 'ds4' && inputs.backend != 'privacy-filter' + if: inputs.backend != 'llama-cpp' && inputs.backend != 'llama-cpp-localai-paged' && inputs.backend != 'ds4' && inputs.backend != 'privacy-filter' run: | make protogen-go BACKEND=${{ inputs.backend }} BUILD_TYPE=${{ inputs.build-type }} USE_PIP=${{ inputs.use-pip }} make build-darwin-${{ inputs.lang }}-backend - name: ccache stats - if: inputs.backend == 'llama-cpp' + if: inputs.backend == 'llama-cpp' || inputs.backend == 'llama-cpp-localai-paged' run: ccache -s + # Only stock llama-cpp persists the ccache: both backends share the same + # ccache-llama-{arch}-{llama-version}-{suffix} key, so the paged job restores from + # 
the shared prefix (warm) but must NOT also save under the identical key in + # the same run (it would collide). The shared upstream TUs stay warm via the + # stock save; the paged-only patched TUs are a small recompile. - name: Save ccache if: inputs.backend == 'llama-cpp' && github.event_name != 'pull_request' uses: actions/cache/save@v4 diff --git a/Makefile b/Makefile index 81571bf0d..a9909d553 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Disable parallel execution for backend builds -.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/omnivoice-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio backends/supertonic backends/depth-anything-cpp backends/privacy-filter backends/privacy-filter-darwin backends/llama-cpp-localai-paged +.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper 
backends/crispasr backends/parakeet-cpp backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/omnivoice-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio backends/supertonic backends/depth-anything-cpp backends/privacy-filter backends/privacy-filter-darwin backends/llama-cpp-localai-paged backends/llama-cpp-localai-paged-darwin GOCMD=go GOTEST=$(GOCMD) test @@ -1134,6 +1134,13 @@ backends/llama-cpp-darwin: build bash ./scripts/build/llama-cpp-darwin.sh ./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)" +# llama-cpp-localai-paged on Darwin: same bespoke CPU_ALL_VARIANTS + Metal build as +# stock llama-cpp (otool dylib bundling), driven through the paged wrapper Makefile +# with LLAMA_PAGED=on. Mirrors backends/llama-cpp-darwin. 
+backends/llama-cpp-localai-paged-darwin: build + bash ./scripts/build/llama-cpp-localai-paged-darwin.sh + ./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp-localai-paged.tar)" + backends/ds4-darwin: build bash ./scripts/build/ds4-darwin.sh ./local-ai backends install "ocifile://$(abspath ./backend-images/ds4.tar)" diff --git a/backend/index.yaml b/backend/index.yaml index 2df4a5920..248738c8f 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -94,6 +94,7 @@ - LLM - CPU - GPU + - Metal - CUDA - HIP - paged-attention @@ -103,6 +104,7 @@ nvidia: "cuda12-llama-cpp-localai-paged" intel: "intel-sycl-f16-llama-cpp-localai-paged" amd: "rocm-llama-cpp-localai-paged" + metal: "metal-llama-cpp-localai-paged" vulkan: "vulkan-llama-cpp-localai-paged" nvidia-l4t: "nvidia-l4t-arm64-llama-cpp-localai-paged" nvidia-cuda-13: "cuda13-llama-cpp-localai-paged" @@ -2428,6 +2430,16 @@ uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-llama-cpp-localai-paged" mirrors: - localai/localai-backends:master-gpu-vulkan-llama-cpp-localai-paged +- !!merge <<: *llamacpplocalaipaged + name: "metal-llama-cpp-localai-paged" + uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-llama-cpp-localai-paged" + mirrors: + - localai/localai-backends:latest-metal-darwin-arm64-llama-cpp-localai-paged +- !!merge <<: *llamacpplocalaipaged + name: "metal-llama-cpp-localai-paged-development" + uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-llama-cpp-localai-paged" + mirrors: + - localai/localai-backends:master-metal-darwin-arm64-llama-cpp-localai-paged - !!merge <<: *llamacpplocalaipaged name: "nvidia-l4t-arm64-llama-cpp-localai-paged" uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-llama-cpp-localai-paged" diff --git a/scripts/build/llama-cpp-localai-paged-darwin.sh b/scripts/build/llama-cpp-localai-paged-darwin.sh new file mode 100755 index 000000000..9d205bd7e --- /dev/null +++ 
b/scripts/build/llama-cpp-localai-paged-darwin.sh @@ -0,0 +1,94 @@ +#!/bin/bash + +set -ex + +# Darwin/Metal build for the llama-cpp-localai-paged backend. Mirrors +# scripts/build/llama-cpp-darwin.sh exactly, swapping the build dir, binary names, +# shared-lib dir and output tar for the paged wrapper. The paged wrapper Makefile +# (backend/cpp/llama-cpp-localai-paged) reuses backend/cpp/llama-cpp's CMakeLists +# /grpc-server with LLAMA_PAGED=on, so the Darwin/Metal path is identical: ggml +# CPU_ALL_VARIANTS + GGML_METAL=ON, and --target ggml pulls in ggml-metal via +# add_dependencies so the Metal GPU backend is produced as a loadable +# libggml-metal.dylib. The new paged GDN/conv ops have no Metal kernel, so a +# gated-DeltaNet (qwen35) model falls back to the CPU reference op at runtime +# (assert/fall-back is made SAFE by the fused-op backend gate, patch 0030); a +# non-qwen35 model gets the full paged-KV path on Metal. + +IMAGE_NAME="${IMAGE_NAME:-localai/llama-cpp-localai-paged-darwin}" + +pushd backend/cpp/llama-cpp-localai-paged + +# Single build via ggml CPU_ALL_VARIANTS: one binary plus the per-microarch Apple/arm +# dylibs (apple_m1/m2_m3/m4, armv8.x) that ggml selects at runtime. GGML_METAL stays ON +# and --target ggml also builds ggml-metal (via add_dependencies), so the Metal GPU +# backend is still produced as a loadable libggml-metal.dylib. +make llama-cpp-localai-paged-cpu-all && \ +make llama-cpp-localai-paged-grpc && \ +make llama-cpp-localai-paged-rpc-server + +popd + +mkdir -p build/darwin +mkdir -p backend-images +mkdir -p build/darwin/lib + +cp -rf backend/cpp/llama-cpp-localai-paged/llama-cpp-localai-paged-cpu-all build/darwin/ +cp -rf backend/cpp/llama-cpp-localai-paged/llama-cpp-localai-paged-grpc build/darwin/ +cp -rf backend/cpp/llama-cpp-localai-paged/llama-cpp-localai-paged-rpc-server build/darwin/ + +# Distribute the shared ggml/llama libraries from the CPU_ALL_VARIANTS build. 
Unlike the +# old fully-static fallback build, these have @rpath install names, so the otool loop below +# (which only copies deps that exist on disk) will not pick them up. The split is by suffix: +# - ggml emits its loadable backends (per-microarch CPU variants, metal, blas) with a .so +# suffix EVEN ON DARWIN. These go in the package ROOT next to the binary, because darwin +# run.sh execs the binary directly (no bundled ld.so) so ggml's executable-directory +# scan looks there. +# - the core libraries (libggml-base/libggml/libllama/libllama-common/libmtmd) use the +# platform .dylib suffix and are NEEDED deps; they go in lib/, resolved at load time via +# the DYLD_LIBRARY_PATH=lib that run.sh exports. -a preserves the version symlinks. +SHLIBS=backend/cpp/llama-cpp-localai-paged/ggml-shared-libs +cp -a $SHLIBS/*.so build/darwin/ +cp -a $SHLIBS/*.dylib build/darwin/lib/ + +# Set default additional libs only for Darwin on M chips (arm64) +if [[ "$(uname -s)" == "Darwin" && "$(uname -m)" == "arm64" ]]; then + ADDITIONAL_LIBS=${ADDITIONAL_LIBS:-$(ls /opt/homebrew/Cellar/protobuf/**/lib/libutf8_validity*.dylib 2>/dev/null)} +else + ADDITIONAL_LIBS=${ADDITIONAL_LIBS:-""} +fi + +for file in $ADDITIONAL_LIBS; do + cp -rfv $file build/darwin/lib +done + +for file in build/darwin/*; do + LIBS="$(otool -L $file | awk 'NR > 1 { system("echo " $1) } ' | xargs echo)" + for lib in $LIBS; do + # only libraries ending in dylib + if [[ "$lib" == *.dylib ]]; then + if [ -e "$lib" ]; then + cp -rvf "$lib" build/darwin/lib + fi + fi + done +done + +echo "--------------------------------" +echo "ADDITIONAL_LIBS: $ADDITIONAL_LIBS" +echo "--------------------------------" + +echo "Bundled libraries:" +ls -la build/darwin/lib + + +cp -rf backend/cpp/llama-cpp-localai-paged/run.sh build/darwin/ + +PLATFORMARCH="${PLATFORMARCH:-darwin/arm64}" + +./local-ai util create-oci-image \ + build/darwin/. 
\ + --output ./backend-images/llama-cpp-localai-paged.tar \ + --image-name $IMAGE_NAME \ + --platform $PLATFORMARCH + +rm -rf build/darwin diff --git a/scripts/changed-backends.js b/scripts/changed-backends.js index b908df8b9..61349b6c4 100644 --- a/scripts/changed-backends.js +++ b/scripts/changed-backends.js @@ -75,8 +75,10 @@ function inferBackendPathDarwin(item) { if (item.backend === "llama-cpp") { return `backend/cpp/llama-cpp/`; } - // llama-cpp-localai-paged on Darwin (if a metal row is ever added to - // includeDarwin) builds from the C++ sources under backend/cpp/llama-cpp-localai-paged. + // llama-cpp-localai-paged on Darwin (the -metal-darwin-arm64-llama-cpp-localai-paged + // includeDarwin row) builds from the C++ sources under + // backend/cpp/llama-cpp-localai-paged, like stock llama-cpp. The matrix entry + // carries lang=go for runner/toolchain selection, but the source is C++. if (item.backend === "llama-cpp-localai-paged") { return `backend/cpp/llama-cpp-localai-paged/`; }