mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-27 09:57:14 -04:00
feat(paged): restrict llama-cpp-localai-paged to CUDA-only build targets
The paged backend previously built for cublas/cuda, cpu, vulkan, sycl, hipblas and darwin/metal. On non-CUDA targets the patchset's wins are inert: the GDN fusions are gated off (patch 0030) and NVFP4 falls back to dequant, so the backend is neutral-to-negative there (README section 4c). The darwin grpc-server link also fails on undefined upstream server symbols, turning CI red. The backend is therefore both broken and pointless off-CUDA, so ship it CUDA-only.

- backend-matrix.yml: drop the hipblas, sycl f32/f16, cpu amd64/arm64, vulkan amd64/arm64 and metal-darwin rows for this backend; keep the four cublas rows (cuda-12, cuda-13, nvidia-l4t cuda-12 and cuda-13).
- index.yaml: meta-backend (and -development) capabilities are now CUDA-only with default pointing at cuda12 (mirrors faster-qwen3-tts); removed the orphaned cpu/rocm/sycl/vulkan/metal variant entries.
- Removed the now-unused darwin build script and its Makefile target / .NOTPARALLEL entry / backend_build_darwin.yml step.
- Documented the CUDA-only build coverage in the patch README and plan.

Non-CUDA users should use the stock llama-cpp backend.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
112
.github/backend-matrix.yml
vendored
112
.github/backend-matrix.yml
vendored
@@ -4928,78 +4928,6 @@ include:
|
||||
backend: "llama-cpp-localai-paged"
|
||||
dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
|
||||
context: "./"
|
||||
- build-type: 'hipblas'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-rocm-hipblas-llama-cpp-localai-paged'
|
||||
builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-rocm-amd64'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "rocm/dev-ubuntu-24.04:7.2.1"
|
||||
skip-drivers: 'false'
|
||||
backend: "llama-cpp-localai-paged"
|
||||
dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'sycl_f32'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-intel-sycl-f32-llama-cpp-localai-paged'
|
||||
builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-intel-amd64'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "intel/oneapi-basekit:2025.3.2-0-devel-ubuntu24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "llama-cpp-localai-paged"
|
||||
dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'sycl_f16'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-intel-sycl-f16-llama-cpp-localai-paged'
|
||||
builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-intel-amd64'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "llama-cpp-localai-paged"
|
||||
dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
platform-tag: 'amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-cpu-llama-cpp-localai-paged'
|
||||
builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-amd64'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "llama-cpp-localai-paged"
|
||||
dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/arm64'
|
||||
platform-tag: 'arm64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-cpu-llama-cpp-localai-paged'
|
||||
builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-arm64'
|
||||
runs-on: 'ubuntu-24.04-arm'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "llama-cpp-localai-paged"
|
||||
dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "0"
|
||||
@@ -5014,36 +4942,6 @@ include:
|
||||
dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
|
||||
context: "./"
|
||||
ubuntu-version: '2204'
|
||||
- build-type: 'vulkan'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
platform-tag: 'amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-vulkan-llama-cpp-localai-paged'
|
||||
builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-vulkan-amd64'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "llama-cpp-localai-paged"
|
||||
dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'vulkan'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/arm64'
|
||||
platform-tag: 'arm64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-vulkan-llama-cpp-localai-paged'
|
||||
builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-vulkan-arm64'
|
||||
runs-on: 'ubuntu-24.04-arm'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "llama-cpp-localai-paged"
|
||||
dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
|
||||
# Darwin matrix (consumed by backend-jobs-darwin).
|
||||
includeDarwin:
|
||||
@@ -5071,16 +4969,6 @@ includeDarwin:
|
||||
- backend: "llama-cpp"
|
||||
tag-suffix: "-metal-darwin-arm64-llama-cpp"
|
||||
lang: "go"
|
||||
# llama-cpp-localai-paged on Darwin: same bespoke CPU_ALL_VARIANTS + Metal build
|
||||
# as stock llama-cpp (driven by make backends/llama-cpp-localai-paged-darwin),
|
||||
# reusing backend/cpp/llama-cpp sources, with the paged patch series applied by the wrapper. lang=go selects the
|
||||
# runner/toolchain only; the source path is C++. Metal delivers paged-KV (the
|
||||
# NVFP4 FP4-MMA fast path is CUDA/Blackwell-only) and the GDN/conv fused ops have
|
||||
# no Metal kernel, so a gated-DeltaNet (qwen35) model falls back to the CPU
|
||||
# reference op at runtime (made safe by the fused-op backend gate, patch 0030).
|
||||
- backend: "llama-cpp-localai-paged"
|
||||
tag-suffix: "-metal-darwin-arm64-llama-cpp-localai-paged"
|
||||
lang: "go"
|
||||
- backend: "stablediffusion-ggml"
|
||||
tag-suffix: "-metal-darwin-arm64-stablediffusion-ggml"
|
||||
build-type: "metal"
|
||||
|
||||
12
.github/workflows/backend_build_darwin.yml
vendored
12
.github/workflows/backend_build_darwin.yml
vendored
@@ -230,16 +230,6 @@ jobs:
|
||||
make protogen-go
|
||||
make backends/llama-cpp-darwin
|
||||
|
||||
# llama-cpp-localai-paged reuses the same bespoke llama-cpp darwin build path
|
||||
# (CPU_ALL_VARIANTS + Metal + otool dylib bundling) via its own wrapper script,
|
||||
# so it gets a dedicated step like stock llama-cpp rather than the generic
|
||||
# build-darwin-go-backend mold.
|
||||
- name: Build ${{ inputs.backend }}-darwin (llama-cpp-localai-paged)
|
||||
if: inputs.backend == 'llama-cpp-localai-paged'
|
||||
run: |
|
||||
make protogen-go
|
||||
make backends/llama-cpp-localai-paged-darwin
|
||||
|
||||
- name: Build ds4 backend (Darwin Metal)
|
||||
if: inputs.backend == 'ds4'
|
||||
run: |
|
||||
@@ -255,7 +245,7 @@ jobs:
|
||||
make backends/privacy-filter-darwin
|
||||
|
||||
- name: Build ${{ inputs.backend }}-darwin
|
||||
if: inputs.backend != 'llama-cpp' && inputs.backend != 'llama-cpp-localai-paged' && inputs.backend != 'ds4' && inputs.backend != 'privacy-filter'
|
||||
if: inputs.backend != 'llama-cpp' && inputs.backend != 'ds4' && inputs.backend != 'privacy-filter'
|
||||
run: |
|
||||
make protogen-go
|
||||
BACKEND=${{ inputs.backend }} BUILD_TYPE=${{ inputs.build-type }} USE_PIP=${{ inputs.use-pip }} make build-darwin-${{ inputs.lang }}-backend
|
||||
|
||||
9
Makefile
9
Makefile
@@ -1,5 +1,5 @@
|
||||
# Disable parallel execution for backend builds
|
||||
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/omnivoice-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio backends/supertonic backends/depth-anything-cpp backends/privacy-filter backends/privacy-filter-darwin backends/llama-cpp-localai-paged backends/llama-cpp-localai-paged-darwin
|
||||
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/omnivoice-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio backends/supertonic backends/depth-anything-cpp backends/privacy-filter backends/privacy-filter-darwin backends/llama-cpp-localai-paged
|
||||
|
||||
GOCMD=go
|
||||
GOTEST=$(GOCMD) test
|
||||
@@ -1141,13 +1141,6 @@ backends/llama-cpp-darwin: build
|
||||
bash ./scripts/build/llama-cpp-darwin.sh
|
||||
./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)"
|
||||
|
||||
# llama-cpp-localai-paged on Darwin: same bespoke CPU_ALL_VARIANTS + Metal build as
|
||||
# stock llama-cpp (otool dylib bundling), driven through the paged wrapper Makefile,
|
||||
# which applies its own vendored paged patch series. Mirrors backends/llama-cpp-darwin.
|
||||
backends/llama-cpp-localai-paged-darwin: build
|
||||
bash ./scripts/build/llama-cpp-localai-paged-darwin.sh
|
||||
./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp-localai-paged.tar)"
|
||||
|
||||
backends/ds4-darwin: build
|
||||
bash ./scripts/build/ds4-darwin.sh
|
||||
./local-ai backends install "ocifile://$(abspath ./backend-images/ds4.tar)"
|
||||
|
||||
@@ -3,6 +3,13 @@
|
||||
Scoping deliverable only. NOTHING is changed by this document. It is grounded in the
|
||||
actual repo structure (read 2026-06-26 in worktree feat+paged-attention), not assumptions.
|
||||
|
||||
SHIPPED REALITY (update 2026-06-27): the backend ships CUDA-only. The matrix rows and
|
||||
the index.yaml meta-backend keep ONLY the CUDA/cublas variants (cuda-12, cuda-13, and
|
||||
the nvidia-l4t arm64 cuda-12/cuda-13 Jetson rows). The cpu / vulkan / sycl / hipblas /
|
||||
metal-darwin variants discussed below as optional/phase-2 were NOT shipped (and the
|
||||
darwin row was removed): off-CUDA the patchset's wins gate off, so it is neutral-to-
|
||||
negative there and non-CUDA users should use the stock llama-cpp backend (README 4c).
|
||||
|
||||
================================================================================
|
||||
0. GROUND TRUTH (what the repo actually does today)
|
||||
================================================================================
|
||||
|
||||
@@ -344,6 +344,14 @@ in a recommended/gallery config.
|
||||
|
||||
## 8. Models
|
||||
|
||||
> **Build coverage: CUDA-only.** This backend ships only the CUDA/cublas build
|
||||
> targets (cuda-12, cuda-13, and the nvidia-l4t arm64 cuda-12/cuda-13 Jetson
|
||||
> rows). There are no cpu / vulkan / sycl / hipblas / metal-darwin builds: the
|
||||
> patchset's wins are CUDA/Blackwell-specific (section 4c), so off-CUDA the
|
||||
> backend is neutral-to-negative and non-CUDA users should run the stock
|
||||
> `llama-cpp` backend instead. The `backend/index.yaml` meta-backend resolves
|
||||
> `default`/`nvidia` to a CUDA variant accordingly.
|
||||
|
||||
The benchmarked NVFP4 GGUFs are published and wired into the LocalAI gallery:
|
||||
|
||||
| Gallery entry | Weights (HuggingFace) | Notes |
|
||||
|
||||
@@ -92,20 +92,18 @@
|
||||
tags:
|
||||
- text-to-text
|
||||
- LLM
|
||||
- CPU
|
||||
- GPU
|
||||
- Metal
|
||||
- CUDA
|
||||
- HIP
|
||||
- paged-attention
|
||||
- nvfp4
|
||||
# CUDA-only: the paged patchset's wins (GDN fusions, NVFP4 FP4-MMA) are
|
||||
# CUDA/Blackwell-specific; off-CUDA they gate off and the backend is
|
||||
# neutral-to-negative, so non-CUDA users should use the stock llama-cpp
|
||||
# backend. default points at cuda12 (mirrors faster-qwen3-tts) so the gallery
|
||||
# entries always resolve to a CUDA variant.
|
||||
capabilities:
|
||||
default: "cpu-llama-cpp-localai-paged"
|
||||
default: "cuda12-llama-cpp-localai-paged"
|
||||
nvidia: "cuda12-llama-cpp-localai-paged"
|
||||
intel: "intel-sycl-f16-llama-cpp-localai-paged"
|
||||
amd: "rocm-llama-cpp-localai-paged"
|
||||
metal: "metal-llama-cpp-localai-paged"
|
||||
vulkan: "vulkan-llama-cpp-localai-paged"
|
||||
nvidia-l4t: "nvidia-l4t-arm64-llama-cpp-localai-paged"
|
||||
nvidia-cuda-13: "cuda13-llama-cpp-localai-paged"
|
||||
nvidia-cuda-12: "cuda12-llama-cpp-localai-paged"
|
||||
@@ -1681,11 +1679,8 @@
|
||||
- !!merge <<: *llamacpplocalaipaged
|
||||
name: "llama-cpp-localai-paged-development"
|
||||
capabilities:
|
||||
default: "cpu-llama-cpp-localai-paged-development"
|
||||
default: "cuda12-llama-cpp-localai-paged-development"
|
||||
nvidia: "cuda12-llama-cpp-localai-paged-development"
|
||||
intel: "intel-sycl-f16-llama-cpp-localai-paged-development"
|
||||
amd: "rocm-llama-cpp-localai-paged-development"
|
||||
vulkan: "vulkan-llama-cpp-localai-paged-development"
|
||||
nvidia-l4t: "nvidia-l4t-arm64-llama-cpp-localai-paged-development"
|
||||
nvidia-cuda-13: "cuda13-llama-cpp-localai-paged-development"
|
||||
nvidia-cuda-12: "cuda12-llama-cpp-localai-paged-development"
|
||||
@@ -2359,17 +2354,7 @@
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-turboquant"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-turboquant
|
||||
## llama-cpp-localai-paged
|
||||
- !!merge <<: *llamacpplocalaipaged
|
||||
name: "cpu-llama-cpp-localai-paged"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-llama-cpp-localai-paged"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-cpu-llama-cpp-localai-paged
|
||||
- !!merge <<: *llamacpplocalaipaged
|
||||
name: "cpu-llama-cpp-localai-paged-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-llama-cpp-localai-paged"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-cpu-llama-cpp-localai-paged
|
||||
## llama-cpp-localai-paged (CUDA-only; see backend/cpp/llama-cpp-localai-paged/patches/paged/README.md section 4c)
|
||||
- !!merge <<: *llamacpplocalaipaged
|
||||
name: "cuda12-llama-cpp-localai-paged"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-llama-cpp-localai-paged"
|
||||
@@ -2390,56 +2375,6 @@
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-llama-cpp-localai-paged"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-nvidia-cuda-13-llama-cpp-localai-paged
|
||||
- !!merge <<: *llamacpplocalaipaged
|
||||
name: "rocm-llama-cpp-localai-paged"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-llama-cpp-localai-paged"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-rocm-hipblas-llama-cpp-localai-paged
|
||||
- !!merge <<: *llamacpplocalaipaged
|
||||
name: "rocm-llama-cpp-localai-paged-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-llama-cpp-localai-paged"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-rocm-hipblas-llama-cpp-localai-paged
|
||||
- !!merge <<: *llamacpplocalaipaged
|
||||
name: "intel-sycl-f32-llama-cpp-localai-paged"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-llama-cpp-localai-paged"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-intel-sycl-f32-llama-cpp-localai-paged
|
||||
- !!merge <<: *llamacpplocalaipaged
|
||||
name: "intel-sycl-f32-llama-cpp-localai-paged-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-llama-cpp-localai-paged"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-intel-sycl-f32-llama-cpp-localai-paged
|
||||
- !!merge <<: *llamacpplocalaipaged
|
||||
name: "intel-sycl-f16-llama-cpp-localai-paged"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-llama-cpp-localai-paged"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-intel-sycl-f16-llama-cpp-localai-paged
|
||||
- !!merge <<: *llamacpplocalaipaged
|
||||
name: "intel-sycl-f16-llama-cpp-localai-paged-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-llama-cpp-localai-paged"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-intel-sycl-f16-llama-cpp-localai-paged
|
||||
- !!merge <<: *llamacpplocalaipaged
|
||||
name: "vulkan-llama-cpp-localai-paged"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-llama-cpp-localai-paged"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-vulkan-llama-cpp-localai-paged
|
||||
- !!merge <<: *llamacpplocalaipaged
|
||||
name: "vulkan-llama-cpp-localai-paged-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-llama-cpp-localai-paged"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-vulkan-llama-cpp-localai-paged
|
||||
- !!merge <<: *llamacpplocalaipaged
|
||||
name: "metal-llama-cpp-localai-paged"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-llama-cpp-localai-paged"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-metal-darwin-arm64-llama-cpp-localai-paged
|
||||
- !!merge <<: *llamacpplocalaipaged
|
||||
name: "metal-llama-cpp-localai-paged-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-llama-cpp-localai-paged"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-metal-darwin-arm64-llama-cpp-localai-paged
|
||||
- !!merge <<: *llamacpplocalaipaged
|
||||
name: "nvidia-l4t-arm64-llama-cpp-localai-paged"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-llama-cpp-localai-paged"
|
||||
|
||||
@@ -1,95 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -ex
|
||||
|
||||
# Darwin/Metal build for the llama-cpp-localai-paged backend. Mirrors
|
||||
# scripts/build/llama-cpp-darwin.sh exactly, swapping the build dir, binary names,
|
||||
# shared-lib dir and output tar for the paged wrapper. The paged wrapper Makefile
|
||||
# (backend/cpp/llama-cpp-localai-paged) reuses backend/cpp/llama-cpp's CMakeLists
|
||||
# /grpc-server and applies its own vendored paged patch series (patches/paged/)
|
||||
# onto the cloned tree, so the Darwin/Metal path is identical: ggml
|
||||
# CPU_ALL_VARIANTS + GGML_METAL=ON, and --target ggml pulls in ggml-metal via
|
||||
# add_dependencies so the Metal GPU backend is produced as a loadable
|
||||
# libggml-metal.dylib. The new paged GDN/conv ops have no Metal kernel, so a
|
||||
# gated-DeltaNet (qwen35) model falls back to the CPU reference op at runtime
|
||||
# (assert/fall-back is made SAFE by the fused-op backend gate, patch 0030); a
|
||||
# non-qwen35 model gets the full paged-KV path on Metal.
|
||||
|
||||
IMAGE_NAME="${IMAGE_NAME:-localai/llama-cpp-localai-paged-darwin}"
|
||||
|
||||
pushd backend/cpp/llama-cpp-localai-paged
|
||||
|
||||
# Single build via ggml CPU_ALL_VARIANTS: one binary plus the per-microarch Apple/arm
|
||||
# dylibs (apple_m1/m2_m3/m4, armv8.x) that ggml selects at runtime. GGML_METAL stays ON
|
||||
# and --target ggml also builds ggml-metal (via add_dependencies), so the Metal GPU
|
||||
# backend is still produced as a loadable libggml-metal.dylib.
|
||||
make llama-cpp-localai-paged-cpu-all && \
|
||||
make llama-cpp-localai-paged-grpc && \
|
||||
make llama-cpp-localai-paged-rpc-server
|
||||
|
||||
popd
|
||||
|
||||
mkdir -p build/darwin
|
||||
mkdir -p backend-images
|
||||
mkdir -p build/darwin/lib
|
||||
|
||||
cp -rf backend/cpp/llama-cpp-localai-paged/llama-cpp-localai-paged-cpu-all build/darwin/
|
||||
cp -rf backend/cpp/llama-cpp-localai-paged/llama-cpp-localai-paged-grpc build/darwin/
|
||||
cp -rf backend/cpp/llama-cpp-localai-paged/llama-cpp-localai-paged-rpc-server build/darwin/
|
||||
|
||||
# Distribute the shared ggml/llama libraries from the CPU_ALL_VARIANTS build. Unlike the
|
||||
# old fully-static fallback build, these have @rpath install names, so the otool loop below
|
||||
# (which only copies deps that exist on disk) will not pick them up. The split is by suffix:
|
||||
# - ggml emits its loadable backends (per-microarch CPU variants, metal, blas) with a .so
|
||||
# suffix EVEN ON DARWIN. These go in the package ROOT next to the binary, because darwin
|
||||
# run.sh execs the binary directly (no bundled ld.so) so ggml's executable-directory
|
||||
# scan looks there.
|
||||
# - the core libraries (libggml-base/libggml/libllama/libllama-common/libmtmd) use the
|
||||
# platform .dylib suffix and are NEEDED deps; they go in lib/, resolved at load time via
|
||||
# the DYLD_LIBRARY_PATH=lib that run.sh exports. -a preserves the version symlinks.
|
||||
SHLIBS=backend/cpp/llama-cpp-localai-paged/ggml-shared-libs
|
||||
cp -a $SHLIBS/*.so build/darwin/
|
||||
cp -a $SHLIBS/*.dylib build/darwin/lib/
|
||||
|
||||
# Set default additional libs only for Darwin on M chips (arm64)
|
||||
if [[ "$(uname -s)" == "Darwin" && "$(uname -m)" == "arm64" ]]; then
|
||||
ADDITIONAL_LIBS=${ADDITIONAL_LIBS:-$(ls /opt/homebrew/Cellar/protobuf/**/lib/libutf8_validity*.dylib 2>/dev/null)}
|
||||
else
|
||||
ADDITIONAL_LIBS=${ADDITIONAL_LIBS:-""}
|
||||
fi
|
||||
|
||||
for file in $ADDITIONAL_LIBS; do
|
||||
cp -rfv $file build/darwin/lib
|
||||
done
|
||||
|
||||
for file in build/darwin/*; do
|
||||
LIBS="$(otool -L $file | awk 'NR > 1 { system("echo " $1) } ' | xargs echo)"
|
||||
for lib in $LIBS; do
|
||||
# only libraries ending in dylib
|
||||
if [[ "$lib" == *.dylib ]]; then
|
||||
if [ -e "$lib" ]; then
|
||||
cp -rvf "$lib" build/darwin/lib
|
||||
fi
|
||||
fi
|
||||
done
|
||||
done
|
||||
|
||||
echo "--------------------------------"
|
||||
echo "ADDITIONAL_LIBS: $ADDITIONAL_LIBS"
|
||||
echo "--------------------------------"
|
||||
|
||||
echo "Bundled libraries:"
|
||||
ls -la build/darwin/lib
|
||||
|
||||
|
||||
cp -rf backend/cpp/llama-cpp-localai-paged/run.sh build/darwin/
|
||||
|
||||
PLATFORMARCH="${PLATFORMARCH:-darwin/arm64}"
|
||||
|
||||
./local-ai util create-oci-image \
|
||||
build/darwin/. \
|
||||
--output ./backend-images/llama-cpp-localai-paged.tar \
|
||||
--image-name $IMAGE_NAME \
|
||||
--platform $PLATFORMARCH
|
||||
|
||||
rm -rf build/darwin
|
||||
Reference in New Issue
Block a user