mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-27 18:06:58 -04:00
feat(paged): restrict llama-cpp-localai-paged to CUDA-only build targets
The paged backend previously built for cublas/cuda, cpu, vulkan, sycl, hipblas and darwin/metal. On non-CUDA targets the patchset's wins are inert: the GDN fusions are gated off (patch 0030) and NVFP4 falls back to dequant, so the backend is neutral-to-negative there (README section 4c). The darwin grpc-server link also fails on undefined upstream server symbols, turning CI red. The backend is therefore both broken and pointless off-CUDA, so ship it CUDA-only.

- backend-matrix.yml: drop the hipblas, sycl f32/f16, cpu amd64/arm64, vulkan amd64/arm64 and metal-darwin rows for this backend; keep the four cublas rows (cuda-12, cuda-13, nvidia-l4t cuda-12 and cuda-13).
- index.yaml: meta-backend (and -development) capabilities are now CUDA-only with default pointing at cuda12 (mirrors faster-qwen3-tts); removed the orphaned cpu/rocm/sycl/vulkan/metal variant entries.
- Removed the now-unused darwin build script and its Makefile target / .NOTPARALLEL entry / backend_build_darwin.yml step.
- Documented the CUDA-only build coverage in the patch README and plan.

Non-CUDA users should use the stock llama-cpp backend.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
112
.github/backend-matrix.yml
vendored
112
.github/backend-matrix.yml
vendored
@@ -4928,78 +4928,6 @@ include:
|
||||
backend: "llama-cpp-localai-paged"
|
||||
dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
|
||||
context: "./"
|
||||
- build-type: 'hipblas'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-rocm-hipblas-llama-cpp-localai-paged'
|
||||
builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-rocm-amd64'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "rocm/dev-ubuntu-24.04:7.2.1"
|
||||
skip-drivers: 'false'
|
||||
backend: "llama-cpp-localai-paged"
|
||||
dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'sycl_f32'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-intel-sycl-f32-llama-cpp-localai-paged'
|
||||
builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-intel-amd64'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "intel/oneapi-basekit:2025.3.2-0-devel-ubuntu24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "llama-cpp-localai-paged"
|
||||
dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'sycl_f16'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-intel-sycl-f16-llama-cpp-localai-paged'
|
||||
builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-intel-amd64'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "llama-cpp-localai-paged"
|
||||
dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
platform-tag: 'amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-cpu-llama-cpp-localai-paged'
|
||||
builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-amd64'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "llama-cpp-localai-paged"
|
||||
dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/arm64'
|
||||
platform-tag: 'arm64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-cpu-llama-cpp-localai-paged'
|
||||
builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-arm64'
|
||||
runs-on: 'ubuntu-24.04-arm'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "llama-cpp-localai-paged"
|
||||
dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "0"
|
||||
@@ -5014,36 +4942,6 @@ include:
|
||||
dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
|
||||
context: "./"
|
||||
ubuntu-version: '2204'
|
||||
- build-type: 'vulkan'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
platform-tag: 'amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-vulkan-llama-cpp-localai-paged'
|
||||
builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-vulkan-amd64'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "llama-cpp-localai-paged"
|
||||
dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'vulkan'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/arm64'
|
||||
platform-tag: 'arm64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-vulkan-llama-cpp-localai-paged'
|
||||
builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-vulkan-arm64'
|
||||
runs-on: 'ubuntu-24.04-arm'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "llama-cpp-localai-paged"
|
||||
dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
|
||||
# Darwin matrix (consumed by backend-jobs-darwin).
|
||||
includeDarwin:
|
||||
@@ -5071,16 +4969,6 @@ includeDarwin:
|
||||
- backend: "llama-cpp"
|
||||
tag-suffix: "-metal-darwin-arm64-llama-cpp"
|
||||
lang: "go"
|
||||
# llama-cpp-localai-paged on Darwin: same bespoke CPU_ALL_VARIANTS + Metal build
|
||||
# as stock llama-cpp (driven by make backends/llama-cpp-localai-paged-darwin),
|
||||
# reusing backend/cpp/llama-cpp sources, with the paged patch series applied by the wrapper. lang=go selects the
|
||||
# runner/toolchain only; the source path is C++. Metal delivers paged-KV (the
|
||||
# NVFP4 FP4-MMA fast path is CUDA/Blackwell-only) and the GDN/conv fused ops have
|
||||
# no Metal kernel, so a gated-DeltaNet (qwen35) model falls back to the CPU
|
||||
# reference op at runtime (made safe by the fused-op backend gate, patch 0030).
|
||||
- backend: "llama-cpp-localai-paged"
|
||||
tag-suffix: "-metal-darwin-arm64-llama-cpp-localai-paged"
|
||||
lang: "go"
|
||||
- backend: "stablediffusion-ggml"
|
||||
tag-suffix: "-metal-darwin-arm64-stablediffusion-ggml"
|
||||
build-type: "metal"
|
||||
|
||||
12
.github/workflows/backend_build_darwin.yml
vendored
12
.github/workflows/backend_build_darwin.yml
vendored
@@ -230,16 +230,6 @@ jobs:
|
||||
make protogen-go
|
||||
make backends/llama-cpp-darwin
|
||||
|
||||
# llama-cpp-localai-paged reuses the same bespoke llama-cpp darwin build path
|
||||
# (CPU_ALL_VARIANTS + Metal + otool dylib bundling) via its own wrapper script,
|
||||
# so it gets a dedicated step like stock llama-cpp rather than the generic
|
||||
# build-darwin-go-backend mold.
|
||||
- name: Build ${{ inputs.backend }}-darwin (llama-cpp-localai-paged)
|
||||
if: inputs.backend == 'llama-cpp-localai-paged'
|
||||
run: |
|
||||
make protogen-go
|
||||
make backends/llama-cpp-localai-paged-darwin
|
||||
|
||||
- name: Build ds4 backend (Darwin Metal)
|
||||
if: inputs.backend == 'ds4'
|
||||
run: |
|
||||
@@ -255,7 +245,7 @@ jobs:
|
||||
make backends/privacy-filter-darwin
|
||||
|
||||
- name: Build ${{ inputs.backend }}-darwin
|
||||
if: inputs.backend != 'llama-cpp' && inputs.backend != 'llama-cpp-localai-paged' && inputs.backend != 'ds4' && inputs.backend != 'privacy-filter'
|
||||
if: inputs.backend != 'llama-cpp' && inputs.backend != 'ds4' && inputs.backend != 'privacy-filter'
|
||||
run: |
|
||||
make protogen-go
|
||||
BACKEND=${{ inputs.backend }} BUILD_TYPE=${{ inputs.build-type }} USE_PIP=${{ inputs.use-pip }} make build-darwin-${{ inputs.lang }}-backend
|
||||
|
||||
Reference in New Issue
Block a user