feat(paged): restrict llama-cpp-localai-paged to CUDA-only build targets

The paged backend previously built for cublas/cuda, cpu, vulkan, sycl, hipblas and darwin/metal. On non-CUDA targets the patchset's wins are inert: the GDN fusions are gated off (patch 0030) and NVFP4 falls back to dequant, so the backend is neutral-to-negative there (README section 4c). The darwin grpc-server link also fails on undefined upstream server symbols, turning CI red. The backend is both broken and pointless off-CUDA, so ship it CUDA-only.

- backend-matrix.yml: drop the hipblas, sycl f32/f16, cpu amd64/arm64, vulkan amd64/arm64 and metal-darwin rows for this backend; keep the four cublas rows (cuda-12, cuda-13, nvidia-l4t cuda-12 and cuda-13).
- index.yaml: meta-backend (and -development) capabilities are now CUDA-only with default pointing at cuda12 (mirrors faster-qwen3-tts); removed the orphaned cpu/rocm/sycl/vulkan/metal variant entries.
- Removed the now-unused darwin build script and its Makefile target / .NOTPARALLEL entry / backend_build_darwin.yml step.
- Documented the CUDA-only build coverage in the patch README and plan.

Non-CUDA users should use the stock llama-cpp backend.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-27 18:06:58 -04:00 · 2026-06-27 12:29:15 +00:00
parent 9115c2c52c
commit a4e730979d
7 changed files with 25 additions and 299 deletions
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -4928,78 +4928,6 @@ include:
    backend: "llama-cpp-localai-paged"
    dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
    context: "./"
-  - build-type: 'hipblas'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-rocm-hipblas-llama-cpp-localai-paged'
-    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-rocm-amd64'
-    runs-on: 'ubuntu-latest'
-    base-image: "rocm/dev-ubuntu-24.04:7.2.1"
-    skip-drivers: 'false'
-    backend: "llama-cpp-localai-paged"
-    dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'sycl_f32'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-intel-sycl-f32-llama-cpp-localai-paged'
-    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-intel-amd64'
-    runs-on: 'ubuntu-latest'
-    base-image: "intel/oneapi-basekit:2025.3.2-0-devel-ubuntu24.04"
-    skip-drivers: 'false'
-    backend: "llama-cpp-localai-paged"
-    dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'sycl_f16'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-intel-sycl-f16-llama-cpp-localai-paged'
-    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-intel-amd64'
-    runs-on: 'ubuntu-latest'
-    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
-    skip-drivers: 'false'
-    backend: "llama-cpp-localai-paged"
-    dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: ''
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    platform-tag: 'amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-cpu-llama-cpp-localai-paged'
-    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-amd64'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "llama-cpp-localai-paged"
-    dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: ''
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/arm64'
-    platform-tag: 'arm64'
-    tag-latest: 'auto'
-    tag-suffix: '-cpu-llama-cpp-localai-paged'
-    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-arm64'
-    runs-on: 'ubuntu-24.04-arm'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "llama-cpp-localai-paged"
-    dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
-    context: "./"
-    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "12"
    cuda-minor-version: "0"
@@ -5014,36 +4942,6 @@ include:
    dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
    context: "./"
    ubuntu-version: '2204'
-  - build-type: 'vulkan'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    platform-tag: 'amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-vulkan-llama-cpp-localai-paged'
-    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-vulkan-amd64'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "llama-cpp-localai-paged"
-    dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'vulkan'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/arm64'
-    platform-tag: 'arm64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-vulkan-llama-cpp-localai-paged'
-    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-vulkan-arm64'
-    runs-on: 'ubuntu-24.04-arm'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "llama-cpp-localai-paged"
-    dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
-    context: "./"
-    ubuntu-version: '2404'

 # Darwin matrix (consumed by backend-jobs-darwin).
 includeDarwin:
@@ -5071,16 +4969,6 @@ includeDarwin:
  - backend: "llama-cpp"
    tag-suffix: "-metal-darwin-arm64-llama-cpp"
    lang: "go"
-  # llama-cpp-localai-paged on Darwin: same bespoke CPU_ALL_VARIANTS + Metal build
-  # as stock llama-cpp (driven by make backends/llama-cpp-localai-paged-darwin),
-  # reusing backend/cpp/llama-cpp sources, with the paged patch series applied by the wrapper. lang=go selects the
-  # runner/toolchain only; the source path is C++. Metal delivers paged-KV (the
-  # NVFP4 FP4-MMA fast path is CUDA/Blackwell-only) and the GDN/conv fused ops have
-  # no Metal kernel, so a gated-DeltaNet (qwen35) model falls back to the CPU
-  # reference op at runtime (made safe by the fused-op backend gate, patch 0030).
-  - backend: "llama-cpp-localai-paged"
-    tag-suffix: "-metal-darwin-arm64-llama-cpp-localai-paged"
-    lang: "go"
  - backend: "stablediffusion-ggml"
    tag-suffix: "-metal-darwin-arm64-stablediffusion-ggml"
    build-type: "metal"
--- a/.github/workflows/backend_build_darwin.yml
+++ b/.github/workflows/backend_build_darwin.yml
@@ -230,16 +230,6 @@ jobs:
          make protogen-go
          make backends/llama-cpp-darwin

-      # llama-cpp-localai-paged reuses the same bespoke llama-cpp darwin build path
-      # (CPU_ALL_VARIANTS + Metal + otool dylib bundling) via its own wrapper script,
-      # so it gets a dedicated step like stock llama-cpp rather than the generic
-      # build-darwin-go-backend mold.
-      - name: Build ${{ inputs.backend }}-darwin (llama-cpp-localai-paged)
-        if: inputs.backend == 'llama-cpp-localai-paged'
-        run: |
-          make protogen-go
-          make backends/llama-cpp-localai-paged-darwin
-
      - name: Build ds4 backend (Darwin Metal)
        if: inputs.backend == 'ds4'
        run: |
@@ -255,7 +245,7 @@ jobs:
          make backends/privacy-filter-darwin

      - name: Build ${{ inputs.backend }}-darwin
-        if: inputs.backend != 'llama-cpp' && inputs.backend != 'llama-cpp-localai-paged' && inputs.backend != 'ds4' && inputs.backend != 'privacy-filter'
+        if: inputs.backend != 'llama-cpp' && inputs.backend != 'ds4' && inputs.backend != 'privacy-filter'
        run: |
          make protogen-go
          BACKEND=${{ inputs.backend }} BUILD_TYPE=${{ inputs.build-type }} USE_PIP=${{ inputs.use-pip }} make build-darwin-${{ inputs.lang }}-backend