From 9bb8994c4e5f814b81f9d79a6a9293f0e75dd241 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Sun, 28 Jun 2026 01:37:54 +0000
Subject: [PATCH] chore(paged): drop CUDA-12 variants of
 llama-cpp-localai-paged, keep CUDA-13 only

The paged backend targets Blackwell sm_121a, which CUDA 12.0 cannot target
at all, so the CUDA-12 variants were pointless. They were also broken: the
cublas-12 / nvidia-l4t / arm64 build failed to compile paged-kv-manager.cpp
("no declaration matches ...", a ~10-function mismatch the older
cuda-12-base gcc rejects). CUDA-13 compiles it fine (confirmed on GB10).

Removed (config-only, scoped to the paged backend):
- backend-matrix.yml: the two CUDA-12 paged rows
  (-gpu-nvidia-cuda-12-llama-cpp-localai-paged,
  -nvidia-l4t-arm64-llama-cpp-localai-paged)
- backend/index.yaml: CUDA-12 capability keys (nvidia-cuda-12,
  nvidia-l4t-cuda-12, nvidia-l4t) on both meta-backends, repointed
  default/nvidia to the cuda13 amd64 variant, and dropped the orphaned
  cuda12-* / nvidia-l4t-arm64-* variant definitions (latest + -development).

Kept CUDA-13 only: cuda13-llama-cpp-localai-paged (amd64) and
cuda13-nvidia-l4t-arm64-llama-cpp-localai-paged (l4t arm64). Matrix
tag-suffixes <-> index variant URIs form a clean 2:2 bijection.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto
---
 .github/backend-matrix.yml | 28 ----------------------------
 backend/index.yaml         | 34 ++++------------------------------
 2 files changed, 4 insertions(+), 58 deletions(-)

diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml
index a616b48f8..497205e80 100644
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -4886,20 +4886,6 @@ include:
   # swapped; builder-base-image is left UNCHANGED so these reuse the same
   # base-grpc-* prebuilt bases (same gRPC + same toolchain), needing no new
   # base-images.yml variant.
-  - build-type: 'cublas'
-    cuda-major-version: "12"
-    cuda-minor-version: "8"
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-nvidia-cuda-12-llama-cpp-localai-paged'
-    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-cuda-12-amd64'
-    runs-on: 'bigger-runner'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "llama-cpp-localai-paged"
-    dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
-    context: "./"
-    ubuntu-version: '2404'
   - build-type: 'cublas'
     cuda-major-version: "13"
     cuda-minor-version: "0"
@@ -4928,20 +4914,6 @@ include:
     backend: "llama-cpp-localai-paged"
     dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
     context: "./"
-  - build-type: 'cublas'
-    cuda-major-version: "12"
-    cuda-minor-version: "0"
-    platforms: 'linux/arm64'
-    skip-drivers: 'false'
-    tag-latest: 'auto'
-    tag-suffix: '-nvidia-l4t-arm64-llama-cpp-localai-paged'
-    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-l4t-cuda-12-arm64'
-    base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
-    runs-on: 'ubuntu-24.04-arm'
-    backend: "llama-cpp-localai-paged"
-    dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
-    context: "./"
-    ubuntu-version: '2204'

 # Darwin matrix (consumed by backend-jobs-darwin).
 includeDarwin:
diff --git a/backend/index.yaml b/backend/index.yaml
index db29d621a..b612bbc1a 100644
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -102,12 +102,9 @@
   # backend. default points at cuda12 (mirrors faster-qwen3-tts) so the gallery
   # entries always resolve to a CUDA variant.
   capabilities:
-    default: "cuda12-llama-cpp-localai-paged"
-    nvidia: "cuda12-llama-cpp-localai-paged"
-    nvidia-l4t: "nvidia-l4t-arm64-llama-cpp-localai-paged"
+    default: "cuda13-llama-cpp-localai-paged"
+    nvidia: "cuda13-llama-cpp-localai-paged"
     nvidia-cuda-13: "cuda13-llama-cpp-localai-paged"
-    nvidia-cuda-12: "cuda12-llama-cpp-localai-paged"
-    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-llama-cpp-localai-paged"
     nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp-localai-paged"
 - &ds4
   name: "ds4"
@@ -1679,12 +1676,9 @@
 - !!merge <<: *llamacpplocalaipaged
   name: "llama-cpp-localai-paged-development"
   capabilities:
-    default: "cuda12-llama-cpp-localai-paged-development"
-    nvidia: "cuda12-llama-cpp-localai-paged-development"
-    nvidia-l4t: "nvidia-l4t-arm64-llama-cpp-localai-paged-development"
+    default: "cuda13-llama-cpp-localai-paged-development"
+    nvidia: "cuda13-llama-cpp-localai-paged-development"
     nvidia-cuda-13: "cuda13-llama-cpp-localai-paged-development"
-    nvidia-cuda-12: "cuda12-llama-cpp-localai-paged-development"
-    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-llama-cpp-localai-paged-development"
     nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp-localai-paged-development"
 - !!merge <<: *ds4
   name: "ds4-development"
@@ -2355,16 +2349,6 @@
   mirrors:
     - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-turboquant
 ## llama-cpp-localai-paged (CUDA-only; see backend/cpp/llama-cpp-localai-paged/README.md section 4c)
-- !!merge <<: *llamacpplocalaipaged
-  name: "cuda12-llama-cpp-localai-paged"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-llama-cpp-localai-paged"
-  mirrors:
-    - localai/localai-backends:latest-gpu-nvidia-cuda-12-llama-cpp-localai-paged
-- !!merge <<: *llamacpplocalaipaged
-  name: "cuda12-llama-cpp-localai-paged-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-llama-cpp-localai-paged"
-  mirrors:
-    - localai/localai-backends:master-gpu-nvidia-cuda-12-llama-cpp-localai-paged
 - !!merge <<: *llamacpplocalaipaged
   name: "cuda13-llama-cpp-localai-paged"
   uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-llama-cpp-localai-paged"
@@ -2375,16 +2359,6 @@
   uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-llama-cpp-localai-paged"
   mirrors:
     - localai/localai-backends:master-gpu-nvidia-cuda-13-llama-cpp-localai-paged
-- !!merge <<: *llamacpplocalaipaged
-  name: "nvidia-l4t-arm64-llama-cpp-localai-paged"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-llama-cpp-localai-paged"
-  mirrors:
-    - localai/localai-backends:latest-nvidia-l4t-arm64-llama-cpp-localai-paged
-- !!merge <<: *llamacpplocalaipaged
-  name: "nvidia-l4t-arm64-llama-cpp-localai-paged-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-llama-cpp-localai-paged"
-  mirrors:
-    - localai/localai-backends:master-nvidia-l4t-arm64-llama-cpp-localai-paged
 - !!merge <<: *llamacpplocalaipaged
   name: "cuda13-nvidia-l4t-arm64-llama-cpp-localai-paged"
   uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-llama-cpp-localai-paged"