From df86e8d6d4b14518d6cc79948845afe495649101 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Sat, 6 Jun 2026 22:43:47 +0000
Subject: [PATCH] ci(turboquant): drop the ROCm/hipblas build flavor

The TheTom/llama-cpp-turboquant fork is not ROCm-clean at the current
pin: beyond the CUDA-API gaps already patched (3D-peer copy,
cudaEventCreate), its llama.cpp base fails to compile the
flash-attention MMA f16 kernels for head-dim 640 under HIP
(cols_per_warp evaluates to 0 -> division-by-zero / non-constant static
asserts in fattn-mma-f16.cuh). That is a deep ggml-on-ROCm kernel issue,
not something a small fork patch can paper over.

Drop -gpu-rocm-hipblas-turboquant from the build matrix so turboquant
still ships for cpu / cublas / vulkan / sycl. Re-add it once the fork's
HIP path compiles (or upstream ggml fixes the large-head-dim MMA kernels
for ROCm).

Signed-off-by: Ettore Di Giacinto
Assisted-by: Claude:claude-opus-4-8 [Claude Code]
---
 .github/backend-matrix.yml | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml
index 9d7e36259..464ffc36c 100644
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -1766,20 +1766,6 @@ include:
     dockerfile: "./backend/Dockerfile.llama-cpp"
     context: "./"
     ubuntu-version: '2404'
-  - build-type: 'hipblas'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-rocm-hipblas-turboquant'
-    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-rocm-amd64'
-    runs-on: 'ubuntu-latest'
-    base-image: "rocm/dev-ubuntu-24.04:7.2.1"
-    skip-drivers: 'false'
-    backend: "turboquant"
-    dockerfile: "./backend/Dockerfile.turboquant"
-    context: "./"
-    ubuntu-version: '2404'
   - build-type: 'hipblas'
     cuda-major-version: ""
     cuda-minor-version: ""