ci(buun-llama-cpp): wire backend into test-extra + build matrix

Adds the buun-llama-cpp backend to the same CI pipelines that turboquant and sherpa-onnx already use:

- scripts/changed-backends.js: path resolution for Dockerfile.buun-llama-cpp, plus fork-of-fork detection (changes under backend/cpp/llama-cpp/ also retrigger the buun pipeline, mirroring how turboquant is handled).
- .github/workflows/test-extra.yml: detect-changes output and a new tests-buun-llama-cpp-grpc job that runs make test-extra-backend-buun-llama-cpp (turbo3 V-cache, same rationale as tests-turboquant-grpc).
- .github/workflows/backend.yml: 9 matrix entries (CUDA 12/13, L4T CUDA 13 ARM64, ROCm, SYCL f32/f16, CPU, L4T ARM64, Vulkan) paired with each existing turboquant entry so image builds have platform parity.

Also updates .agents/ai-coding-assistants.md to clarify that AI agents operating under the human submitter's git identity SHOULD emit Signed-off-by via `git commit -s` (never inventing or guessing another identity) — documents the workflow this PR is using.

Assisted-by: Claude:claude-opus-4-7
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-04 06:44:48 -04:00 · 2026-04-24 12:52:44 +00:00
parent d6bf3a4969
commit 6233feb190
4 changed files with 181 additions and 12 deletions
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
@@ -399,6 +399,19 @@ jobs:
            dockerfile: "./backend/Dockerfile.turboquant"
            context: "./"
            ubuntu-version: '2404'
+          - build-type: 'cublas'
+            cuda-major-version: "12"
+            cuda-minor-version: "8"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-12-buun-llama-cpp'
+            runs-on: 'bigger-runner'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            backend: "buun-llama-cpp"
+            dockerfile: "./backend/Dockerfile.buun-llama-cpp"
+            context: "./"
+            ubuntu-version: '2404'
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "8"
@@ -894,6 +907,19 @@ jobs:
            dockerfile: "./backend/Dockerfile.turboquant"
            context: "./"
            ubuntu-version: '2404'
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-13-buun-llama-cpp'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            backend: "buun-llama-cpp"
+            dockerfile: "./backend/Dockerfile.buun-llama-cpp"
+            context: "./"
+            ubuntu-version: '2404'
          - build-type: 'cublas'
            cuda-major-version: "13"
            cuda-minor-version: "0"
@@ -920,6 +946,19 @@ jobs:
            backend: "turboquant"
            dockerfile: "./backend/Dockerfile.turboquant"
            context: "./"
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/arm64'
+            skip-drivers: 'false'
+            tag-latest: 'auto'
+            tag-suffix: '-nvidia-l4t-cuda-13-arm64-buun-llama-cpp'
+            base-image: "ubuntu:24.04"
+            runs-on: 'ubuntu-24.04-arm'
+            ubuntu-version: '2404'
+            backend: "buun-llama-cpp"
+            dockerfile: "./backend/Dockerfile.buun-llama-cpp"
+            context: "./"
          - build-type: 'cublas'
            cuda-major-version: "13"
            cuda-minor-version: "0"
@@ -1454,6 +1493,19 @@ jobs:
            dockerfile: "./backend/Dockerfile.turboquant"
            context: "./"
            ubuntu-version: '2404'
+          - build-type: 'hipblas'
+            cuda-major-version: ""
+            cuda-minor-version: ""
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-rocm-hipblas-buun-llama-cpp'
+            runs-on: 'ubuntu-latest'
+            base-image: "rocm/dev-ubuntu-24.04:7.2.1"
+            skip-drivers: 'false'
+            backend: "buun-llama-cpp"
+            dockerfile: "./backend/Dockerfile.buun-llama-cpp"
+            context: "./"
+            ubuntu-version: '2404'
          - build-type: 'hipblas'
            cuda-major-version: ""
            cuda-minor-version: ""
@@ -1703,6 +1755,19 @@ jobs:
            dockerfile: "./backend/Dockerfile.turboquant"
            context: "./"
            ubuntu-version: '2404'
+          - build-type: 'sycl_f32'
+            cuda-major-version: ""
+            cuda-minor-version: ""
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-intel-sycl-f32-buun-llama-cpp'
+            runs-on: 'ubuntu-latest'
+            base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
+            skip-drivers: 'false'
+            backend: "buun-llama-cpp"
+            dockerfile: "./backend/Dockerfile.buun-llama-cpp"
+            context: "./"
+            ubuntu-version: '2404'
          - build-type: 'sycl_f16'
            cuda-major-version: ""
            cuda-minor-version: ""
@@ -1729,6 +1794,19 @@ jobs:
            dockerfile: "./backend/Dockerfile.turboquant"
            context: "./"
            ubuntu-version: '2404'
+          - build-type: 'sycl_f16'
+            cuda-major-version: ""
+            cuda-minor-version: ""
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-intel-sycl-f16-buun-llama-cpp'
+            runs-on: 'ubuntu-latest'
+            base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
+            skip-drivers: 'false'
+            backend: "buun-llama-cpp"
+            dockerfile: "./backend/Dockerfile.buun-llama-cpp"
+            context: "./"
+            ubuntu-version: '2404'
          - build-type: 'intel'
            cuda-major-version: ""
            cuda-minor-version: ""
@@ -2134,6 +2212,19 @@ jobs:
            dockerfile: "./backend/Dockerfile.turboquant"
            context: "./"
            ubuntu-version: '2404'
+          - build-type: ''
+            cuda-major-version: ""
+            cuda-minor-version: ""
+            platforms: 'linux/amd64,linux/arm64'
+            tag-latest: 'auto'
+            tag-suffix: '-cpu-buun-llama-cpp'
+            runs-on: 'bigger-runner'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            backend: "buun-llama-cpp"
+            dockerfile: "./backend/Dockerfile.buun-llama-cpp"
+            context: "./"
+            ubuntu-version: '2404'
          - build-type: ''
            cuda-major-version: ""
            cuda-minor-version: ""
@@ -2173,6 +2264,19 @@ jobs:
            dockerfile: "./backend/Dockerfile.turboquant"
            context: "./"
            ubuntu-version: '2204'
+          - build-type: 'cublas'
+            cuda-major-version: "12"
+            cuda-minor-version: "0"
+            platforms: 'linux/arm64'
+            skip-drivers: 'false'
+            tag-latest: 'auto'
+            tag-suffix: '-nvidia-l4t-arm64-buun-llama-cpp'
+            base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
+            runs-on: 'ubuntu-24.04-arm'
+            backend: "buun-llama-cpp"
+            dockerfile: "./backend/Dockerfile.buun-llama-cpp"
+            context: "./"
+            ubuntu-version: '2204'
          - build-type: 'vulkan'
            cuda-major-version: ""
            cuda-minor-version: ""
@@ -2199,6 +2303,19 @@ jobs:
            dockerfile: "./backend/Dockerfile.turboquant"
            context: "./"
            ubuntu-version: '2404'
+          - build-type: 'vulkan'
+            cuda-major-version: ""
+            cuda-minor-version: ""
+            platforms: 'linux/amd64,linux/arm64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-vulkan-buun-llama-cpp'
+            runs-on: 'bigger-runner'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            backend: "buun-llama-cpp"
+            dockerfile: "./backend/Dockerfile.buun-llama-cpp"
+            context: "./"
+            ubuntu-version: '2404'
          # Stablediffusion-ggml
          - build-type: ''
            cuda-major-version: ""
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -32,6 +32,7 @@ jobs:
      llama-cpp: ${{ steps.detect.outputs.llama-cpp }}
      ik-llama-cpp: ${{ steps.detect.outputs.ik-llama-cpp }}
      turboquant: ${{ steps.detect.outputs.turboquant }}
+      buun-llama-cpp: ${{ steps.detect.outputs['buun-llama-cpp'] }}
      vllm: ${{ steps.detect.outputs.vllm }}
      sglang: ${{ steps.detect.outputs.sglang }}
      acestep-cpp: ${{ steps.detect.outputs.acestep-cpp }}
@@ -613,6 +614,30 @@ jobs:
      - name: Build turboquant backend image and run gRPC e2e tests
        run: |
          make test-extra-backend-turboquant
+  tests-buun-llama-cpp-grpc:
+    needs: detect-changes
+    if: needs.detect-changes.outputs['buun-llama-cpp'] == 'true' || needs.detect-changes.outputs.run-all == 'true'
+    runs-on: ubuntu-latest
+    timeout-minutes: 90
+    steps:
+      - name: Clone
+        uses: actions/checkout@v6
+        with:
+          submodules: true
+      - name: Setup Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: '1.25.4'
+      # Exercises the buun-llama-cpp (fork-of-a-fork) backend with the
+      # fork-specific TurboQuant/TCQ KV-cache types. BACKEND_TEST_CACHE_TYPE_V
+      # is set to turbo3 so the test round-trips through the fork's KV
+      # allow-list — picking a stock llama.cpp type would only re-test the
+      # shared code path. DFlash speculative decoding is not exercised here
+      # because the one known public target/drafter pair (Qwen3.5-27B) is too
+      # large for CI.
+      - name: Build buun-llama-cpp backend image and run gRPC e2e tests
+        run: |
+          make test-extra-backend-buun-llama-cpp
  # tests-vllm-grpc is currently disabled in CI.
  #
  # The prebuilt vllm CPU wheel is compiled with AVX-512 VNNI/BF16