From 6233feb19013faa327c785a41c62b19e5dfc43b2 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 24 Apr 2026 12:52:44 +0000 Subject: [PATCH] ci(buun-llama-cpp): wire backend into test-extra + build matrix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the buun-llama-cpp backend to the same CI pipelines that turboquant and sherpa-onnx already use: - scripts/changed-backends.js: path resolution for Dockerfile.buun-llama-cpp, plus fork-of-fork detection (changes under backend/cpp/llama-cpp/ also retrigger the buun pipeline, mirroring how turboquant is handled). - .github/workflows/test-extra.yml: detect-changes output and a new tests-buun-llama-cpp-grpc job that runs make test-extra-backend-buun-llama-cpp (turbo3 V-cache, same rationale as tests-turboquant-grpc). - .github/workflows/backend.yml: 9 matrix entries (CUDA 12/13, L4T CUDA 13 ARM64, ROCm, SYCL f32/f16, CPU, L4T ARM64, Vulkan) paired with each existing turboquant entry so image builds have platform parity. Also updates .agents/ai-coding-assistants.md to clarify that AI agents operating under the human submitter's git identity SHOULD emit Signed-off-by via `git commit -s` (never inventing or guessing another identity) — documents the workflow this PR is using. Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Ettore Di Giacinto --- .agents/ai-coding-assistants.md | 38 +++++++--- .github/workflows/backend.yml | 117 +++++++++++++++++++++++++++++++ .github/workflows/test-extra.yml | 25 +++++++ scripts/changed-backends.js | 13 +++- 4 files changed, 181 insertions(+), 12 deletions(-) diff --git a/.agents/ai-coding-assistants.md b/.agents/ai-coding-assistants.md index d0d9c882c..0f94c70cf 100644 --- a/.agents/ai-coding-assistants.md +++ b/.agents/ai-coding-assistants.md @@ -35,19 +35,33 @@ All contributions must comply with LocalAI's licensing requirements: ## Signed-off-by and Developer Certificate of Origin -**AI agents MUST NOT add `Signed-off-by` tags.** Only humans can legally -certify the Developer Certificate of Origin (DCO). The human submitter -is responsible for: +Only humans can certify the Developer Certificate of Origin (DCO). AI +agents MUST NOT invent or guess a human identity for `Signed-off-by` — +doing so forges the DCO certification. -- Reviewing all AI-generated code +However, when a human operator explicitly directs the AI to commit on +their behalf, the AI is acting as a typing tool — no different from an +editor macro or `git commit -s`. In that case the AI SHOULD add +`Signed-off-by:` using the **configured `user.name` / `user.email`** of +the current git repository (i.e. the operator's own identity). The +resulting trailer is the operator's signature; they take responsibility +for it by reviewing and pushing the commit. The AI MUST NOT use any +other identity and MUST NOT add its own name to the sign-off. + +When running `git commit`, prefer `git commit --signoff` (or `-s`) so +the trailer is emitted by git itself from the configured identity, +rather than hand-writing it in a heredoc — this guarantees the sign-off +matches whatever identity the operator is currently using. + +The human submitter remains responsible for: + +- Reviewing all AI-generated code before it's pushed or merged - Ensuring compliance with licensing requirements -- Adding their own `Signed-off-by` tag (when the project requires DCO) - to certify the contribution - Taking full responsibility for the contribution -AI agents MUST NOT add `Co-Authored-By` trailers for themselves either. -A human reviewer owns the contribution; the AI's involvement is recorded -via `Assisted-by` (see below). +AI agents MUST NOT add `Co-Authored-By` trailers for themselves. A human +reviewer owns the contribution; the AI's involvement is recorded via +`Assisted-by` (see below). ## Attribution @@ -84,6 +98,12 @@ Assisted-by: Claude:claude-opus-4-7 golangci-lint Signed-off-by: Jane Developer ``` +The `Signed-off-by` line uses Jane's own identity because Jane is the +submitter operating the AI. If Jane asks Claude to create the commit via +`git commit -s`, git emits that exact trailer from Jane's configured +identity — no separate human step is needed beyond Jane reviewing the +diff before pushing. + ## Scope and Responsibility Using an AI assistant does not reduce the contributor's responsibility. diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index 9ceb9d4f9..b370f0644 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -399,6 +399,19 @@ jobs: dockerfile: "./backend/Dockerfile.turboquant" context: "./" ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "8" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-12-buun-llama-cpp' + runs-on: 'bigger-runner' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "buun-llama-cpp" + dockerfile: "./backend/Dockerfile.buun-llama-cpp" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "12" cuda-minor-version: "8" @@ -894,6 +907,19 @@ jobs: dockerfile: "./backend/Dockerfile.turboquant" context: "./" ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-13-buun-llama-cpp' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "buun-llama-cpp" + dockerfile: "./backend/Dockerfile.buun-llama-cpp" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "13" cuda-minor-version: "0" @@ -920,6 +946,19 @@ jobs: backend: "turboquant" dockerfile: "./backend/Dockerfile.turboquant" context: "./" + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/arm64' + skip-drivers: 'false' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-cuda-13-arm64-buun-llama-cpp' + base-image: "ubuntu:24.04" + runs-on: 'ubuntu-24.04-arm' + ubuntu-version: '2404' + backend: "buun-llama-cpp" + dockerfile: "./backend/Dockerfile.buun-llama-cpp" + context: "./" - build-type: 'cublas' cuda-major-version: "13" cuda-minor-version: "0" @@ -1454,6 +1493,19 @@ jobs: dockerfile: "./backend/Dockerfile.turboquant" context: "./" ubuntu-version: '2404' + - build-type: 'hipblas' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-rocm-hipblas-buun-llama-cpp' + runs-on: 'ubuntu-latest' + base-image: "rocm/dev-ubuntu-24.04:7.2.1" + skip-drivers: 'false' + backend: "buun-llama-cpp" + dockerfile: "./backend/Dockerfile.buun-llama-cpp" + context: "./" + ubuntu-version: '2404' - build-type: 'hipblas' cuda-major-version: "" cuda-minor-version: "" @@ -1703,6 +1755,19 @@ jobs: dockerfile: "./backend/Dockerfile.turboquant" context: "./" ubuntu-version: '2404' + - build-type: 'sycl_f32' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-intel-sycl-f32-buun-llama-cpp' + runs-on: 'ubuntu-latest' + base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04" + skip-drivers: 'false' + backend: "buun-llama-cpp" + dockerfile: "./backend/Dockerfile.buun-llama-cpp" + context: "./" + ubuntu-version: '2404' - build-type: 'sycl_f16' cuda-major-version: "" cuda-minor-version: "" @@ -1729,6 +1794,19 @@ jobs: dockerfile: "./backend/Dockerfile.turboquant" context: "./" ubuntu-version: '2404' + - build-type: 'sycl_f16' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-intel-sycl-f16-buun-llama-cpp' + runs-on: 'ubuntu-latest' + base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04" + skip-drivers: 'false' + backend: "buun-llama-cpp" + dockerfile: "./backend/Dockerfile.buun-llama-cpp" + context: "./" + ubuntu-version: '2404' - build-type: 'intel' cuda-major-version: "" cuda-minor-version: "" @@ -2134,6 +2212,19 @@ jobs: dockerfile: "./backend/Dockerfile.turboquant" context: "./" ubuntu-version: '2404' + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64,linux/arm64' + tag-latest: 'auto' + tag-suffix: '-cpu-buun-llama-cpp' + runs-on: 'bigger-runner' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "buun-llama-cpp" + dockerfile: "./backend/Dockerfile.buun-llama-cpp" + context: "./" + ubuntu-version: '2404' - build-type: '' cuda-major-version: "" cuda-minor-version: "" @@ -2173,6 +2264,19 @@ jobs: dockerfile: "./backend/Dockerfile.turboquant" context: "./" ubuntu-version: '2204' + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "0" + platforms: 'linux/arm64' + skip-drivers: 'false' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-arm64-buun-llama-cpp' + base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0" + runs-on: 'ubuntu-24.04-arm' + backend: "buun-llama-cpp" + dockerfile: "./backend/Dockerfile.buun-llama-cpp" + context: "./" + ubuntu-version: '2204' - build-type: 'vulkan' cuda-major-version: "" cuda-minor-version: "" @@ -2199,6 +2303,19 @@ jobs: dockerfile: "./backend/Dockerfile.turboquant" context: "./" ubuntu-version: '2404' + - build-type: 'vulkan' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64,linux/arm64' + tag-latest: 'auto' + tag-suffix: '-gpu-vulkan-buun-llama-cpp' + runs-on: 'bigger-runner' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "buun-llama-cpp" + dockerfile: "./backend/Dockerfile.buun-llama-cpp" + context: "./" + ubuntu-version: '2404' # Stablediffusion-ggml - build-type: '' cuda-major-version: "" diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml index 67ab16938..b4356c456 100644 --- a/.github/workflows/test-extra.yml +++ b/.github/workflows/test-extra.yml @@ -32,6 +32,7 @@ jobs: llama-cpp: ${{ steps.detect.outputs.llama-cpp }} ik-llama-cpp: ${{ steps.detect.outputs.ik-llama-cpp }} turboquant: ${{ steps.detect.outputs.turboquant }} + buun-llama-cpp: ${{ steps.detect.outputs['buun-llama-cpp'] }} vllm: ${{ steps.detect.outputs.vllm }} sglang: ${{ steps.detect.outputs.sglang }} acestep-cpp: ${{ steps.detect.outputs.acestep-cpp }} @@ -613,6 +614,30 @@ jobs: - name: Build turboquant backend image and run gRPC e2e tests run: | make test-extra-backend-turboquant + tests-buun-llama-cpp-grpc: + needs: detect-changes + if: needs.detect-changes.outputs['buun-llama-cpp'] == 'true' || needs.detect-changes.outputs.run-all == 'true' + runs-on: ubuntu-latest + timeout-minutes: 90 + steps: + - name: Clone + uses: actions/checkout@v6 + with: + submodules: true + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version: '1.25.4' + # Exercises the buun-llama-cpp (fork-of-a-fork) backend with the + # fork-specific TurboQuant/TCQ KV-cache types. BACKEND_TEST_CACHE_TYPE_V + # is set to turbo3 so the test round-trips through the fork's KV + # allow-list — picking a stock llama.cpp type would only re-test the + # shared code path. DFlash speculative decoding is not exercised here + # because the one known public target/drafter pair (Qwen3.5-27B) is too + # large for CI. + - name: Build buun-llama-cpp backend image and run gRPC e2e tests + run: | + make test-extra-backend-buun-llama-cpp # tests-vllm-grpc is currently disabled in CI. # # The prebuilt vllm CPU wheel is compiled with AVX-512 VNNI/BF16 diff --git a/scripts/changed-backends.js b/scripts/changed-backends.js index 7ad770af9..13f8b143c 100644 --- a/scripts/changed-backends.js +++ b/scripts/changed-backends.js @@ -32,6 +32,12 @@ function inferBackendPath(item) { // via a thin wrapper Makefile. Changes to either dir should retrigger it. return `backend/cpp/turboquant/`; } + if (item.dockerfile.endsWith("buun-llama-cpp")) { + // buun-llama-cpp is a fork-of-a-fork (spiritbuun/buun-llama-cpp forks + // TheTom/llama-cpp-turboquant) that reuses backend/cpp/llama-cpp sources + // the same way turboquant does. Changes to either dir retrigger it. + return `backend/cpp/buun-llama-cpp/`; + } if (item.dockerfile.endsWith("llama-cpp")) { return `backend/cpp/llama-cpp/`; } @@ -138,9 +144,10 @@ async function getChangedFiles() { // Per-backend boolean outputs for (const [backend, pathPrefix] of allBackendPaths) { let changed = changedFiles.some(file => file.startsWith(pathPrefix)); - // turboquant reuses backend/cpp/llama-cpp sources via a thin wrapper; - // changes to either directory should retrigger its pipeline. - if (backend === "turboquant" && !changed) { + // turboquant and buun-llama-cpp reuse backend/cpp/llama-cpp sources via + // thin wrapper Makefiles; changes to that directory should retrigger + // their pipelines too. + if ((backend === "turboquant" || backend === "buun-llama-cpp") && !changed) { changed = changedFiles.some(file => file.startsWith("backend/cpp/llama-cpp/")); } fs.appendFileSync(process.env.GITHUB_OUTPUT, `${backend}=${changed ? 'true' : 'false'}\n`);