From 6233feb19013faa327c785a41c62b19e5dfc43b2 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Fri, 24 Apr 2026 12:52:44 +0000
Subject: [PATCH] ci(buun-llama-cpp): wire backend into test-extra + build
 matrix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds the buun-llama-cpp backend to the same CI pipelines that turboquant
and sherpa-onnx already use:

- scripts/changed-backends.js: path resolution for Dockerfile.buun-llama-cpp,
  plus fork-of-a-fork detection (changes under backend/cpp/llama-cpp/ also
  retrigger the buun pipeline, mirroring how turboquant is handled).
- .github/workflows/test-extra.yml: detect-changes output and a new
  tests-buun-llama-cpp-grpc job that runs make test-extra-backend-buun-llama-cpp
  (turbo3 V-cache, same rationale as tests-turboquant-grpc).
- .github/workflows/backend.yml: 9 matrix entries (CUDA 12/13, L4T CUDA
  13 ARM64, ROCm, SYCL f32/f16, CPU, L4T ARM64, Vulkan) paired with each
  existing turboquant entry so image builds have platform parity.

Also updates .agents/ai-coding-assistants.md to clarify that AI agents
operating under the human submitter's git identity SHOULD emit
Signed-off-by via `git commit -s` (never inventing or guessing another
identity). This documents the workflow this PR itself uses.

Assisted-by: Claude:claude-opus-4-7
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---
 .agents/ai-coding-assistants.md  |  38 +++++++---
 .github/workflows/backend.yml    | 117 +++++++++++++++++++++++++++++++
 .github/workflows/test-extra.yml |  25 +++++++
 scripts/changed-backends.js      |  13 +++-
 4 files changed, 181 insertions(+), 12 deletions(-)

diff --git a/.agents/ai-coding-assistants.md b/.agents/ai-coding-assistants.md
index d0d9c882c..0f94c70cf 100644
--- a/.agents/ai-coding-assistants.md
+++ b/.agents/ai-coding-assistants.md
@@ -35,19 +35,33 @@ All contributions must comply with LocalAI's licensing requirements:
 
 ## Signed-off-by and Developer Certificate of Origin
 
-**AI agents MUST NOT add `Signed-off-by` tags.** Only humans can legally
-certify the Developer Certificate of Origin (DCO). The human submitter
-is responsible for:
+Only humans can certify the Developer Certificate of Origin (DCO). AI
+agents MUST NOT invent or guess a human identity for `Signed-off-by` —
+doing so forges the DCO certification.
 
-- Reviewing all AI-generated code
+However, when a human operator explicitly directs the AI to commit on
+their behalf, the AI is acting as a typing tool — no different from an
+editor macro or `git commit -s`. In that case the AI SHOULD add
+`Signed-off-by:` using the **configured `user.name` / `user.email`** of
+the current git repository (i.e. the operator's own identity). The
+resulting trailer is the operator's signature; they take responsibility
+for it by reviewing and pushing the commit. The AI MUST NOT use any
+other identity and MUST NOT add its own name to the sign-off.
+
+When running `git commit`, prefer `git commit --signoff` (or `-s`) so
+the trailer is emitted by git itself from the configured identity,
+rather than hand-writing it in a heredoc — this guarantees the sign-off
+matches whatever identity the operator is currently using.
+
+The human submitter remains responsible for:
+
+- Reviewing all AI-generated code before it's pushed or merged
 - Ensuring compliance with licensing requirements
-- Adding their own `Signed-off-by` tag (when the project requires DCO)
-  to certify the contribution
 - Taking full responsibility for the contribution
 
-AI agents MUST NOT add `Co-Authored-By` trailers for themselves either.
-A human reviewer owns the contribution; the AI's involvement is recorded
-via `Assisted-by` (see below).
+AI agents MUST NOT add `Co-Authored-By` trailers for themselves. A human
+reviewer owns the contribution; the AI's involvement is recorded via
+`Assisted-by` (see below).
 
 ## Attribution
 
@@ -84,6 +98,12 @@ Assisted-by: Claude:claude-opus-4-7 golangci-lint
 Signed-off-by: Jane Developer <jane@example.com>
 ```
 
+The `Signed-off-by` line uses Jane's own identity because Jane is the
+submitter operating the AI. If Jane asks Claude to create the commit via
+`git commit -s`, git emits that exact trailer from Jane's configured
+identity — no separate human step is needed beyond Jane reviewing the
+diff before pushing.
+
 ## Scope and Responsibility
 
 Using an AI assistant does not reduce the contributor's responsibility.
diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml
index 9ceb9d4f9..b370f0644 100644
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
@@ -399,6 +399,19 @@ jobs:
             dockerfile: "./backend/Dockerfile.turboquant"
             context: "./"
             ubuntu-version: '2404'
+          - build-type: 'cublas'
+            cuda-major-version: "12"
+            cuda-minor-version: "8"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-12-buun-llama-cpp'
+            runs-on: 'bigger-runner'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            backend: "buun-llama-cpp"
+            dockerfile: "./backend/Dockerfile.buun-llama-cpp"
+            context: "./"
+            ubuntu-version: '2404'
           - build-type: 'cublas'
             cuda-major-version: "12"
             cuda-minor-version: "8"
@@ -894,6 +907,19 @@ jobs:
             dockerfile: "./backend/Dockerfile.turboquant"
             context: "./"
             ubuntu-version: '2404'
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-13-buun-llama-cpp'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            backend: "buun-llama-cpp"
+            dockerfile: "./backend/Dockerfile.buun-llama-cpp"
+            context: "./"
+            ubuntu-version: '2404'
           - build-type: 'cublas'
             cuda-major-version: "13"
             cuda-minor-version: "0"
@@ -920,6 +946,19 @@ jobs:
             backend: "turboquant"
             dockerfile: "./backend/Dockerfile.turboquant"
             context: "./"
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/arm64'
+            skip-drivers: 'false'
+            tag-latest: 'auto'
+            tag-suffix: '-nvidia-l4t-cuda-13-arm64-buun-llama-cpp'
+            base-image: "ubuntu:24.04"
+            runs-on: 'ubuntu-24.04-arm'
+            ubuntu-version: '2404'
+            backend: "buun-llama-cpp"
+            dockerfile: "./backend/Dockerfile.buun-llama-cpp"
+            context: "./"
           - build-type: 'cublas'
             cuda-major-version: "13"
             cuda-minor-version: "0"
@@ -1454,6 +1493,19 @@ jobs:
             dockerfile: "./backend/Dockerfile.turboquant"
             context: "./"
             ubuntu-version: '2404'
+          - build-type: 'hipblas'
+            cuda-major-version: ""
+            cuda-minor-version: ""
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-rocm-hipblas-buun-llama-cpp'
+            runs-on: 'ubuntu-latest'
+            base-image: "rocm/dev-ubuntu-24.04:7.2.1"
+            skip-drivers: 'false'
+            backend: "buun-llama-cpp"
+            dockerfile: "./backend/Dockerfile.buun-llama-cpp"
+            context: "./"
+            ubuntu-version: '2404'
           - build-type: 'hipblas'
             cuda-major-version: ""
             cuda-minor-version: ""
@@ -1703,6 +1755,19 @@ jobs:
             dockerfile: "./backend/Dockerfile.turboquant"
             context: "./"
             ubuntu-version: '2404'
+          - build-type: 'sycl_f32'
+            cuda-major-version: ""
+            cuda-minor-version: ""
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-intel-sycl-f32-buun-llama-cpp'
+            runs-on: 'ubuntu-latest'
+            base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
+            skip-drivers: 'false'
+            backend: "buun-llama-cpp"
+            dockerfile: "./backend/Dockerfile.buun-llama-cpp"
+            context: "./"
+            ubuntu-version: '2404'
           - build-type: 'sycl_f16'
             cuda-major-version: ""
             cuda-minor-version: ""
@@ -1729,6 +1794,19 @@ jobs:
             dockerfile: "./backend/Dockerfile.turboquant"
             context: "./"
             ubuntu-version: '2404'
+          - build-type: 'sycl_f16'
+            cuda-major-version: ""
+            cuda-minor-version: ""
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-intel-sycl-f16-buun-llama-cpp'
+            runs-on: 'ubuntu-latest'
+            base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
+            skip-drivers: 'false'
+            backend: "buun-llama-cpp"
+            dockerfile: "./backend/Dockerfile.buun-llama-cpp"
+            context: "./"
+            ubuntu-version: '2404'
           - build-type: 'intel'
             cuda-major-version: ""
             cuda-minor-version: ""
@@ -2134,6 +2212,19 @@ jobs:
             dockerfile: "./backend/Dockerfile.turboquant"
             context: "./"
             ubuntu-version: '2404'
+          - build-type: ''
+            cuda-major-version: ""
+            cuda-minor-version: ""
+            platforms: 'linux/amd64,linux/arm64'
+            tag-latest: 'auto'
+            tag-suffix: '-cpu-buun-llama-cpp'
+            runs-on: 'bigger-runner'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            backend: "buun-llama-cpp"
+            dockerfile: "./backend/Dockerfile.buun-llama-cpp"
+            context: "./"
+            ubuntu-version: '2404'
           - build-type: ''
             cuda-major-version: ""
             cuda-minor-version: ""
@@ -2173,6 +2264,19 @@ jobs:
             dockerfile: "./backend/Dockerfile.turboquant"
             context: "./"
             ubuntu-version: '2204'
+          - build-type: 'cublas'
+            cuda-major-version: "12"
+            cuda-minor-version: "0"
+            platforms: 'linux/arm64'
+            skip-drivers: 'false'
+            tag-latest: 'auto'
+            tag-suffix: '-nvidia-l4t-arm64-buun-llama-cpp'
+            base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
+            runs-on: 'ubuntu-24.04-arm'
+            backend: "buun-llama-cpp"
+            dockerfile: "./backend/Dockerfile.buun-llama-cpp"
+            context: "./"
+            ubuntu-version: '2204'
           - build-type: 'vulkan'
             cuda-major-version: ""
             cuda-minor-version: ""
@@ -2199,6 +2303,19 @@ jobs:
             dockerfile: "./backend/Dockerfile.turboquant"
             context: "./"
             ubuntu-version: '2404'
+          - build-type: 'vulkan'
+            cuda-major-version: ""
+            cuda-minor-version: ""
+            platforms: 'linux/amd64,linux/arm64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-vulkan-buun-llama-cpp'
+            runs-on: 'bigger-runner'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            backend: "buun-llama-cpp"
+            dockerfile: "./backend/Dockerfile.buun-llama-cpp"
+            context: "./"
+            ubuntu-version: '2404'
           # Stablediffusion-ggml
           - build-type: ''
             cuda-major-version: ""
diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml
index 67ab16938..b4356c456 100644
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -32,6 +32,7 @@ jobs:
       llama-cpp: ${{ steps.detect.outputs.llama-cpp }}
       ik-llama-cpp: ${{ steps.detect.outputs.ik-llama-cpp }}
       turboquant: ${{ steps.detect.outputs.turboquant }}
+      buun-llama-cpp: ${{ steps.detect.outputs['buun-llama-cpp'] }}
       vllm: ${{ steps.detect.outputs.vllm }}
       sglang: ${{ steps.detect.outputs.sglang }}
       acestep-cpp: ${{ steps.detect.outputs.acestep-cpp }}
@@ -613,6 +614,30 @@ jobs:
       - name: Build turboquant backend image and run gRPC e2e tests
         run: |
           make test-extra-backend-turboquant
+  tests-buun-llama-cpp-grpc:
+    needs: detect-changes
+    if: needs.detect-changes.outputs['buun-llama-cpp'] == 'true' || needs.detect-changes.outputs.run-all == 'true'
+    runs-on: ubuntu-latest
+    timeout-minutes: 90
+    steps:
+      - name: Clone
+        uses: actions/checkout@v6
+        with:
+          submodules: true
+      - name: Setup Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: '1.25.4'
+      # Exercises the buun-llama-cpp (fork-of-a-fork) backend with the
+      # fork-specific TurboQuant/TCQ KV-cache types. BACKEND_TEST_CACHE_TYPE_V
+      # is set to turbo3 so the test round-trips through the fork's KV
+      # allow-list — picking a stock llama.cpp type would only re-test the
+      # shared code path. DFlash speculative decoding is not exercised here
+      # because the one known public target/drafter pair (Qwen3.5-27B) is too
+      # large for CI.
+      - name: Build buun-llama-cpp backend image and run gRPC e2e tests
+        run: |
+          make test-extra-backend-buun-llama-cpp
   # tests-vllm-grpc is currently disabled in CI.
   #
   # The prebuilt vllm CPU wheel is compiled with AVX-512 VNNI/BF16
diff --git a/scripts/changed-backends.js b/scripts/changed-backends.js
index 7ad770af9..13f8b143c 100644
--- a/scripts/changed-backends.js
+++ b/scripts/changed-backends.js
@@ -32,6 +32,12 @@ function inferBackendPath(item) {
     // via a thin wrapper Makefile. Changes to either dir should retrigger it.
     return `backend/cpp/turboquant/`;
   }
+  if (item.dockerfile.endsWith("buun-llama-cpp")) {
+    // buun-llama-cpp is a fork-of-a-fork (spiritbuun/buun-llama-cpp forks
+    // TheTom/llama-cpp-turboquant) that reuses backend/cpp/llama-cpp sources
+    // the same way turboquant does. Changes to either dir retrigger it.
+    return `backend/cpp/buun-llama-cpp/`;
+  }
   if (item.dockerfile.endsWith("llama-cpp")) {
     return `backend/cpp/llama-cpp/`;
   }
@@ -138,9 +144,10 @@ async function getChangedFiles() {
   // Per-backend boolean outputs
   for (const [backend, pathPrefix] of allBackendPaths) {
     let changed = changedFiles.some(file => file.startsWith(pathPrefix));
-    // turboquant reuses backend/cpp/llama-cpp sources via a thin wrapper;
-    // changes to either directory should retrigger its pipeline.
-    if (backend === "turboquant" && !changed) {
+    // turboquant and buun-llama-cpp reuse backend/cpp/llama-cpp sources via
+    // thin wrapper Makefiles; changes to that directory should retrigger
+    // their pipelines too.
+    if ((backend === "turboquant" || backend === "buun-llama-cpp") && !changed) {
       changed = changedFiles.some(file => file.startsWith("backend/cpp/llama-cpp/"));
     }
     fs.appendFileSync(process.env.GITHUB_OUTPUT, `${backend}=${changed ? 'true' : 'false'}\n`);