From 35f6db8c76cdc658abc3c356f6193ecccdcd97df Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sun, 10 May 2026 18:15:53 +0200
Subject: [PATCH] ci: split backend-jobs into single-arch and multi-arch
 matrices (#9746)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Symptom (run 25612992409): backend-merge-jobs failed with
"quay.io/go-skynet/local-ai-backends@sha256:fdbd93ca...: not found"
even though the per-arch build for -cpu-llama-cpp pushed that exact
digest 14h31m earlier.

Root cause: backend-merge-jobs was gated on the WHOLE backend-jobs
matrix (`needs: backend-jobs`). The multi-arch -cpu-llama-cpp legs
finished within 30 min, but a single-arch CUDA-12-llama-cpp slot in
the same matrix queued for ~8h (max-parallel: 8 throttle) and then
took ~6h to build cold. By the time it freed the merge to run, quay's
GC had reaped the per-arch digests pushed by the fast multi-arch legs
the day before.

Fix: split the linux backend matrix in two.

  backend-jobs-multiarch  - entries with `platform-tag` set (paired
    per-arch legs that feed backend-merge-jobs).
  backend-jobs-singlearch - entries without `platform-tag` (heavy
    standalone builds: CUDA, ROCm, Intel oneAPI, vLLM, sglang, etc.).

backend-merge-jobs now `needs:` only backend-jobs-multiarch. The
multi-arch matrix completes in ~2-3h, well inside quay's GC window.
Heavy single-arch entries keep running independently with no merge
dependency.

scripts/changed-backends.js gains a splitByArch() helper that
partitions filtered entries by whether `platform-tag` is set, and
emits matrix-singlearch + matrix-multiarch + has-backends-singlearch
+ has-backends-multiarch outputs (replacing the previous combined
matrix / has-backends pair). Applied in both the full-matrix and
filtered-matrix code paths. Smoke test: 199 single-arch + 72 multi-arch
= 271 linux entries (plus 35 darwin); 36 merge-matrix entries (one per
multi-arch backend pair). Matches expectation.

Local `make backends/<name>` is unaffected — the script's outputs
only feed CI workflow matrices.

Assisted-by: Claude:claude-opus-4-7

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 .github/workflows/backend.yml    | 70 ++++++++++++++++++++++++++++----
 .github/workflows/backend_pr.yml | 52 +++++++++++++++++++-----
 scripts/changed-backends.js      | 32 ++++++++++++---
 3 files changed, 129 insertions(+), 25 deletions(-)
diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml
index 6e6c4ab33..b09e14d90 100644
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
@@ -32,11 +32,13 @@ jobs:
     if: github.repository == 'mudler/LocalAI'
     runs-on: ubuntu-latest
     outputs:
-      matrix: ${{ steps.set-matrix.outputs.matrix }}
-      matrix-darwin: ${{ steps.set-matrix.outputs.matrix-darwin }}
+      matrix-singlearch: ${{ steps.set-matrix.outputs['matrix-singlearch'] }}
+      matrix-multiarch: ${{ steps.set-matrix.outputs['matrix-multiarch'] }}
+      matrix-darwin: ${{ steps.set-matrix.outputs['matrix-darwin'] }}
       merge-matrix: ${{ steps.set-matrix.outputs['merge-matrix'] }}
-      has-backends: ${{ steps.set-matrix.outputs.has-backends }}
-      has-backends-darwin: ${{ steps.set-matrix.outputs.has-backends-darwin }}
+      has-backends-singlearch: ${{ steps.set-matrix.outputs['has-backends-singlearch'] }}
+      has-backends-multiarch: ${{ steps.set-matrix.outputs['has-backends-multiarch'] }}
+      has-backends-darwin: ${{ steps.set-matrix.outputs['has-backends-darwin'] }}
       has-merges: ${{ steps.set-matrix.outputs['has-merges'] }}
     steps:
       - name: Checkout repository
@@ -53,6 +55,9 @@ jobs:
       # Filter the backend matrix from .github/backend-matrix.yml against the
       # files changed by this push. Tag pushes set FORCE_ALL=true so the script
       # falls through to the full matrix (releases must rebuild everything).
+      # The script splits the linux matrix into single-arch and multi-arch
+      # groups so backend-merge-jobs can `needs:` only the multi-arch one —
+      # see the comment block above the merge job for context.
       - name: Filter matrix for changed backends
         id: set-matrix
         env:
@@ -61,9 +66,14 @@ jobs:
           FORCE_ALL: ${{ startsWith(github.ref, 'refs/tags/') && 'true' || 'false' }}
         run: bun run scripts/changed-backends.js
 
-  backend-jobs:
+  # Multi-arch backends — entries with a `platform-tag` set, paired with a
+  # sibling entry sharing the same `tag-suffix` (one amd64 leg, one arm64
+  # leg). Their digests are the inputs to backend-merge-jobs, so they're in
+  # their own matrix to bound how long the merge waits before quay GCs the
+  # untagged digests.
+  backend-jobs-multiarch:
     needs: generate-matrix
-    if: needs.generate-matrix.outputs.has-backends == 'true'
+    if: needs.generate-matrix.outputs['has-backends-multiarch'] == 'true'
     uses: ./.github/workflows/backend_build.yml
     with:
       tag-latest: ${{ matrix.tag-latest }}
@@ -90,11 +100,53 @@ jobs:
     strategy:
       fail-fast: false
       max-parallel: 8
-      matrix: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
+      matrix: ${{ fromJson(needs.generate-matrix.outputs['matrix-multiarch']) }}
 
+  # Single-arch backends — no `platform-tag`. Heavy ones (CUDA, ROCm, Intel
+  # oneAPI, vLLM/sglang) live here. Independent of the merge job: they can
+  # take their full ~6h cold without blocking manifest assembly for the
+  # multi-arch backends whose per-arch digests would otherwise sit untagged
+  # on quay long enough to be GC'd.
+  backend-jobs-singlearch:
+    needs: generate-matrix
+    if: needs.generate-matrix.outputs['has-backends-singlearch'] == 'true'
+    uses: ./.github/workflows/backend_build.yml
+    with:
+      tag-latest: ${{ matrix.tag-latest }}
+      tag-suffix: ${{ matrix.tag-suffix }}
+      build-type: ${{ matrix.build-type }}
+      cuda-major-version: ${{ matrix.cuda-major-version }}
+      cuda-minor-version: ${{ matrix.cuda-minor-version }}
+      platforms: ${{ matrix.platforms }}
+      platform-tag: ${{ matrix.platform-tag || '' }}
+      runs-on: ${{ matrix.runs-on }}
+      builder-base-image: ${{ matrix.builder-base-image || '' }}
+      base-image: ${{ matrix.base-image }}
+      backend: ${{ matrix.backend }}
+      dockerfile: ${{ matrix.dockerfile }}
+      skip-drivers: ${{ matrix.skip-drivers }}
+      context: ${{ matrix.context }}
+      ubuntu-version: ${{ matrix.ubuntu-version }}
+      amdgpu-targets: ${{ matrix.amdgpu-targets || 'gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201' }}
+    secrets:
+      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
+      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
+      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+    strategy:
+      fail-fast: false
+      max-parallel: 8
+      matrix: ${{ fromJson(needs.generate-matrix.outputs['matrix-singlearch']) }}
+
+  # Merge per-arch digests into manifest lists. Depends ONLY on
+  # backend-jobs-multiarch — single-arch builds are independent and slow.
+  # Without this split, a 6h CUDA-12 single-arch job would gate the merge,
+  # leaving multi-arch digests untagged on quay long enough for quay's
+  # garbage collector to reap them and the merge step to fail with
+  # "manifest not found".
   backend-merge-jobs:
-    needs: [generate-matrix, backend-jobs]
-    if: needs.generate-matrix.outputs.has-merges == 'true'
+    needs: [generate-matrix, backend-jobs-multiarch]
+    if: needs.generate-matrix.outputs['has-merges'] == 'true'
     uses: ./.github/workflows/backend_merge.yml
     with:
       tag-latest: ${{ matrix.tag-latest }}
diff --git a/.github/workflows/backend_pr.yml b/.github/workflows/backend_pr.yml
index b01e5dddc..cf7a96a99 100644
--- a/.github/workflows/backend_pr.yml
+++ b/.github/workflows/backend_pr.yml
@@ -11,11 +11,13 @@ jobs:
   generate-matrix:
     runs-on: ubuntu-latest
     outputs:
-      matrix: ${{ steps.set-matrix.outputs.matrix }}
-      matrix-darwin: ${{ steps.set-matrix.outputs.matrix-darwin }}
+      matrix-singlearch: ${{ steps.set-matrix.outputs['matrix-singlearch'] }}
+      matrix-multiarch: ${{ steps.set-matrix.outputs['matrix-multiarch'] }}
+      matrix-darwin: ${{ steps.set-matrix.outputs['matrix-darwin'] }}
       merge-matrix: ${{ steps.set-matrix.outputs['merge-matrix'] }}
-      has-backends: ${{ steps.set-matrix.outputs.has-backends }}
-      has-backends-darwin: ${{ steps.set-matrix.outputs.has-backends-darwin }}
+      has-backends-singlearch: ${{ steps.set-matrix.outputs['has-backends-singlearch'] }}
+      has-backends-multiarch: ${{ steps.set-matrix.outputs['has-backends-multiarch'] }}
+      has-backends-darwin: ${{ steps.set-matrix.outputs['has-backends-darwin'] }}
       has-merges: ${{ steps.set-matrix.outputs['has-merges'] }}
     steps:
       - name: Checkout repository
@@ -29,7 +31,9 @@ jobs:
           bun add js-yaml
           bun add @octokit/core
 
-      # filters the matrix in backend.yml
+      # Filters the backend matrix (.github/backend-matrix.yml) and splits it
+      # into single-arch and multi-arch groups so backend-merge-jobs can
+      # `needs:` only the latter (matches backend.yml's structure).
       - name: Filter matrix for changed backends
         id: set-matrix
         env:
@@ -37,10 +41,10 @@ jobs:
           GITHUB_EVENT_PATH: ${{ github.event_path }}
         run: bun run scripts/changed-backends.js
 
-  backend-jobs:
+  backend-jobs-multiarch:
     needs: generate-matrix
     uses: ./.github/workflows/backend_build.yml
-    if: needs.generate-matrix.outputs.has-backends == 'true'
+    if: needs.generate-matrix.outputs['has-backends-multiarch'] == 'true'
     with:
       tag-latest: ${{ matrix.tag-latest }}
       tag-suffix: ${{ matrix.tag-suffix }}
@@ -64,13 +68,41 @@ jobs:
     strategy:
       fail-fast: true
       max-parallel: 8
-      matrix: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
+      matrix: ${{ fromJson(needs.generate-matrix.outputs['matrix-multiarch']) }}
+  backend-jobs-singlearch:
+    needs: generate-matrix
+    uses: ./.github/workflows/backend_build.yml
+    if: needs.generate-matrix.outputs['has-backends-singlearch'] == 'true'
+    with:
+      tag-latest: ${{ matrix.tag-latest }}
+      tag-suffix: ${{ matrix.tag-suffix }}
+      build-type: ${{ matrix.build-type }}
+      cuda-major-version: ${{ matrix.cuda-major-version }}
+      cuda-minor-version: ${{ matrix.cuda-minor-version }}
+      platforms: ${{ matrix.platforms }}
+      platform-tag: ${{ matrix.platform-tag || '' }}
+      runs-on: ${{ matrix.runs-on }}
+      builder-base-image: ${{ matrix.builder-base-image || '' }}
+      base-image: ${{ matrix.base-image }}
+      backend: ${{ matrix.backend }}
+      dockerfile: ${{ matrix.dockerfile }}
+      skip-drivers: ${{ matrix.skip-drivers }}
+      context: ${{ matrix.context }}
+      ubuntu-version: ${{ matrix.ubuntu-version }}
+      amdgpu-targets: ${{ matrix.amdgpu-targets || 'gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201' }}
+    secrets:
+      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+    strategy:
+      fail-fast: true
+      max-parallel: 8
+      matrix: ${{ fromJson(needs.generate-matrix.outputs['matrix-singlearch']) }}
   backend-merge-jobs:
-    needs: [generate-matrix, backend-jobs]
+    needs: [generate-matrix, backend-jobs-multiarch]
     # backend_merge.yml's push-side steps are all gated on
     # github.event_name != 'pull_request', so on a PR the merge job would
     # do nothing. Skip it entirely to avoid spinning up an empty runner.
-    if: github.event_name != 'pull_request' && needs.generate-matrix.outputs.has-merges == 'true'
+    if: github.event_name != 'pull_request' && needs.generate-matrix.outputs['has-merges'] == 'true'
     uses: ./.github/workflows/backend_merge.yml
     with:
       tag-latest: ${{ matrix.tag-latest }}
diff --git a/scripts/changed-backends.js b/scripts/changed-backends.js
index a006ea80f..95d9e356f 100644
--- a/scripts/changed-backends.js
+++ b/scripts/changed-backends.js
@@ -163,14 +163,29 @@ function computeMergeMatrix(entries) {
   return { include };
 }
 
+// Split a list of linux matrix entries into single-arch (no platform-tag) and
+// multi-arch (platform-tag set, paired with a sibling entry sharing the same
+// tag-suffix). The two are run as separate matrix jobs so backend-merge-jobs
+// can `needs:` only the multi-arch one — slow single-arch builds (CUDA, ROCm,
+// vLLM, etc.) can no longer delay manifest assembly until the multi-arch
+// legs' untagged per-arch digests have been GC'd by quay.
+function splitByArch(entries) {
+  const multiarch = entries.filter(e => e['platform-tag']);
+  const singlearch = entries.filter(e => !e['platform-tag']);
+  return { multiarch, singlearch };
+}
+
 function emitFullMatrix() {
+  const { multiarch, singlearch } = splitByArch(includes);
   const mergeMatrix = computeMergeMatrix(includes);
   const hasMerges = mergeMatrix.include.length > 0 ? 'true' : 'false';
   fs.appendFileSync(process.env.GITHUB_OUTPUT, `run-all=true\n`);
-  fs.appendFileSync(process.env.GITHUB_OUTPUT, `has-backends=true\n`);
+  fs.appendFileSync(process.env.GITHUB_OUTPUT, `has-backends-singlearch=${singlearch.length > 0 ? 'true' : 'false'}\n`);
+  fs.appendFileSync(process.env.GITHUB_OUTPUT, `has-backends-multiarch=${multiarch.length > 0 ? 'true' : 'false'}\n`);
   fs.appendFileSync(process.env.GITHUB_OUTPUT, `has-backends-darwin=true\n`);
   fs.appendFileSync(process.env.GITHUB_OUTPUT, `has-merges=${hasMerges}\n`);
-  fs.appendFileSync(process.env.GITHUB_OUTPUT, `matrix=${JSON.stringify({ include: includes })}\n`);
+  fs.appendFileSync(process.env.GITHUB_OUTPUT, `matrix-singlearch=${JSON.stringify({ include: singlearch })}\n`);
+  fs.appendFileSync(process.env.GITHUB_OUTPUT, `matrix-multiarch=${JSON.stringify({ include: multiarch })}\n`);
   fs.appendFileSync(process.env.GITHUB_OUTPUT, `matrix-darwin=${JSON.stringify({ include: includesDarwin })}\n`);
   fs.appendFileSync(process.env.GITHUB_OUTPUT, `merge-matrix=${JSON.stringify(mergeMatrix)}\n`);
   for (const backend of allBackendPaths.keys()) {
@@ -195,19 +210,24 @@ function emitFilteredMatrix(changedFiles) {
   console.log("Filtered files:", filtered);
   console.log("Filtered files Darwin:", filteredDarwin);
 
-  const hasBackends = filtered.length > 0 ? 'true' : 'false';
+  const { multiarch, singlearch } = splitByArch(filtered);
+  const hasBackendsSinglearch = singlearch.length > 0 ? 'true' : 'false';
+  const hasBackendsMultiarch = multiarch.length > 0 ? 'true' : 'false';
   const hasBackendsDarwin = filteredDarwin.length > 0 ? 'true' : 'false';
-  console.log("Has backends?:", hasBackends);
+  console.log("Has single-arch backends?:", hasBackendsSinglearch);
+  console.log("Has multi-arch backends?:", hasBackendsMultiarch);
   console.log("Has Darwin backends?:", hasBackendsDarwin);
 
   const mergeMatrix = computeMergeMatrix(filtered);
   const hasMerges = mergeMatrix.include.length > 0 ? 'true' : 'false';
 
   fs.appendFileSync(process.env.GITHUB_OUTPUT, `run-all=false\n`);
-  fs.appendFileSync(process.env.GITHUB_OUTPUT, `has-backends=${hasBackends}\n`);
+  fs.appendFileSync(process.env.GITHUB_OUTPUT, `has-backends-singlearch=${hasBackendsSinglearch}\n`);
+  fs.appendFileSync(process.env.GITHUB_OUTPUT, `has-backends-multiarch=${hasBackendsMultiarch}\n`);
   fs.appendFileSync(process.env.GITHUB_OUTPUT, `has-backends-darwin=${hasBackendsDarwin}\n`);
   fs.appendFileSync(process.env.GITHUB_OUTPUT, `has-merges=${hasMerges}\n`);
-  fs.appendFileSync(process.env.GITHUB_OUTPUT, `matrix=${JSON.stringify({ include: filtered })}\n`);
+  fs.appendFileSync(process.env.GITHUB_OUTPUT, `matrix-singlearch=${JSON.stringify({ include: singlearch })}\n`);
+  fs.appendFileSync(process.env.GITHUB_OUTPUT, `matrix-multiarch=${JSON.stringify({ include: multiarch })}\n`);
   fs.appendFileSync(process.env.GITHUB_OUTPUT, `matrix-darwin=${JSON.stringify({ include: filteredDarwin })}\n`);
   fs.appendFileSync(process.env.GITHUB_OUTPUT, `merge-matrix=${JSON.stringify(mergeMatrix)}\n`);