diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml
index 903d415ab..4aca4185e 100644
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -389,7 +389,12 @@ include:
     tag-latest: 'auto'
     tag-suffix: '-gpu-nvidia-cuda-12-llama-cpp'
     builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-cuda-12-amd64'
-    runs-on: 'ubuntu-latest'
+    # Pinned to bigger-runner: on ubuntu-latest a cold build of this entry
+    # reliably runs past 5h (5h36m measured on v4.2.1), which leaves too
+    # little headroom under GHA's 6h job timeout. PR #9730 (phase 5.3 of the
+    # free-tier migration) had moved it to ubuntu-latest as part of the
+    # 'highest-risk batch', with an explicit per-entry revert as the fallback.
+    runs-on: 'bigger-runner'
     base-image: "ubuntu:24.04"
     skip-drivers: 'false'
     backend: "llama-cpp"
@@ -403,7 +408,9 @@ include:
     tag-latest: 'auto'
     tag-suffix: '-gpu-nvidia-cuda-12-turboquant'
     builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-cuda-12-amd64'
-    runs-on: 'ubuntu-latest'
+    # Pinned to bigger-runner for the same reason as the cuda-12 llama-cpp
+    # entry above: 6h5m wall-clock on v4.2.1, just over the 6h job timeout.
+    runs-on: 'bigger-runner'
     base-image: "ubuntu:24.04"
     skip-drivers: 'false'
     backend: "turboquant"
@@ -899,7 +906,9 @@ include:
     tag-latest: 'auto'
     tag-suffix: '-gpu-nvidia-cuda-13-llama-cpp'
     builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-cuda-13-amd64'
-    runs-on: 'ubuntu-latest'
+    # Pinned to bigger-runner: cold builds exceed 5h on ubuntu-latest (5h37m
+    # measured on v4.2.1), same reasoning as the cuda-12 llama-cpp entry.
+    runs-on: 'bigger-runner'
     base-image: "ubuntu:24.04"
     skip-drivers: 'false'
     backend: "llama-cpp"
@@ -913,7 +922,8 @@ include:
     tag-latest: 'auto'
     tag-suffix: '-gpu-nvidia-cuda-13-turboquant'
     builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-cuda-13-amd64'
-    runs-on: 'ubuntu-latest'
+    # Pinned to bigger-runner: 6h5m wall-clock on v4.2.1, at the GHA timeout.
+    runs-on: 'bigger-runner'
     base-image: "ubuntu:24.04"
     skip-drivers: 'false'
     backend: "turboquant"
diff --git a/.github/scripts/anchor-digest-in-cache.sh b/.github/scripts/anchor-digest-in-cache.sh
new file mode 100755
index 000000000..409192788
--- /dev/null
+++ b/.github/scripts/anchor-digest-in-cache.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# Anchor a backend per-arch digest in quay.io/go-skynet/ci-cache so quay's
+# garbage collector won't reap the manifest before backend_merge.yml runs.
+#
+# Context: backend_build.yml pushes by canonical digest only
+# (push-by-digest=true). Unreferenced manifests on quay can be reaped within
+# ~1-2h, but backend-merge-jobs runs only after the *entire* per-arch build
+# matrix drains (max-parallel: 8 × dozens of entries → ~2h+). Without an
+# anchoring tag, the earliest digests are gone by the time `imagetools create`
+# tries to read them, producing "manifest not found" merge failures.
+#
+# We tag the digest under our internal ci-cache image; quay does not GC tagged
+# manifests. The user-facing manifest list still references the original
+# digest in local-ai-backends. backend_merge.yml deletes the anchor tag after
+# the user-facing manifest is published — see cleanup-keepalive-tags.sh.
+#
+# Required env:
+#   GITHUB_RUN_ID  - current workflow run id (set automatically by GHA)
+#   TAG_SUFFIX     - matrix entry's tag-suffix (e.g. -gpu-nvidia-cuda-12-vllm)
+#   PLATFORM_TAG   - amd64 / arm64 / single (single = singleton matrix entry)
+#   DIGEST         - canonical content digest from build step (sha256:...)
+#
+# Optional env:
+#   ANCHOR_IMAGE         - target image (default: quay.io/go-skynet/ci-cache)
+#   SOURCE_IMAGE         - source image (default: quay.io/go-skynet/local-ai-backends)
+#   GITHUB_STEP_SUMMARY  - if set, an anchored-by line is appended to it
+set -euo pipefail
+
+: "${GITHUB_RUN_ID:?}"
+: "${TAG_SUFFIX:?}"
+: "${PLATFORM_TAG:?}"
+: "${DIGEST:?}"
+
+anchor_image="${ANCHOR_IMAGE:-quay.io/go-skynet/ci-cache}"
+source_image="${SOURCE_IMAGE:-quay.io/go-skynet/local-ai-backends}"
+
+tag="keepalive-${GITHUB_RUN_ID}${TAG_SUFFIX}-${PLATFORM_TAG}"
+
+docker buildx imagetools create \
+  -t "${anchor_image}:${tag}" \
+  "${source_image}@${DIGEST}"
+
+echo "anchored ${DIGEST} as ${anchor_image}:${tag}"
+if [[ -n "${GITHUB_STEP_SUMMARY:-}" ]]; then
+  echo "anchored \`${DIGEST}\` as \`${anchor_image}:${tag}\`" >> "${GITHUB_STEP_SUMMARY}"
+fi
diff --git a/.github/scripts/cleanup-keepalive-tags.sh b/.github/scripts/cleanup-keepalive-tags.sh
new file mode 100755
index 000000000..c536269d6
--- /dev/null
+++ b/.github/scripts/cleanup-keepalive-tags.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+# Best-effort cleanup of the keepalive anchor tags written by
+# anchor-digest-in-cache.sh. Called from backend_merge.yml after the
+# user-facing manifest list has been published.
+#
+# Quay's docker registry v2 doesn't allow tag deletes — only digest deletes.
+# The proper delete is the quay REST API, which requires an OAuth-scoped
+# token. We try QUAY_TOKEN as a bearer token: if the secret is an OAuth app
+# token (typical for service accounts) the delete succeeds; otherwise this
+# is a soft no-op and the tag persists until manually pruned.
+#
+# Cleanup failure MUST NOT fail the merge — the merge has already produced
+# the user-facing manifest list at this point and the keepalive tags are
+# pure overhead. We always exit 0, including when a required variable is
+# missing or empty (the cleanup is skipped with a message in that case).
+#
+# Required env (cleanup is skipped, not failed, if any is unset or empty):
+#   GITHUB_RUN_ID  - current workflow run id (set automatically by GHA)
+#   TAG_SUFFIX     - matrix entry's tag-suffix (e.g. -gpu-nvidia-cuda-12-vllm)
+#   QUAY_TOKEN     - bearer token for quay's REST API
+#
+# Optional env:
+#   QUAY_REPO      - target repo (default: go-skynet/ci-cache)
+#   PLATFORM_TAGS  - space-separated list of platform-tag values to try
+#                    (default: "amd64 arm64 single")
+#                    We don't know which platform-tag(s) exist for this
+#                    tag-suffix without an extra API call, so we just try
+#                    all three and ignore 404s for the ones that don't exist.
+set -uo pipefail
+
+# Deliberately not `: "${VAR:?}"`: that aborts with a non-zero status, which
+# would fail the merge job and contradict the always-exit-0 contract above.
+for required in GITHUB_RUN_ID TAG_SUFFIX QUAY_TOKEN; do
+  if [[ -z "${!required:-}" ]]; then
+    echo "${required} is unset or empty - skipping keepalive tag cleanup"
+    exit 0
+  fi
+done
+
+quay_repo="${QUAY_REPO:-go-skynet/ci-cache}"
+platform_tags="${PLATFORM_TAGS:-amd64 arm64 single}"
+
+# $platform_tags is intentionally unquoted: it is a space-separated list.
+for plat in $platform_tags; do
+  tag="keepalive-${GITHUB_RUN_ID}${TAG_SUFFIX}-${plat}"
+  url="https://quay.io/api/v1/repository/${quay_repo}/tag/${tag}"
+  # curl still emits its -w output (typically "000") when the transfer fails,
+  # so on failure we overwrite the variable instead of appending a second
+  # "000" to the captured output. The timeouts keep a stalled connection
+  # from holding the merge job open.
+  http=$(curl -sS -o /dev/null -w '%{http_code}' \
+    --connect-timeout 10 --max-time 60 \
+    -X DELETE -H "Authorization: Bearer ${QUAY_TOKEN}" "$url") || http="000"
+  case "$http" in
+    204|200) echo "deleted $tag" ;;
+    404)     echo "not present: $tag" ;;
+    401|403) echo "auth not OAuth-scoped (http $http) for $tag - skipping; orphan tag will persist" ;;
+    *)       echo "unexpected http $http deleting $tag - skipping" ;;
+  esac
+done
+exit 0
diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml
index 3afe0c681..b41c3d4dd 100644
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
@@ -154,7 +154,13 @@ jobs:
   # digest only — no tags are applied at build time.
   backend-merge-jobs-multiarch:
     needs: [generate-matrix, backend-jobs-multiarch]
-    if: needs.generate-matrix.outputs['has-merges-multiarch'] == 'true'
+    # !cancelled() lets the merge run even when a few build legs failed.
+    # Without it, GHA's default `needs:` cascade skips the entire merge
+    # matrix on a single failed/cancelled cell. We still want to publish
+    # the manifest lists for tag-suffixes whose legs all succeeded.
+    # Observed in v4.2.1: 2 singlearch build failures cascade-skipped all
+    # ~199 singlearch merge entries.
+    if: ${{ !cancelled() && needs.generate-matrix.outputs['has-merges-multiarch'] == 'true' }}
     uses: ./.github/workflows/backend_merge.yml
     with:
       tag-latest: ${{ matrix.tag-latest }}
@@ -170,7 +176,8 @@ jobs:
 
   backend-merge-jobs-singlearch:
     needs: [generate-matrix, backend-jobs-singlearch]
-    if: needs.generate-matrix.outputs['has-merges-singlearch'] == 'true'
+    # See note on backend-merge-jobs-multiarch above for !cancelled().
+    if: ${{ !cancelled() && needs.generate-matrix.outputs['has-merges-singlearch'] == 'true' }}
     uses: ./.github/workflows/backend_merge.yml
     with:
       tag-latest: ${{ matrix.tag-latest }}
diff --git a/.github/workflows/backend_build.yml b/.github/workflows/backend_build.yml
index 7327287ce..b3e177bd1 100644
--- a/.github/workflows/backend_build.yml
+++ b/.github/workflows/backend_build.yml
@@ -228,6 +228,16 @@ jobs:
           digest="${{ steps.build.outputs.digest }}"
           touch "/tmp/digests/${digest#sha256:}"
 
+      # See .github/scripts/anchor-digest-in-cache.sh for why this is needed
+      # and how it interacts with backend_merge.yml's cleanup step.
+      - name: Anchor digest in ci-cache so quay GC won't reap before merge
+        if: github.event_name != 'pull_request'
+        env:
+          TAG_SUFFIX: ${{ inputs.tag-suffix }}
+          PLATFORM_TAG: ${{ inputs.platform-tag || 'single' }}
+          DIGEST: ${{ steps.build.outputs.digest }}
+        run: .github/scripts/anchor-digest-in-cache.sh
+
       # Artifact name uses a `--` separator between tag-suffix and platform-tag
       # to avoid prefix collisions during the merge job's pattern-based download.
       # Tag-suffixes are not prefix-disjoint (e.g. -gpu-nvidia-cuda-12-vllm is a
diff --git a/.github/workflows/backend_build_darwin.yml b/.github/workflows/backend_build_darwin.yml
index ac39389f3..4c87e8d66 100644
--- a/.github/workflows/backend_build_darwin.yml
+++ b/.github/workflows/backend_build_darwin.yml
@@ -116,6 +116,13 @@ jobs:
           # already), we don't have to chase missing dylibs one at a time.
           # The downloads cache makes the reinstall fast (~5s on a hit).
           brew reinstall ccache
+          # Same pattern for grpc: its CMake config (used by the llama-cpp
+          # `grpc-server` target) does find_package(absl). The cache restores
+          # /opt/homebrew/Cellar/grpc so brew above no-ops the install, but
+          # abseil isn't in our Cellar cache list and never gets installed
+          # alongside, leaving grpc's CMake unable to resolve it. Reinstalling
+          # grpc re-validates and pulls abseil in, mirroring the ccache fix.
+          brew reinstall grpc
           # The brew cache restores the Cellar dirs but NOT the bin symlinks
           # at /opt/homebrew/bin/*. brew install above sees the Cellar present
           # and decides "already installed" without re-linking, so on a cache-
diff --git a/.github/workflows/backend_merge.yml b/.github/workflows/backend_merge.yml
index 466a5d843..0490cc6b3 100644
--- a/.github/workflows/backend_merge.yml
+++ b/.github/workflows/backend_merge.yml
@@ -34,6 +34,15 @@ jobs:
     env:
       quay_username: ${{ secrets.quayUsername }}
     steps:
+      # Sparse checkout: the merge job needs `.github/scripts/` (for the
+      # keepalive cleanup script) but none of the source tree.
+      - name: Checkout (.github/scripts only)
+        uses: actions/checkout@v6
+        with:
+          sparse-checkout: |
+            .github/scripts
+          sparse-checkout-cone-mode: false
+
       # `--` separator anchors the glob so we don't over-match sibling
       # backends whose tag-suffix happens to be a prefix of ours
       # (e.g. -cpu-vllm vs -cpu-vllm-omni). Must stay in sync with the
@@ -126,6 +135,20 @@ jobs:
             docker buildx imagetools inspect "$first_tag"
           fi
 
+      # See .github/scripts/cleanup-keepalive-tags.sh for why this is
+      # best-effort and what the failure modes are.
+      # continue-on-error enforces that contract at the workflow level: the
+      # manifest list is already published here, so a cleanup problem (script
+      # missing, non-zero exit) must not turn the merge job red or skip the
+      # job summary step below.
+      - name: Cleanup keepalive tags in ci-cache
+        if: github.event_name != 'pull_request' && success()
+        continue-on-error: true
+        env:
+          TAG_SUFFIX: ${{ inputs.tag-suffix }}
+          QUAY_TOKEN: ${{ secrets.quayPassword }}
+        run: .github/scripts/cleanup-keepalive-tags.sh
+
       - name: Job summary
         if: github.event_name != 'pull_request'
         run: |
diff --git a/.github/workflows/backend_pr.yml b/.github/workflows/backend_pr.yml
index 9b0aba310..e9520a548 100644
--- a/.github/workflows/backend_pr.yml
+++ b/.github/workflows/backend_pr.yml
@@ -104,7 +104,9 @@ jobs:
     # backend_merge.yml's push-side steps are all gated on
     # github.event_name != 'pull_request', so on a PR the merge job would
     # do nothing. Skip it entirely to avoid spinning up an empty runner.
-    if: github.event_name != 'pull_request' && needs.generate-matrix.outputs['has-merges-multiarch'] == 'true'
+    # !cancelled() lets the merge run even when a few build legs fail —
+    # see the matching note in backend.yml.
+    if: ${{ !cancelled() && github.event_name != 'pull_request' && needs.generate-matrix.outputs['has-merges-multiarch'] == 'true' }}
     uses: ./.github/workflows/backend_merge.yml
     with:
       tag-latest: ${{ matrix.tag-latest }}
@@ -118,7 +120,7 @@ jobs:
 
   backend-merge-jobs-singlearch:
     needs: [generate-matrix, backend-jobs-singlearch]
-    if: github.event_name != 'pull_request' && needs.generate-matrix.outputs['has-merges-singlearch'] == 'true'
+    if: ${{ !cancelled() && github.event_name != 'pull_request' && needs.generate-matrix.outputs['has-merges-singlearch'] == 'true' }}
     uses: ./.github/workflows/backend_merge.yml
     with:
       tag-latest: ${{ matrix.tag-latest }}