ci(ik-llama-cpp,turboquant): add BuildKit ccache mount to compile steps

Mirror the ccache mount added to Dockerfile.llama-cpp in 9228e5b4 for
the other two llama.cpp-derived backends. Same shape, distinct mount
ids so each backend's cache is independent:

  ik-llama-cpp-ccache-${TARGETARCH}-${BUILD_TYPE}
  turboquant-ccache-${TARGETARCH}-${BUILD_TYPE}

ik_llama.cpp is a different upstream fork that shares no source with
llama-cpp, so a separate cache is the natural choice.
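
For illustration, a rough sketch of how those ids keep the caches apart
per backend, arch, and build type. The platforms, BUILD_TYPE values, and
Dockerfile paths below are hypothetical examples, not taken from
backend_build.yml:

  # Hypothetical local builds; each resolves to its own cache-mount id.
  docker buildx build -f backend/Dockerfile.ik-llama-cpp \
    --platform linux/amd64 --build-arg BUILD_TYPE=cublas .
  #   -> mount id ik-llama-cpp-ccache-amd64-cublas
  docker buildx build -f backend/Dockerfile.turboquant \
    --platform linux/arm64 --build-arg BUILD_TYPE=default .
  #   -> mount id turboquant-ccache-arm64-default, never shared with the one above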

turboquant is a llama.cpp fork that reuses backend/cpp/llama-cpp source
via a thin wrapper Makefile, so most TUs would in principle hit
llama-cpp's ccache too. We keep the caches separate for now so one
fork's regressions can't poison the other; revisit sharing once we have
hit-rate numbers.
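
One hedged way to get those numbers without touching the Dockerfiles:
the ccache -s printed at the end of each compile step already lands in
the build log, so rebuilding with plain progress output after a small
source change exposes the hit rate. The Dockerfile path below is a
placeholder:

  # Touch a source file so the compile layer actually re-runs, then rebuild
  # and pull the ccache summary out of the BuildKit log. The exact wording
  # of the stats lines varies with the ccache version.
  docker buildx build --progress=plain -f backend/Dockerfile.turboquant . 2>&1 \
    | grep -iE 'cache hit|hits'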

Same registry-export behavior as llama-cpp: the cache mount rides on
backend_build.yml's existing cache-to: type=registry,mode=max.
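
For context, that workflow line corresponds roughly to a buildx
invocation of this shape; the registry ref and Dockerfile path are
placeholders, not the workflow's actual values:

  docker buildx build -f backend/Dockerfile.turboquant \
    --cache-from type=registry,ref=ghcr.io/example/localai-backend-cache:turboquant \
    --cache-to type=registry,ref=ghcr.io/example/localai-backend-cache:turboquant,mode=max \
    .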

Assisted-by: Claude:claude-opus-4-7
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

@@ -259,12 +259,22 @@ COPY --from=grpc /opt/grpc /usr/local
 COPY . /LocalAI
-RUN <<'EOT' bash
+# BuildKit cache mount for ccache. See Dockerfile.llama-cpp (commit 9228e5b4)
+# for the rationale. Distinct mount id so ik-llama-cpp's cache doesn't
+# overlap with llama-cpp's — ik_llama.cpp is a different fork with
+# different source.
+RUN --mount=type=cache,target=/root/.ccache,id=ik-llama-cpp-ccache-${TARGETARCH}-${BUILD_TYPE},sharing=locked <<'EOT' bash
 set -euxo pipefail
+export CCACHE_DIR=/root/.ccache
+ccache --max-size=5G || true
+ccache -z || true
+export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache"
 if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
 CUDA_ARCH_ESC="${CUDA_DOCKER_ARCH//;/\\;}"
-export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
+export CMAKE_ARGS="${CMAKE_ARGS} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
 echo "CMAKE_ARGS(env) = ${CMAKE_ARGS}"
 rm -rf /LocalAI/backend/cpp/ik-llama-cpp-*-build
 fi
@@ -278,6 +288,8 @@ else
 # ik_llama.cpp's IQK kernels require at least AVX2
 make ik-llama-cpp-avx2
 fi
+ccache -s || true
 EOT


@@ -263,12 +263,25 @@ COPY --from=grpc /opt/grpc /usr/local
 COPY . /LocalAI
-RUN <<'EOT' bash
+# BuildKit cache mount for ccache. See Dockerfile.llama-cpp (commit 9228e5b4)
+# for rationale. turboquant is a llama.cpp fork that reuses
+# backend/cpp/llama-cpp source via a thin wrapper Makefile, so MOST TUs
+# are content-identical to the upstream llama-cpp build. Sharing a cache
+# id with llama-cpp could give cross-fork hits — but for now keep them
+# separate so a regression in one doesn't poison the other. Revisit
+# sharing after measuring the actual hit rate.
+RUN --mount=type=cache,target=/root/.ccache,id=turboquant-ccache-${TARGETARCH}-${BUILD_TYPE},sharing=locked <<'EOT' bash
 set -euxo pipefail
+export CCACHE_DIR=/root/.ccache
+ccache --max-size=5G || true
+ccache -z || true
+export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache"
 if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
 CUDA_ARCH_ESC="${CUDA_DOCKER_ARCH//;/\\;}"
-export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
+export CMAKE_ARGS="${CMAKE_ARGS} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
 echo "CMAKE_ARGS(env) = ${CMAKE_ARGS}"
 rm -rf /LocalAI/backend/cpp/turboquant-*-build
 fi
@@ -287,6 +300,8 @@ else
 make turboquant-grpc
 make turboquant-rpc-server
 fi
+ccache -s || true
 EOT