ci(ik-llama-cpp,turboquant): add BuildKit ccache mount to compile steps

Mirror the ccache mount added to Dockerfile.llama-cpp in 9228e5b4 for
the other two llama.cpp-derived backends. Same shape, distinct mount
ids so each backend's cache is independent:

  ik-llama-cpp-ccache-${TARGETARCH}-${BUILD_TYPE}
  turboquant-ccache-${TARGETARCH}-${BUILD_TYPE}

ik_llama.cpp is a different upstream fork that shares no source with
llama-cpp, so a separate cache is the natural choice.
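
For illustration, a rough sketch of how those ids keep the caches apart
per backend, arch, and build type. The platforms, BUILD_TYPE values, and
Dockerfile paths below are hypothetical examples, not taken from
backend_build.yml:

  # Hypothetical local builds; each resolves to its own cache-mount id.
  docker buildx build -f backend/Dockerfile.ik-llama-cpp \
    --platform linux/amd64 --build-arg BUILD_TYPE=cublas .
  #   -> mount id ik-llama-cpp-ccache-amd64-cublas
  docker buildx build -f backend/Dockerfile.turboquant \
    --platform linux/arm64 --build-arg BUILD_TYPE=default .
  #   -> mount id turboquant-ccache-arm64-default, never shared with the one above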

turboquant is a llama.cpp fork that reuses backend/cpp/llama-cpp source
via a thin wrapper Makefile, so most TUs would in principle hit
llama-cpp's ccache too. We keep the caches separate for now so one
fork's regressions can't poison the other; revisit sharing once we have
hit-rate numbers.
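
One hedged way to get those numbers without touching the Dockerfiles:
the ccache -s printed at the end of each compile step already lands in
the build log, so rebuilding with plain progress output after a small
source change exposes the hit rate. The Dockerfile path below is a
placeholder:

  # Touch a source file so the compile layer actually re-runs, then rebuild
  # and pull the ccache summary out of the BuildKit log. The exact wording
  # of the stats lines varies with the ccache version.
  docker buildx build --progress=plain -f backend/Dockerfile.turboquant . 2>&1 \
    | grep -iE 'cache hit|hits'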

Same registry-export behavior as llama-cpp: the cache mount rides on
backend_build.yml's existing cache-to: type=registry,mode=max.
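
For context, that workflow line corresponds roughly to a buildx
invocation of this shape; the registry ref and Dockerfile path are
placeholders, not the workflow's actual values:

  docker buildx build -f backend/Dockerfile.turboquant \
    --cache-from type=registry,ref=ghcr.io/example/localai-backend-cache:turboquant \
    --cache-to type=registry,ref=ghcr.io/example/localai-backend-cache:turboquant,mode=max \
    .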

Assisted-by: Claude:claude-opus-4-7
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

@@ -259,12 +259,22 @@ COPY --from=grpc /opt/grpc /usr/local
 COPY . /LocalAI
-RUN <<'EOT' bash
+# BuildKit cache mount for ccache. See Dockerfile.llama-cpp (commit 9228e5b4)
+# for the rationale. Distinct mount id so ik-llama-cpp's cache doesn't
+# overlap with llama-cpp's — ik_llama.cpp is a different fork with
+# different source.
+RUN --mount=type=cache,target=/root/.ccache,id=ik-llama-cpp-ccache-${TARGETARCH}-${BUILD_TYPE},sharing=locked <<'EOT' bash
 set -euxo pipefail
+export CCACHE_DIR=/root/.ccache
+ccache --max-size=5G || true
+ccache -z || true
+export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache"
 if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
 CUDA_ARCH_ESC="${CUDA_DOCKER_ARCH//;/\\;}"
-export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
+export CMAKE_ARGS="${CMAKE_ARGS} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
 echo "CMAKE_ARGS(env) = ${CMAKE_ARGS}"
 rm -rf /LocalAI/backend/cpp/ik-llama-cpp-*-build
 fi
@@ -278,6 +288,8 @@ else
 # ik_llama.cpp's IQK kernels require at least AVX2
 make ik-llama-cpp-avx2
 fi
+ccache -s || true
 EOT


@@ -263,12 +263,25 @@ COPY --from=grpc /opt/grpc /usr/local
 COPY . /LocalAI
-RUN <<'EOT' bash
+# BuildKit cache mount for ccache. See Dockerfile.llama-cpp (commit 9228e5b4)
+# for rationale. turboquant is a llama.cpp fork that reuses
+# backend/cpp/llama-cpp source via a thin wrapper Makefile, so MOST TUs
+# are content-identical to the upstream llama-cpp build. Sharing a cache
+# id with llama-cpp could give cross-fork hits — but for now keep them
+# separate so a regression in one doesn't poison the other. Revisit
+# sharing after measuring the actual hit rate.
+RUN --mount=type=cache,target=/root/.ccache,id=turboquant-ccache-${TARGETARCH}-${BUILD_TYPE},sharing=locked <<'EOT' bash
 set -euxo pipefail
+export CCACHE_DIR=/root/.ccache
+ccache --max-size=5G || true
+ccache -z || true
+export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache"
 if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
 CUDA_ARCH_ESC="${CUDA_DOCKER_ARCH//;/\\;}"
-export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
+export CMAKE_ARGS="${CMAKE_ARGS} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
 echo "CMAKE_ARGS(env) = ${CMAKE_ARGS}"
 rm -rf /LocalAI/backend/cpp/turboquant-*-build
 fi
@@ -287,6 +300,8 @@ else
 make turboquant-grpc
 make turboquant-rpc-server
 fi
+ccache -s || true
 EOT