# Builds the llama-cpp backend on top of the shared
# .docker/bases/Dockerfile.cpp base image.
#
# The base already bakes in apt packages, the GPU SDK, protoc, cmake and GRPC,
# so this file only has to copy the sources in, drive the `make` targets and
# assemble the final scratch-stage package.
#
# CI orchestration (.github/workflows/backend.yml + backend_pr.yml) passes
# BASE_IMAGE_PREBUILT. See .agents/ci-caching.md.

ARG BASE_IMAGE_PREBUILT
FROM ${BASE_IMAGE_PREBUILT} AS builder

# Specific CUDA architectures can be targeted with e.g.
#   --build-arg CUDA_DOCKER_ARCH='75;86;89;120'
ARG CUDA_DOCKER_ARCH
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}

# Build knobs are re-exported as ENV so the `make` invocations below can read
# them from the environment.
ARG CMAKE_ARGS
ENV CMAKE_ARGS=${CMAKE_ARGS}
ARG AMDGPU_TARGETS
ENV AMDGPU_TARGETS=${AMDGPU_TARGETS}
ARG BACKEND=llama-cpp
ARG BUILD_TYPE
ENV BUILD_TYPE=${BUILD_TYPE}
ARG CUDA_MAJOR_VERSION
ARG CUDA_MINOR_VERSION
ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
ENV CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
ARG TARGETARCH
ARG TARGETVARIANT

COPY . /LocalAI

# Build script, run by bash in strict mode (-e -u -x -o pipefail):
#   1. When CUDA_DOCKER_ARCH is non-empty, escape every ';' in it as '\;',
#      append it to CMAKE_ARGS as -DCMAKE_CUDA_ARCHITECTURES=..., and remove
#      any existing llama-cpp-*-build directories.
#   2. Unless the target is arm64 or BUILD_TYPE is hipblas, build the avx,
#      avx2 and avx512 variants.
#   3. Always build the fallback, grpc and rpc-server targets.
RUN <<'EOT' bash
set -euxo pipefail

if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
    escaped_archs="${CUDA_DOCKER_ARCH//;/\\;}"
    export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_CUDA_ARCHITECTURES=${escaped_archs}"
    echo "CMAKE_ARGS(env) = ${CMAKE_ARGS}"
    rm -rf /LocalAI/backend/cpp/llama-cpp-*-build
fi

cd /LocalAI/backend/cpp/llama-cpp

if [ "${TARGETARCH}" != "arm64" ] && [ "${BUILD_TYPE}" != "hipblas" ]; then
    make llama-cpp-avx
    make llama-cpp-avx2
    make llama-cpp-avx512
fi

make llama-cpp-fallback
make llama-cpp-grpc
make llama-cpp-rpc-server
EOT

# Run the backend Makefile's `package` target to collect the binaries and
# libraries; -B makes `make` treat every target as out of date.
RUN make -B -C /LocalAI/backend/cpp/llama-cpp package

FROM scratch

# Copy all available binaries (the build process only creates the appropriate
# ones for the target architecture).
COPY --from=builder /LocalAI/backend/cpp/llama-cpp/package/. ./