#!/usr/bin/env bash
# Shared compile logic for backend/Dockerfile.llama-cpp.
# Sourced (via bind mount) from both builder-fromsource and builder-prebuilt stages.

# Strict mode: stop on the first failing command, on any unset variable and on a failing
# pipeline stage; xtrace echoes every command as it runs.
set -o errexit -o nounset -o xtrace -o pipefail

# ccache configuration. Capping the cache size and zeroing the statistics counters are
# both best-effort: a failing ccache call must never abort the build.
CCACHE_DIR=/root/.ccache
export CCACHE_DIR
ccache --max-size=5G || true
ccache -z || true

# Append the ccache compiler launchers (C, C++, CUDA) to whatever CMAKE_ARGS the caller
# supplied (empty when unset), then export the result.
CMAKE_ARGS="${CMAKE_ARGS:-}"
CMAKE_ARGS+=" -DCMAKE_C_COMPILER_LAUNCHER=ccache"
CMAKE_ARGS+=" -DCMAKE_CXX_COMPILER_LAUNCHER=ccache"
CMAKE_ARGS+=" -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache"
export CMAKE_ARGS

# Optional CUDA architecture override, driven by CUDA_DOCKER_ARCH.
case "${CUDA_DOCKER_ARCH:-}" in
  '')
    # Unset or empty: leave CMAKE_ARGS and any existing build directories untouched.
    ;;
  *)
    # Backslash-escape every ';' in the architecture list before appending it to
    # CMAKE_ARGS (presumably so the separators survive later shell/make expansion -
    # confirm against the Makefile that consumes CMAKE_ARGS).
    CUDA_ARCH_ESC="${CUDA_DOCKER_ARCH//;/\\;}"
    CMAKE_ARGS="${CMAKE_ARGS} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
    export CMAKE_ARGS
    printf '%s\n' "CMAKE_ARGS(env) = ${CMAKE_ARGS}"
    # Remove any existing llama-cpp-*-build directories (presumably so a build tree
    # configured for a different architecture list is not reused).
    rm -rf /LocalAI/backend/cpp/llama-cpp-*-build
    ;;
esac

# Build the llama.cpp backend: one CPU flavour (all variants or fallback only, depending
# on BUILD_TYPE), then the gRPC server and the RPC server targets.
cd /LocalAI/backend/cpp/llama-cpp
if [[ -z "${BUILD_TYPE:-}" ]]; then
  # Pure CPU image (BUILD_TYPE empty): one build with ggml CPU_ALL_VARIANTS replaces the
  # per-microarch binaries (x86: avx/avx2/avx512/fallback; arm64: armv8.x/armv9.x). ggml
  # dlopens the best libggml-cpu-*.so at runtime by probing host CPU features.
  #
  # arm64: the CPU_ALL_VARIANTS table includes armv9.2 SME variants whose -march=...+sme is
  # rejected by the Ubuntu 24.04 default gcc-13. gcc-14 accepts it, so build the arm64
  # variants with it (the host never *selects* SME unless it has it, but every variant must
  # still compile).
  #
  # TARGETARCH may be absent from the environment (e.g. the Dockerfile stage did not
  # declare it, or the script is run outside BuildKit). A bare ${TARGETARCH} would then
  # abort under `set -u`, so default it and fall back to the machine architecture.
  target_arch="${TARGETARCH:-}"
  if [[ -z "${target_arch}" ]]; then
    case "$(uname -m)" in
      aarch64 | arm64) target_arch=arm64 ;;
    esac
  fi
  if [[ "${target_arch}" == "arm64" ]]; then
    # Two separate statements on purpose: in `a && b`, a failing `a` does not trigger
    # `set -e`, which would let the build continue with CC=gcc-14 but no gcc-14 installed.
    apt-get update -qq
    apt-get install -y -qq gcc-14 g++-14
    export CC=gcc-14 CXX=g++-14
  fi
  make llama-cpp-cpu-all
else
  # GPU build (cublas/hipblas/sycl/vulkan/...): the accelerator does the compute, so a
  # single fallback CPU build is enough - no per-microarch CPU variants needed. (This also
  # keeps the heavy GPU backend compile from also building the whole CPU variant matrix,
  # and avoids the gcc-14 apt step on GPU base images such as nvidia l4t.)
  make llama-cpp-fallback
fi
make llama-cpp-grpc
make llama-cpp-rpc-server

# Print the ccache statistics for this build; best-effort, never fails the build.
ccache -s || true