diff --git a/.docker/llama-cpp-compile.sh b/.docker/llama-cpp-compile.sh index 776a2cceb..647a1c448 100755 --- a/.docker/llama-cpp-compile.sh +++ b/.docker/llama-cpp-compile.sh @@ -18,22 +18,26 @@ if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then fi cd /LocalAI/backend/cpp/llama-cpp -if [ "${BUILD_TYPE}" = "hipblas" ]; then - # ROCm: the GPU does the compute, so a single fallback CPU build is enough. - make llama-cpp-fallback -else - # arm64: ggml's CPU_ALL_VARIANTS table includes armv9.2 SME variants whose - # -march=...+sme is rejected by the Ubuntu 24.04 default gcc-13. gcc-14 accepts it, so - # build the arm64 variants with gcc-14 (the host never *selects* SME unless it has it, - # but every variant must still compile). +if [ -z "${BUILD_TYPE:-}" ]; then + # Pure CPU image (BUILD_TYPE empty): one build with ggml CPU_ALL_VARIANTS replaces the + # per-microarch binaries (x86: avx/avx2/avx512/fallback; arm64: armv8.x/armv9.x). ggml + # dlopens the best libggml-cpu-*.so at runtime by probing host CPU features. + # + # arm64: the CPU_ALL_VARIANTS table includes armv9.2 SME variants whose -march=...+sme is + # rejected by the Ubuntu 24.04 default gcc-13. gcc-14 accepts it, so build the arm64 + # variants with it (the host never *selects* SME unless it has it, but every variant must + # still compile). if [ "${TARGETARCH}" = "arm64" ]; then apt-get update -qq && apt-get install -y -qq gcc-14 g++-14 export CC=gcc-14 CXX=g++-14 fi - # x86 and arm64: one build with ggml CPU_ALL_VARIANTS replaces the per-microarch - # binaries (x86: avx/avx2/avx512/fallback; arm64: armv8.x/armv9.x). ggml dlopens the - # best libggml-cpu-*.so at runtime by probing host CPU features. make llama-cpp-cpu-all +else + # GPU build (cublas/hipblas/sycl/vulkan/...): the accelerator does the compute, so a + # single fallback CPU build is enough - no per-microarch CPU variants needed. 
(This also + # keeps the heavy GPU backend compile from building the whole CPU variant matrix, + # and avoids the gcc-14 apt step on GPU base images such as nvidia l4t.) + make llama-cpp-fallback fi make llama-cpp-grpc make llama-cpp-rpc-server diff --git a/.docker/turboquant-compile.sh b/.docker/turboquant-compile.sh index baf814c7e..ca6cf2690 100755 --- a/.docker/turboquant-compile.sh +++ b/.docker/turboquant-compile.sh @@ -19,17 +19,19 @@ fi cd /LocalAI/backend/cpp/turboquant -if [ "${BUILD_TYPE}" = "hipblas" ]; then - # ROCm: single fallback CPU build (GPU does the compute). - make turboquant-fallback -else - # arm64: the CPU_ALL_VARIANTS armv9.2 SME variants need gcc-14 (gcc-13 rejects +sme). +if [ -z "${BUILD_TYPE:-}" ]; then + # Pure CPU image: one ggml CPU_ALL_VARIANTS build replaces the per-microarch binaries. + # arm64: the armv9.2 SME variants need gcc-14 (gcc-13 rejects +sme). if [ "${TARGETARCH}" = "arm64" ]; then apt-get update -qq && apt-get install -y -qq gcc-14 g++-14 export CC=gcc-14 CXX=g++-14 fi - # x86 and arm64: one ggml CPU_ALL_VARIANTS build replaces the per-microarch binaries. make turboquant-cpu-all +else + # GPU build (cublas/hipblas/sycl/vulkan/...): single fallback CPU build, the accelerator + # does the compute. Keeps the GPU compile from also building the CPU variant matrix and + # avoids the gcc-14 apt step on GPU base images such as nvidia l4t. + make turboquant-fallback fi make turboquant-grpc make turboquant-rpc-server