diff --git a/.docker/llama-cpp-compile.sh b/.docker/llama-cpp-compile.sh
index 2d6ed1803..b4791a348 100755
--- a/.docker/llama-cpp-compile.sh
+++ b/.docker/llama-cpp-compile.sh
@@ -17,18 +17,17 @@ if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
     rm -rf /LocalAI/backend/cpp/llama-cpp-*-build
 fi
 
-if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
-    cd /LocalAI/backend/cpp/llama-cpp
+cd /LocalAI/backend/cpp/llama-cpp
+if [ "${BUILD_TYPE}" = "hipblas" ]; then
+    # ROCm: the GPU does the compute, so a single fallback CPU build is enough.
     make llama-cpp-fallback
-    make llama-cpp-grpc
-    make llama-cpp-rpc-server
 else
-    cd /LocalAI/backend/cpp/llama-cpp
-    # x86: single build with ggml CPU_ALL_VARIANTS replaces the avx/avx2/avx512/fallback
-    # set. ggml selects the right libggml-cpu-*.so at runtime by probing host CPU features.
+    # x86 and arm64: one build with ggml CPU_ALL_VARIANTS replaces the per-microarch
+    # binaries (x86: avx/avx2/avx512/fallback; arm64: armv8.x/armv9.x). ggml dlopens the
+    # best libggml-cpu-*.so at runtime by probing host CPU features.
     make llama-cpp-cpu-all
-    make llama-cpp-grpc
-    make llama-cpp-rpc-server
 fi
+make llama-cpp-grpc
+make llama-cpp-rpc-server
 
 ccache -s || true
diff --git a/.docker/turboquant-compile.sh b/.docker/turboquant-compile.sh
index 7468bc1a7..c1a970010 100755
--- a/.docker/turboquant-compile.sh
+++ b/.docker/turboquant-compile.sh
@@ -19,17 +19,14 @@ fi
 
 cd /LocalAI/backend/cpp/turboquant
 
-if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
+if [ "${BUILD_TYPE}" = "hipblas" ]; then
+    # ROCm: single fallback CPU build (GPU does the compute).
     make turboquant-fallback
-    make turboquant-grpc
-    make turboquant-rpc-server
 else
-    make turboquant-avx
-    make turboquant-avx2
-    make turboquant-avx512
-    make turboquant-fallback
-    make turboquant-grpc
-    make turboquant-rpc-server
+    # x86 and arm64: one ggml CPU_ALL_VARIANTS build replaces the per-microarch binaries.
+    make turboquant-cpu-all
 fi
+make turboquant-grpc
+make turboquant-rpc-server
 
 ccache -s || true
diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile
index 89868c054..8234680e2 100644
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -149,7 +149,7 @@ llama-cpp-cpu-all: llama.cpp
 	$(MAKE) SHARED_LIBS=ON EXTRA_CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON" TARGET="--target grpc-server --target ggml" VARIANT="llama-cpp-cpu-all-build" build-llama-cpp-grpc-server
 	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build/grpc-server llama-cpp-cpu-all
 	rm -rf ggml-shared-libs && mkdir -p ggml-shared-libs
-	find $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build/llama.cpp/build -name '*.so*' -exec cp -av {} ggml-shared-libs/ \;
+	find $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build/llama.cpp/build \( -name '*.so*' -o -name '*.dylib' \) -exec cp -av {} ggml-shared-libs/ \;
 	@echo "Collected ggml shared backends:" && ls -la ggml-shared-libs/
 
 llama-cpp-grpc: llama.cpp
diff --git a/backend/cpp/turboquant/Makefile b/backend/cpp/turboquant/Makefile
index 98f5e4978..a32adf0b6 100644
--- a/backend/cpp/turboquant/Makefile
+++ b/backend/cpp/turboquant/Makefile
@@ -65,6 +65,29 @@ turboquant-avx:
 turboquant-fallback:
 	$(call turboquant-build,fallback,-DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server)
 
+# Single-build CPU backend via ggml CPU_ALL_VARIANTS (mirrors llama-cpp-cpu-all).
+# turboquant reuses backend/cpp/llama-cpp's CMakeLists.txt (hw_grpc_proto STATIC) and
+# Makefile (SHARED_LIBS make-var + EXTRA_CMAKE_ARGS), so this passes the same overrides
+# through to the copied build: SHARED_LIBS=ON, the DL flags, and --target ggml (which
+# pulls in the per-microarch libggml-cpu-*.so via ggml's add_dependencies). The .so set
+# is collected for package.sh to bundle into package/lib.
+turboquant-cpu-all:
+	rm -rf $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build
+	cp -rf $(LLAMA_CPP_DIR) $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build purge
+	bash $(CURRENT_MAKEFILE_DIR)/patch-grpc-server.sh $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/grpc-server.cpp
+	$(info $(GREEN)I turboquant build info:cpu-all-variants$(RESET))
+	LLAMA_REPO=$(LLAMA_REPO) LLAMA_VERSION=$(TURBOQUANT_VERSION) \
+		$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build llama.cpp
+	bash $(CURRENT_MAKEFILE_DIR)/apply-patches.sh $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/llama.cpp $(PATCHES_DIR)
+	SHARED_LIBS=ON EXTRA_CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON" TARGET="--target grpc-server --target ggml" \
+		LLAMA_REPO=$(LLAMA_REPO) LLAMA_VERSION=$(TURBOQUANT_VERSION) \
+		$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/grpc-server turboquant-cpu-all
+	rm -rf ggml-shared-libs && mkdir -p ggml-shared-libs
+	find $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/llama.cpp/build \( -name '*.so*' -o -name '*.dylib' \) -exec cp -av {} ggml-shared-libs/ \;
+	@echo "Collected ggml shared backends:" && ls -la ggml-shared-libs/
+
 turboquant-grpc:
 	$(call turboquant-build,grpc,-DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server --target rpc-server)
 
diff --git a/backend/cpp/turboquant/package.sh b/backend/cpp/turboquant/package.sh
index d5402fc31..c4559a68d 100755
--- a/backend/cpp/turboquant/package.sh
+++ b/backend/cpp/turboquant/package.sh
@@ -14,6 +14,16 @@ mkdir -p $CURDIR/package/lib
 cp -avrf $CURDIR/turboquant-* $CURDIR/package/
 cp -rfv $CURDIR/run.sh $CURDIR/package/
 
+# Bundle the ggml shared backends from the CPU_ALL_VARIANTS build into package/lib. ggml
+# discovers the per-microarch libggml-cpu-*.so by scanning the executable directory, which
+# (via the bundled lib/ld.so that run.sh launches through) resolves to lib/. See the
+# matching comment in backend/cpp/llama-cpp/package.sh. No-op on the fallback/ROCm builds.
+# Copies everything the turboquant-cpu-all rule collected (*.so* and *.dylib), not only *.so*.
+if [ -d "$CURDIR/ggml-shared-libs" ]; then
+    echo "Bundling ggml shared backends (CPU_ALL_VARIANTS)..."
+    cp -avf "$CURDIR"/ggml-shared-libs/* "$CURDIR/package/lib/"
+fi
+
 # Detect architecture and copy appropriate libraries
 if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
     # x86_64 architecture
diff --git a/backend/cpp/turboquant/run.sh b/backend/cpp/turboquant/run.sh
index b0239e237..cd41a0f7f 100755
--- a/backend/cpp/turboquant/run.sh
+++ b/backend/cpp/turboquant/run.sh
@@ -12,26 +12,11 @@ grep -e "flags" /proc/cpuinfo | head -1
 
 BINARY=turboquant-fallback
 
-if grep -q -e "\savx\s" /proc/cpuinfo ; then
-	echo "CPU: AVX found OK"
-	if [ -e $CURDIR/turboquant-avx ]; then
-		BINARY=turboquant-avx
-	fi
-fi
-
-if grep -q -e "\savx2\s" /proc/cpuinfo ; then
-	echo "CPU: AVX2 found OK"
-	if [ -e $CURDIR/turboquant-avx2 ]; then
-		BINARY=turboquant-avx2
-	fi
-fi
-
-# Check avx 512
-if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
-	echo "CPU: AVX512F found OK"
-	if [ -e $CURDIR/turboquant-avx512 ]; then
-		BINARY=turboquant-avx512
-	fi
+# x86/arm64 ship a single turboquant-cpu-all built with ggml CPU_ALL_VARIANTS: ggml's
+# backend registry dlopens the best libggml-cpu-*.so for this host, so no shell-side
+# probing. ROCm ships only turboquant-fallback, so fall back to it when cpu-all is absent.
+if [ -e "$CURDIR/turboquant-cpu-all" ]; then
+	BINARY=turboquant-cpu-all
 fi
 
 if [ -n "$LLAMACPP_GRPC_SERVERS" ]; then