Summary: replace the per-ISA x86 llama.cpp builds (avx/avx2/avx512/fallback) with a
single ggml CPU_ALL_VARIANTS build.

  * .docker/llama-cpp-compile.sh: the non-arm64 / non-hipblas branch builds
    llama-cpp-cpu-all instead of the four per-ISA targets.
  * backend/cpp/llama-cpp/CMakeLists.txt: hw_grpc_proto is forced STATIC.
  * backend/cpp/llama-cpp/Makefile: BUILD_SHARED_LIBS is driven by SHARED_LIBS
    (default OFF), EXTRA_CMAKE_ARGS is appended to CMAKE_ARGS, and a new
    llama-cpp-cpu-all target collects the built *.so* files into ggml-shared-libs/.
  * backend/cpp/llama-cpp/package.sh: copies ggml-shared-libs/*.so* into package/lib
    when that directory exists.
  * backend/cpp/llama-cpp/run.sh: drops the /proc/cpuinfo AVX probing; selects
    llama-cpp-cpu-all when it exists, otherwise keeps llama-cpp-fallback.

Note: this copy was reconstructed from a version whose line breaks and leading
whitespace had been collapsed. Line boundaries follow the hunk headers' line counts;
indentation of context/removed lines is inferred (Makefile recipes use a hard tab).
If context fails to match, try `git apply --ignore-whitespace`.

diff --git a/.docker/llama-cpp-compile.sh b/.docker/llama-cpp-compile.sh
index bbc9aa21f..2d6ed1803 100755
--- a/.docker/llama-cpp-compile.sh
+++ b/.docker/llama-cpp-compile.sh
@@ -24,10 +24,9 @@ if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
     make llama-cpp-rpc-server
 else
     cd /LocalAI/backend/cpp/llama-cpp
-    make llama-cpp-avx
-    make llama-cpp-avx2
-    make llama-cpp-avx512
-    make llama-cpp-fallback
+    # x86: single build with ggml CPU_ALL_VARIANTS replaces the avx/avx2/avx512/fallback
+    # set. ggml selects the right libggml-cpu-*.so at runtime by probing host CPU features.
+    make llama-cpp-cpu-all
     make llama-cpp-grpc
     make llama-cpp-rpc-server
 fi
diff --git a/backend/cpp/llama-cpp/CMakeLists.txt b/backend/cpp/llama-cpp/CMakeLists.txt
index cb1f5298c..bdf20802a 100644
--- a/backend/cpp/llama-cpp/CMakeLists.txt
+++ b/backend/cpp/llama-cpp/CMakeLists.txt
@@ -50,8 +50,13 @@ add_custom_command(
       "${hw_proto}"
       DEPENDS "${hw_proto}")
 
-# hw_grpc_proto
-add_library(hw_grpc_proto
+# hw_grpc_proto: force STATIC. Under the CPU_ALL_VARIANTS build BUILD_SHARED_LIBS=ON
+# (ggml/llama become shared), which would otherwise make this glue library a DSO. As a
+# DSO it references the hidden-visibility symbols in the static libprotobuf.a, which the
+# linker cannot satisfy ("hidden symbol ... in libprotobuf.a is referenced by DSO").
+# Keeping it STATIC links protobuf/gRPC directly into the grpc-server executable while
+# only ggml/llama stay shared. No effect on the static variants (already BUILD_SHARED_LIBS=OFF).
+add_library(hw_grpc_proto STATIC
   ${hw_grpc_srcs}
   ${hw_grpc_hdrs}
   ${hw_proto_srcs}
diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile
index 24f1f215d..89868c054 100644
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -10,8 +10,16 @@ TARGET?=--target grpc-server
 JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1)
 ARCH?=$(shell uname -m)
 
-# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
-CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
+# Shared libs default to OFF: we link static gRPC and the avx/avx2/avx512/fallback
+# variants are fully static. The CPU_ALL_VARIANTS build flips SHARED_LIBS=ON (ggml/llama
+# become shared so the dynamic CPU backends work; gRPC stays static via its imported
+# targets). SHARED_LIBS is a make variable, not an appended -D, so it survives the
+# recursive sub-make into the VARIANT build dir (which re-parses this Makefile) instead
+# of being re-clobbered by a second -DBUILD_SHARED_LIBS=OFF. EXTRA_CMAKE_ARGS is the hook
+# the CPU_ALL_VARIANTS target uses to inject -DGGML_BACKEND_DL/-DGGML_CPU_ALL_VARIANTS.
+SHARED_LIBS?=OFF
+EXTRA_CMAKE_ARGS?=
+CMAKE_ARGS+=-DBUILD_SHARED_LIBS=$(SHARED_LIBS) -DLLAMA_CURL=OFF $(EXTRA_CMAKE_ARGS)
 
 CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 ifeq ($(NATIVE),false)
@@ -120,6 +128,30 @@ llama-cpp-fallback: llama.cpp
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-fallback-build" build-llama-cpp-grpc-server
 	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build/grpc-server llama-cpp-fallback
 
+# Single-build CPU backend using ggml's CPU_ALL_VARIANTS. Produces ONE grpc-server
+# plus a set of dlopen-able libggml-cpu-*.so (sandybridge/haswell/skylakex/...) that
+# ggml's backend registry selects from at runtime by probing host CPU features.
+# Replaces the avx/avx2/avx512/fallback multi-binary build on x86.
+#
+# CPU_ALL_VARIANTS requires GGML_BACKEND_DL, which requires BUILD_SHARED_LIBS=ON, so we
+# pass SHARED_LIBS=ON and the DL flags as make variables (NOT pre-expanded into the
+# CMAKE_ARGS env string): command-line make variables propagate through every recursive
+# sub-make, so the deepest VARIANT-dir build computes BUILD_SHARED_LIBS=ON consistently.
+# Only ggml/llama go shared - gRPC is found via its static imported targets, so the
+# grpc-server binary keeps static gRPC and only dynamically links ggml.
+#
+# TARGET adds "ggml": the per-microarch backends are runtime-dlopened, not link deps of
+# grpc-server, so they only build because each is an add_dependencies() of the ggml target.
+llama-cpp-cpu-all: llama.cpp
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build purge
+	$(info ${GREEN}I llama-cpp build info:cpu-all-variants${RESET})
+	$(MAKE) SHARED_LIBS=ON EXTRA_CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON" TARGET="--target grpc-server --target ggml" VARIANT="llama-cpp-cpu-all-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build/grpc-server llama-cpp-cpu-all
+	rm -rf ggml-shared-libs && mkdir -p ggml-shared-libs
+	find $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build/llama.cpp/build -name '*.so*' -exec cp -av {} ggml-shared-libs/ \;
+	@echo "Collected ggml shared backends:" && ls -la ggml-shared-libs/
+
 llama-cpp-grpc: llama.cpp
 	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build
 	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge
diff --git a/backend/cpp/llama-cpp/package.sh b/backend/cpp/llama-cpp/package.sh
index d1897e6be..5d2b18c5b 100755
--- a/backend/cpp/llama-cpp/package.sh
+++ b/backend/cpp/llama-cpp/package.sh
@@ -14,6 +14,22 @@ mkdir -p $CURDIR/package/lib
 cp -avrf $CURDIR/llama-cpp-* $CURDIR/package/
 cp -rfv $CURDIR/run.sh $CURDIR/package/
 
+# Bundle the ggml shared backends produced by the CPU_ALL_VARIANTS build (libggml-base.so,
+# libggml.so, libllama.so and the per-microarch libggml-cpu-*.so), all into package/lib.
+#
+# Two distinct resolution mechanisms both land here:
+#   - NEEDED deps (libggml-base/libggml/libllama): resolved by the dynamic linker via the
+#     LD_LIBRARY_PATH=$CURDIR/lib that run.sh exports.
+#   - The per-microarch libggml-cpu-*.so are NOT linked; ggml *discovers* them at runtime by
+#     scanning the executable's own directory (readlink /proc/self/exe). run.sh launches via
+#     the bundled $CURDIR/lib/ld.so, so /proc/self/exe -> .../lib/ld.so and ggml scans lib/.
+#     That is why the variants must sit in lib/ (next to ld.so), not just on the link path.
+# No-op on builds (arm64/darwin) that don't produce the all-variants set.
+if [ -d "$CURDIR/ggml-shared-libs" ]; then
+    echo "Bundling ggml shared backends (CPU_ALL_VARIANTS)..."
+    cp -avf $CURDIR/ggml-shared-libs/*.so* $CURDIR/package/lib/
+fi
+
 # Detect architecture and copy appropriate libraries
 if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
     # x86_64 architecture
diff --git a/backend/cpp/llama-cpp/run.sh b/backend/cpp/llama-cpp/run.sh
index 553faeb27..f3f289495 100755
--- a/backend/cpp/llama-cpp/run.sh
+++ b/backend/cpp/llama-cpp/run.sh
@@ -12,26 +12,11 @@ grep -e "flags" /proc/cpuinfo | head -1
 
 BINARY=llama-cpp-fallback
 
-if grep -q -e "\savx\s" /proc/cpuinfo ; then
-    echo "CPU: AVX found OK"
-    if [ -e $CURDIR/llama-cpp-avx ]; then
-        BINARY=llama-cpp-avx
-    fi
-fi
-
-if grep -q -e "\savx2\s" /proc/cpuinfo ; then
-    echo "CPU: AVX2 found OK"
-    if [ -e $CURDIR/llama-cpp-avx2 ]; then
-        BINARY=llama-cpp-avx2
-    fi
-fi
-
-# Check avx 512
-if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
-    echo "CPU: AVX512F found OK"
-    if [ -e $CURDIR/llama-cpp-avx512 ]; then
-        BINARY=llama-cpp-avx512
-    fi
+# x86 ships a single llama-cpp-cpu-all built with ggml CPU_ALL_VARIANTS: ggml's backend
+# registry dlopens the best libggml-cpu-*.so for this host, so no shell-side AVX probing.
+# arm64/darwin builds ship only llama-cpp-fallback, so fall back to it when cpu-all absent.
+if [ -e $CURDIR/llama-cpp-cpu-all ]; then
+    BINARY=llama-cpp-cpu-all
 fi
 
 if [ -n "$LLAMACPP_GRPC_SERVERS" ]; then