From e47c58656f22f5b37edea3d6baee8b43e9c59993 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Wed, 24 Jun 2026 21:21:03 +0000
Subject: [PATCH] feat(llama-cpp): single x86 CPU build via ggml CPU_ALL_VARIANTS

Replace the per-microarch avx/avx2/avx512/fallback multi-binary build on
x86 with a single grpc-server plus the dlopen-able libggml-cpu-*.so set
that ggml's backend registry selects at runtime by probing host CPU
features. One build instead of four, broader microarch coverage (adds
alderlake AVX-VNNI, zen4 AVX512-BF16, sapphirerapids AMX), and the
shell-side AVX probing of /proc/cpuinfo in run.sh goes away.

Build/link notes:
- CPU_ALL_VARIANTS requires GGML_BACKEND_DL + BUILD_SHARED_LIBS=ON, so
  ggml/llama become shared objects. SHARED_LIBS is now a make variable
  (default OFF) so the override survives the recursive sub-make into the
  VARIANT build dir instead of being re-clobbered by the base flags.
- The cpu-all target also builds "--target ggml": the per-microarch
  backends are runtime-dlopened, not link deps of grpc-server, so they are
  only built because each one is an add_dependencies() of the ggml target.
- hw_grpc_proto is pinned STATIC. Under BUILD_SHARED_LIBS=ON it would
  otherwise become a DSO referencing hidden-visibility symbols in the
  static libprotobuf.a, which fails to link ("hidden symbol ... is
  referenced by DSO"). Keeping it static links gRPC/protobuf into the
  executable while only ggml/llama stay shared, so no PIC or base-image
  change is required.
- package.sh bundles the libggml-*.so set into package/lib. ggml finds the
  per-microarch backends by scanning the directory of the running
  executable (resolved via /proc/self/exe); because run.sh launches
  grpc-server through the bundled lib/ld.so, that directory is package/lib.

Scope: x86 only. arm64/darwin keep the single fallback build. The
ik-llama-cpp / turboquant forks and the other ggml C++ backends are
unchanged; the same recipe applies but is out of scope here.

Validated with a full docker build plus a live inference smoke test: the
model loads, ggml selects the AVX512_BF16 variant on a Zen-class host, and
tokens generate correctly.
Signed-off-by: Ettore Di Giacinto Assisted-by: Claude:claude-opus-4-8 [Claude Code] --- .docker/llama-cpp-compile.sh | 7 +++--- backend/cpp/llama-cpp/CMakeLists.txt | 9 +++++-- backend/cpp/llama-cpp/Makefile | 36 ++++++++++++++++++++++++++-- backend/cpp/llama-cpp/package.sh | 16 +++++++++++++ backend/cpp/llama-cpp/run.sh | 25 ++++--------------- 5 files changed, 65 insertions(+), 28 deletions(-) diff --git a/.docker/llama-cpp-compile.sh b/.docker/llama-cpp-compile.sh index bbc9aa21f..2d6ed1803 100755 --- a/.docker/llama-cpp-compile.sh +++ b/.docker/llama-cpp-compile.sh @@ -24,10 +24,9 @@ if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then make llama-cpp-rpc-server else cd /LocalAI/backend/cpp/llama-cpp - make llama-cpp-avx - make llama-cpp-avx2 - make llama-cpp-avx512 - make llama-cpp-fallback + # x86: single build with ggml CPU_ALL_VARIANTS replaces the avx/avx2/avx512/fallback + # set. ggml selects the right libggml-cpu-*.so at runtime by probing host CPU features. + make llama-cpp-cpu-all make llama-cpp-grpc make llama-cpp-rpc-server fi diff --git a/backend/cpp/llama-cpp/CMakeLists.txt b/backend/cpp/llama-cpp/CMakeLists.txt index cb1f5298c..bdf20802a 100644 --- a/backend/cpp/llama-cpp/CMakeLists.txt +++ b/backend/cpp/llama-cpp/CMakeLists.txt @@ -50,8 +50,13 @@ add_custom_command( "${hw_proto}" DEPENDS "${hw_proto}") -# hw_grpc_proto -add_library(hw_grpc_proto +# hw_grpc_proto: force STATIC. Under the CPU_ALL_VARIANTS build BUILD_SHARED_LIBS=ON +# (ggml/llama become shared), which would otherwise make this glue library a DSO. As a +# DSO it references the hidden-visibility symbols in the static libprotobuf.a, which the +# linker cannot satisfy ("hidden symbol ... in libprotobuf.a is referenced by DSO"). +# Keeping it STATIC links protobuf/gRPC directly into the grpc-server executable while +# only ggml/llama stay shared. No effect on the static variants (already BUILD_SHARED_LIBS=OFF). 
+add_library(hw_grpc_proto STATIC ${hw_grpc_srcs} ${hw_grpc_hdrs} ${hw_proto_srcs} diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile index 24f1f215d..89868c054 100644 --- a/backend/cpp/llama-cpp/Makefile +++ b/backend/cpp/llama-cpp/Makefile @@ -10,8 +10,16 @@ TARGET?=--target grpc-server JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1) ARCH?=$(shell uname -m) -# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static -CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF +# Shared libs default to OFF: we link static gRPC and the avx/avx2/avx512/fallback +# variants are fully static. The CPU_ALL_VARIANTS build flips SHARED_LIBS=ON (ggml/llama +# become shared so the dynamic CPU backends work; gRPC stays static via its imported +# targets). SHARED_LIBS is a make variable, not an appended -D, so it survives the +# recursive sub-make into the VARIANT build dir (which re-parses this Makefile) instead +# of being re-clobbered by a second -DBUILD_SHARED_LIBS=OFF. EXTRA_CMAKE_ARGS is the hook +# the CPU_ALL_VARIANTS target uses to inject -DGGML_BACKEND_DL/-DGGML_CPU_ALL_VARIANTS. +SHARED_LIBS?=OFF +EXTRA_CMAKE_ARGS?= +CMAKE_ARGS+=-DBUILD_SHARED_LIBS=$(SHARED_LIBS) -DLLAMA_CURL=OFF $(EXTRA_CMAKE_ARGS) CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) ifeq ($(NATIVE),false) @@ -120,6 +128,30 @@ llama-cpp-fallback: llama.cpp CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-fallback-build" build-llama-cpp-grpc-server cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build/grpc-server llama-cpp-fallback +# Single-build CPU backend using ggml's CPU_ALL_VARIANTS. Produces ONE grpc-server +# plus a set of dlopen-able libggml-cpu-*.so (sandybridge/haswell/skylakex/...) that +# ggml's backend registry selects from at runtime by probing host CPU features. 
+# Replaces the avx/avx2/avx512/fallback multi-binary build on x86. +# +# CPU_ALL_VARIANTS requires GGML_BACKEND_DL, which requires BUILD_SHARED_LIBS=ON, so we +# pass SHARED_LIBS=ON and the DL flags as make variables (NOT pre-expanded into the +# CMAKE_ARGS env string): command-line make variables propagate through every recursive +# sub-make, so the deepest VARIANT-dir build computes BUILD_SHARED_LIBS=ON consistently. +# Only ggml/llama go shared - gRPC is found via its static imported targets, so the +# grpc-server binary keeps static gRPC and only dynamically links ggml. +# +# TARGET adds "ggml": the per-microarch backends are runtime-dlopened, not link deps of +# grpc-server, so they only build because each is an add_dependencies() of the ggml target. +llama-cpp-cpu-all: llama.cpp + cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build + $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build purge + $(info ${GREEN}I llama-cpp build info:cpu-all-variants${RESET}) + $(MAKE) SHARED_LIBS=ON EXTRA_CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON" TARGET="--target grpc-server --target ggml" VARIANT="llama-cpp-cpu-all-build" build-llama-cpp-grpc-server + cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build/grpc-server llama-cpp-cpu-all + rm -rf ggml-shared-libs && mkdir -p ggml-shared-libs + find $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build/llama.cpp/build -name '*.so*' -exec cp -av {} ggml-shared-libs/ \; + @echo "Collected ggml shared backends:" && ls -la ggml-shared-libs/ + llama-cpp-grpc: llama.cpp cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge diff --git a/backend/cpp/llama-cpp/package.sh b/backend/cpp/llama-cpp/package.sh index d1897e6be..5d2b18c5b 100755 --- a/backend/cpp/llama-cpp/package.sh +++ b/backend/cpp/llama-cpp/package.sh @@ -14,6 +14,22 @@ mkdir -p $CURDIR/package/lib cp 
-avrf $CURDIR/llama-cpp-* $CURDIR/package/ cp -rfv $CURDIR/run.sh $CURDIR/package/ +# Bundle the ggml shared backends produced by the CPU_ALL_VARIANTS build (libggml-base.so, +# libggml.so, libllama.so and the per-microarch libggml-cpu-*.so), all into package/lib. +# +# Two distinct resolution mechanisms both land here: +# - NEEDED deps (libggml-base/libggml/libllama): resolved by the dynamic linker via the +# LD_LIBRARY_PATH=$CURDIR/lib that run.sh exports. +# - The per-microarch libggml-cpu-*.so are NOT linked; ggml *discovers* them at runtime by +# scanning the executable's own directory (readlink /proc/self/exe). run.sh launches via +# the bundled $CURDIR/lib/ld.so, so /proc/self/exe -> .../lib/ld.so and ggml scans lib/. +# That is why the variants must sit in lib/ (next to ld.so), not just on the link path. +# No-op on builds (arm64/darwin) that don't produce the all-variants set. +if [ -d "$CURDIR/ggml-shared-libs" ]; then + echo "Bundling ggml shared backends (CPU_ALL_VARIANTS)..." 
+ cp -avf $CURDIR/ggml-shared-libs/*.so* $CURDIR/package/lib/ +fi + # Detect architecture and copy appropriate libraries if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then # x86_64 architecture diff --git a/backend/cpp/llama-cpp/run.sh b/backend/cpp/llama-cpp/run.sh index 553faeb27..f3f289495 100755 --- a/backend/cpp/llama-cpp/run.sh +++ b/backend/cpp/llama-cpp/run.sh @@ -12,26 +12,11 @@ grep -e "flags" /proc/cpuinfo | head -1 BINARY=llama-cpp-fallback -if grep -q -e "\savx\s" /proc/cpuinfo ; then - echo "CPU: AVX found OK" - if [ -e $CURDIR/llama-cpp-avx ]; then - BINARY=llama-cpp-avx - fi -fi - -if grep -q -e "\savx2\s" /proc/cpuinfo ; then - echo "CPU: AVX2 found OK" - if [ -e $CURDIR/llama-cpp-avx2 ]; then - BINARY=llama-cpp-avx2 - fi -fi - -# Check avx 512 -if grep -q -e "\savx512f\s" /proc/cpuinfo ; then - echo "CPU: AVX512F found OK" - if [ -e $CURDIR/llama-cpp-avx512 ]; then - BINARY=llama-cpp-avx512 - fi +# x86 ships a single llama-cpp-cpu-all built with ggml CPU_ALL_VARIANTS: ggml's backend +# registry dlopens the best libggml-cpu-*.so for this host, so no shell-side AVX probing. +# arm64/darwin builds ship only llama-cpp-fallback, so fall back to it when cpu-all absent. +if [ -e $CURDIR/llama-cpp-cpu-all ]; then + BINARY=llama-cpp-cpu-all fi if [ -n "$LLAMACPP_GRPC_SERVERS" ]; then