feat(llama-cpp,turboquant): extend CPU_ALL_VARIANTS to arm64 + turboquant

- llama-cpp: x86 AND arm64 now use the single llama-cpp-cpu-all build
  (only hipblas keeps the fallback build). ggml's arm64 variant table
  (armv8.x / armv9.x, plus apple_m* on darwin) is selected at runtime.
- turboquant: same recipe via a turboquant-cpu-all target. turboquant
  copies backend/cpp/llama-cpp's CMakeLists.txt + Makefile per flavor, so
  the hw_grpc_proto STATIC fix and the SHARED_LIBS / EXTRA_CMAKE_ARGS
  make-vars are inherited; the target just passes SHARED_LIBS=ON, the DL
  flags and --target ggml through, then collects the .so set. run.sh and
  package.sh updated to ship/select turboquant-cpu-all.
- Makefile lib-collection find now also matches *.dylib (for the darwin
  build, which emits dylibs rather than .so).

ik-llama-cpp is intentionally left unchanged: its pinned ggml has no
CPU_ALL_VARIANTS support and its IQK kernels require AVX2, so the
per-microarch dynamic backend set does not apply.

Scope still excludes the darwin packaging wiring (separate change).

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]
This commit is contained in:
Ettore Di Giacinto
2026-06-24 21:33:32 +00:00
parent e47c58656f
commit 379fa3e525
6 changed files with 52 additions and 39 deletions

View File

@@ -149,7 +149,7 @@ llama-cpp-cpu-all: llama.cpp
$(MAKE) SHARED_LIBS=ON EXTRA_CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON" TARGET="--target grpc-server --target ggml" VARIANT="llama-cpp-cpu-all-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build/grpc-server llama-cpp-cpu-all
rm -rf ggml-shared-libs && mkdir -p ggml-shared-libs
find $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build/llama.cpp/build -name '*.so*' -exec cp -av {} ggml-shared-libs/ \;
find $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build/llama.cpp/build \( -name '*.so*' -o -name '*.dylib' \) -exec cp -av {} ggml-shared-libs/ \;
@echo "Collected ggml shared backends:" && ls -la ggml-shared-libs/
llama-cpp-grpc: llama.cpp

View File

@@ -65,6 +65,29 @@ turboquant-avx:
# Fallback flavor: calls turboquant-build with AVX, AVX2, AVX512, FMA, F16C and BMI2 all
# switched off, and requests only the grpc-server target.
# NOTE(review): turboquant-build is defined outside this excerpt; the argument order
# (flavor name, extra CMake flags, build targets) is inferred from this call site -- confirm.
turboquant-fallback:
$(call turboquant-build,fallback,-DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server)
# Single-build CPU backend via ggml CPU_ALL_VARIANTS (mirrors llama-cpp-cpu-all).
# turboquant reuses backend/cpp/llama-cpp's CMakeLists.txt (hw_grpc_proto STATIC) and
# Makefile (SHARED_LIBS make-var + EXTRA_CMAKE_ARGS), so this passes the same overrides
# through to the copied build: SHARED_LIBS=ON, the DL flags, and --target ggml (which
# pulls in the per-microarch libggml-cpu-*.so via ggml's add_dependencies). The .so set
# is collected for package.sh to bundle into package/lib.
#
# Outputs, both relative to the directory make runs the recipe in:
#   turboquant-cpu-all   the grpc-server binary copied out of the build tree
#   ggml-shared-libs/    every *.so* / *.dylib found under the build tree's llama.cpp/build
# The steps below are order-dependent; do not reorder them.
turboquant-cpu-all:
# Start from a fresh copy of $(LLAMA_CPP_DIR) as the dedicated build directory.
rm -rf $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build
cp -rf $(LLAMA_CPP_DIR) $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build
# Run the copied Makefile's purge target (presumably drops build output carried over by
# the copy -- confirm against backend/cpp/llama-cpp/Makefile).
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build purge
# Run turboquant's patch script against the copied grpc-server.cpp.
bash $(CURRENT_MAKEFILE_DIR)/patch-grpc-server.sh $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/grpc-server.cpp
# $(info ...) is a make function, not a shell command: the banner is printed when make
# expands this recipe rather than being executed as a step of it.
$(info $(GREEN)I turboquant build info:cpu-all-variants$(RESET))
# Build the copied Makefile's llama.cpp target with the repo/version overridden to
# LLAMA_REPO / TURBOQUANT_VERSION (passed through the environment).
LLAMA_REPO=$(LLAMA_REPO) LLAMA_VERSION=$(TURBOQUANT_VERSION) \
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build llama.cpp
# Apply the patches in $(PATCHES_DIR) to the llama.cpp tree produced by the previous step.
bash $(CURRENT_MAKEFILE_DIR)/apply-patches.sh $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/llama.cpp $(PATCHES_DIR)
# Build grpc-server plus the ggml target with shared libs, dynamically loaded backends and
# all CPU variants enabled. These settings are handed to the sub-make as environment
# variables (shell prefix assignments), not as make command-line arguments.
SHARED_LIBS=ON EXTRA_CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON" TARGET="--target grpc-server --target ggml" \
LLAMA_REPO=$(LLAMA_REPO) LLAMA_VERSION=$(TURBOQUANT_VERSION) \
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build grpc-server
# Copy the resulting server binary out under the flavor name.
cp -rfv $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/grpc-server turboquant-cpu-all
# Recreate ggml-shared-libs from scratch so libraries from an earlier build cannot linger,
# then gather every shared library (*.so* or *.dylib) from the build tree and list them.
rm -rf ggml-shared-libs && mkdir -p ggml-shared-libs
find $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/llama.cpp/build \( -name '*.so*' -o -name '*.dylib' \) -exec cp -av {} ggml-shared-libs/ \;
@echo "Collected ggml shared backends:" && ls -la ggml-shared-libs/
# RPC flavor: calls turboquant-build with GGML_RPC enabled and AVX, AVX2, AVX512, FMA,
# F16C and BMI2 switched off, requesting both the grpc-server and rpc-server targets.
# NOTE(review): turboquant-build is defined outside this excerpt; the argument order
# (flavor name, extra CMake flags, build targets) is inferred from this call site -- confirm.
turboquant-grpc:
$(call turboquant-build,grpc,-DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server --target rpc-server)

View File

@@ -14,6 +14,15 @@ mkdir -p $CURDIR/package/lib
# Copy every built turboquant-* artifact and the launcher script into the package root.
# $CURDIR is quoted so a checkout path containing whitespace is not word-split; the glob
# stays outside the quotes so it still expands.
cp -avrf "$CURDIR"/turboquant-* "$CURDIR/package/"
cp -rfv "$CURDIR/run.sh" "$CURDIR/package/"
# Bundle the ggml shared backends from the CPU_ALL_VARIANTS build into package/lib. ggml
# discovers the per-microarch libggml-cpu-*.so by scanning the executable directory, which
# (via the bundled lib/ld.so that run.sh launches through) resolves to lib/. See the
# matching comment in backend/cpp/llama-cpp/package.sh. No-op on the fallback/ROCm builds.
# Only *.so* files are bundled here; the directory is skipped entirely when it is absent.
if [ -d "$CURDIR/ggml-shared-libs" ]; then
    echo "Bundling ggml shared backends (CPU_ALL_VARIANTS)..."
    cp -avf "$CURDIR"/ggml-shared-libs/*.so* "$CURDIR/package/lib/"
fi
# Detect architecture and copy appropriate libraries
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
# x86_64 architecture

View File

@@ -12,26 +12,11 @@ grep -e "flags" /proc/cpuinfo | head -1
BINARY=turboquant-fallback
if grep -q -e "\savx\s" /proc/cpuinfo ; then
echo "CPU: AVX found OK"
if [ -e $CURDIR/turboquant-avx ]; then
BINARY=turboquant-avx
fi
fi
if grep -q -e "\savx2\s" /proc/cpuinfo ; then
echo "CPU: AVX2 found OK"
if [ -e $CURDIR/turboquant-avx2 ]; then
BINARY=turboquant-avx2
fi
fi
# Check avx 512
if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
echo "CPU: AVX512F found OK"
if [ -e $CURDIR/turboquant-avx512 ]; then
BINARY=turboquant-avx512
fi
# x86/arm64 ship a single turboquant-cpu-all built with ggml CPU_ALL_VARIANTS: ggml's
# backend registry dlopens the best libggml-cpu-*.so for this host, so no shell-side
# probing. ROCm ships only turboquant-fallback, so fall back to it when cpu-all is absent.
# The path is quoted: unquoted, an install directory containing whitespace makes `[` fail
# with "too many arguments", which would silently leave BINARY on the fallback.
if [ -e "$CURDIR/turboquant-cpu-all" ]; then
    BINARY=turboquant-cpu-all
fi
if [ -n "$LLAMACPP_GRPC_SERVERS" ]; then