mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-25 09:09:07 -04:00
Compare commits
6 Commits
master
...
feat/llama
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c23fc5fb42 | ||
|
|
292c1cab94 | ||
|
|
4e9bb4f879 | ||
|
|
3b47122e54 | ||
|
|
379fa3e525 | ||
|
|
e47c58656f |
@@ -17,19 +17,29 @@ if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
|
|||||||
rm -rf /LocalAI/backend/cpp/llama-cpp-*-build
|
rm -rf /LocalAI/backend/cpp/llama-cpp-*-build
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
|
cd /LocalAI/backend/cpp/llama-cpp
|
||||||
cd /LocalAI/backend/cpp/llama-cpp
|
if [ -z "${BUILD_TYPE:-}" ]; then
|
||||||
make llama-cpp-fallback
|
# Pure CPU image (BUILD_TYPE empty): one build with ggml CPU_ALL_VARIANTS replaces the
|
||||||
make llama-cpp-grpc
|
# per-microarch binaries (x86: avx/avx2/avx512/fallback; arm64: armv8.x/armv9.x). ggml
|
||||||
make llama-cpp-rpc-server
|
# dlopens the best libggml-cpu-*.so at runtime by probing host CPU features.
|
||||||
|
#
|
||||||
|
# arm64: the CPU_ALL_VARIANTS table includes armv9.2 SME variants whose -march=...+sme is
|
||||||
|
# rejected by the Ubuntu 24.04 default gcc-13. gcc-14 accepts it, so build the arm64
|
||||||
|
# variants with it (the host never *selects* SME unless it has it, but every variant must
|
||||||
|
# still compile).
|
||||||
|
if [ "${TARGETARCH}" = "arm64" ]; then
|
||||||
|
apt-get update -qq && apt-get install -y -qq gcc-14 g++-14
|
||||||
|
export CC=gcc-14 CXX=g++-14
|
||||||
|
fi
|
||||||
|
make llama-cpp-cpu-all
|
||||||
else
|
else
|
||||||
cd /LocalAI/backend/cpp/llama-cpp
|
# GPU build (cublas/hipblas/sycl/vulkan/...): the accelerator does the compute, so a
|
||||||
make llama-cpp-avx
|
# single fallback CPU build is enough - no per-microarch CPU variants needed. (This also
|
||||||
make llama-cpp-avx2
|
# keeps the heavy GPU backend compile from also building the whole CPU variant matrix,
|
||||||
make llama-cpp-avx512
|
# and avoids the gcc-14 apt step on GPU base images such as nvidia l4t.)
|
||||||
make llama-cpp-fallback
|
make llama-cpp-fallback
|
||||||
make llama-cpp-grpc
|
|
||||||
make llama-cpp-rpc-server
|
|
||||||
fi
|
fi
|
||||||
|
make llama-cpp-grpc
|
||||||
|
make llama-cpp-rpc-server
|
||||||
|
|
||||||
ccache -s || true
|
ccache -s || true
|
||||||
|
|||||||
@@ -19,17 +19,21 @@ fi
|
|||||||
|
|
||||||
cd /LocalAI/backend/cpp/turboquant
|
cd /LocalAI/backend/cpp/turboquant
|
||||||
|
|
||||||
if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
|
if [ -z "${BUILD_TYPE:-}" ]; then
|
||||||
make turboquant-fallback
|
# Pure CPU image: one ggml CPU_ALL_VARIANTS build replaces the per-microarch binaries.
|
||||||
make turboquant-grpc
|
# arm64: the armv9.2 SME variants need gcc-14 (gcc-13 rejects +sme).
|
||||||
make turboquant-rpc-server
|
if [ "${TARGETARCH}" = "arm64" ]; then
|
||||||
|
apt-get update -qq && apt-get install -y -qq gcc-14 g++-14
|
||||||
|
export CC=gcc-14 CXX=g++-14
|
||||||
|
fi
|
||||||
|
make turboquant-cpu-all
|
||||||
else
|
else
|
||||||
make turboquant-avx
|
# GPU build (cublas/hipblas/sycl/vulkan/...): single fallback CPU build, the accelerator
|
||||||
make turboquant-avx2
|
# does the compute. Keeps the GPU compile from also building the CPU variant matrix and
|
||||||
make turboquant-avx512
|
# avoids the gcc-14 apt step on GPU base images such as nvidia l4t.
|
||||||
make turboquant-fallback
|
make turboquant-fallback
|
||||||
make turboquant-grpc
|
|
||||||
make turboquant-rpc-server
|
|
||||||
fi
|
fi
|
||||||
|
make turboquant-grpc
|
||||||
|
make turboquant-rpc-server
|
||||||
|
|
||||||
ccache -s || true
|
ccache -s || true
|
||||||
|
|||||||
@@ -50,8 +50,13 @@ add_custom_command(
|
|||||||
"${hw_proto}"
|
"${hw_proto}"
|
||||||
DEPENDS "${hw_proto}")
|
DEPENDS "${hw_proto}")
|
||||||
|
|
||||||
# hw_grpc_proto
|
# hw_grpc_proto: force STATIC. Under the CPU_ALL_VARIANTS build BUILD_SHARED_LIBS=ON
|
||||||
add_library(hw_grpc_proto
|
# (ggml/llama become shared), which would otherwise make this glue library a DSO. As a
|
||||||
|
# DSO it references the hidden-visibility symbols in the static libprotobuf.a, which the
|
||||||
|
# linker cannot satisfy ("hidden symbol ... in libprotobuf.a is referenced by DSO").
|
||||||
|
# Keeping it STATIC links protobuf/gRPC directly into the grpc-server executable while
|
||||||
|
# only ggml/llama stay shared. No effect on the static variants (already BUILD_SHARED_LIBS=OFF).
|
||||||
|
add_library(hw_grpc_proto STATIC
|
||||||
${hw_grpc_srcs}
|
${hw_grpc_srcs}
|
||||||
${hw_grpc_hdrs}
|
${hw_grpc_hdrs}
|
||||||
${hw_proto_srcs}
|
${hw_proto_srcs}
|
||||||
|
|||||||
@@ -10,8 +10,16 @@ TARGET?=--target grpc-server
|
|||||||
JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1)
|
JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1)
|
||||||
ARCH?=$(shell uname -m)
|
ARCH?=$(shell uname -m)
|
||||||
|
|
||||||
# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
|
# Shared libs default to OFF: we link static gRPC and the avx/avx2/avx512/fallback
|
||||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
|
# variants are fully static. The CPU_ALL_VARIANTS build flips SHARED_LIBS=ON (ggml/llama
|
||||||
|
# become shared so the dynamic CPU backends work; gRPC stays static via its imported
|
||||||
|
# targets). SHARED_LIBS is a make variable, not an appended -D, so it survives the
|
||||||
|
# recursive sub-make into the VARIANT build dir (which re-parses this Makefile) instead
|
||||||
|
# of being re-clobbered by a second -DBUILD_SHARED_LIBS=OFF. EXTRA_CMAKE_ARGS is the hook
|
||||||
|
# the CPU_ALL_VARIANTS target uses to inject -DGGML_BACKEND_DL/-DGGML_CPU_ALL_VARIANTS.
|
||||||
|
SHARED_LIBS?=OFF
|
||||||
|
EXTRA_CMAKE_ARGS?=
|
||||||
|
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=$(SHARED_LIBS) -DLLAMA_CURL=OFF $(EXTRA_CMAKE_ARGS)
|
||||||
|
|
||||||
CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
|
CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
|
||||||
ifeq ($(NATIVE),false)
|
ifeq ($(NATIVE),false)
|
||||||
@@ -120,6 +128,30 @@ llama-cpp-fallback: llama.cpp
|
|||||||
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-fallback-build" build-llama-cpp-grpc-server
|
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-fallback-build" build-llama-cpp-grpc-server
|
||||||
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build/grpc-server llama-cpp-fallback
|
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build/grpc-server llama-cpp-fallback
|
||||||
|
|
||||||
|
# Single-build CPU backend using ggml's CPU_ALL_VARIANTS. Produces ONE grpc-server
|
||||||
|
# plus a set of dlopen-able libggml-cpu-*.so (sandybridge/haswell/skylakex/...) that
|
||||||
|
# ggml's backend registry selects from at runtime by probing host CPU features.
|
||||||
|
# Replaces the avx/avx2/avx512/fallback multi-binary build on x86.
|
||||||
|
#
|
||||||
|
# CPU_ALL_VARIANTS requires GGML_BACKEND_DL, which requires BUILD_SHARED_LIBS=ON, so we
|
||||||
|
# pass SHARED_LIBS=ON and the DL flags as make variables (NOT pre-expanded into the
|
||||||
|
# CMAKE_ARGS env string): command-line make variables propagate through every recursive
|
||||||
|
# sub-make, so the deepest VARIANT-dir build computes BUILD_SHARED_LIBS=ON consistently.
|
||||||
|
# Only ggml/llama go shared - gRPC is found via its static imported targets, so the
|
||||||
|
# grpc-server binary keeps static gRPC and only dynamically links ggml.
|
||||||
|
#
|
||||||
|
# TARGET adds "ggml": the per-microarch backends are runtime-dlopened, not link deps of
|
||||||
|
# grpc-server, so they only build because each is an add_dependencies() of the ggml target.
|
||||||
|
llama-cpp-cpu-all: llama.cpp
|
||||||
|
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build
|
||||||
|
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build purge
|
||||||
|
$(info ${GREEN}I llama-cpp build info:cpu-all-variants${RESET})
|
||||||
|
$(MAKE) SHARED_LIBS=ON EXTRA_CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON" TARGET="--target grpc-server --target ggml" VARIANT="llama-cpp-cpu-all-build" build-llama-cpp-grpc-server
|
||||||
|
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build/grpc-server llama-cpp-cpu-all
|
||||||
|
rm -rf ggml-shared-libs && mkdir -p ggml-shared-libs
|
||||||
|
find $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build/llama.cpp/build \( -name '*.so*' -o -name '*.dylib' \) -exec cp -av {} ggml-shared-libs/ \;
|
||||||
|
@echo "Collected ggml shared backends:" && ls -la ggml-shared-libs/
|
||||||
|
|
||||||
llama-cpp-grpc: llama.cpp
|
llama-cpp-grpc: llama.cpp
|
||||||
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build
|
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build
|
||||||
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge
|
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge
|
||||||
|
|||||||
@@ -14,6 +14,22 @@ mkdir -p $CURDIR/package/lib
|
|||||||
cp -avrf $CURDIR/llama-cpp-* $CURDIR/package/
|
cp -avrf $CURDIR/llama-cpp-* $CURDIR/package/
|
||||||
cp -rfv $CURDIR/run.sh $CURDIR/package/
|
cp -rfv $CURDIR/run.sh $CURDIR/package/
|
||||||
|
|
||||||
|
# Bundle the ggml shared backends produced by the CPU_ALL_VARIANTS build (libggml-base.so,
|
||||||
|
# libggml.so, libllama.so and the per-microarch libggml-cpu-*.so), all into package/lib.
|
||||||
|
#
|
||||||
|
# Two distinct resolution mechanisms both land here:
|
||||||
|
# - NEEDED deps (libggml-base/libggml/libllama): resolved by the dynamic linker via the
|
||||||
|
# LD_LIBRARY_PATH=$CURDIR/lib that run.sh exports.
|
||||||
|
# - The per-microarch libggml-cpu-*.so are NOT linked; ggml *discovers* them at runtime by
|
||||||
|
# scanning the executable's own directory (readlink /proc/self/exe). run.sh launches via
|
||||||
|
# the bundled $CURDIR/lib/ld.so, so /proc/self/exe -> .../lib/ld.so and ggml scans lib/.
|
||||||
|
# That is why the variants must sit in lib/ (next to ld.so), not just on the link path.
|
||||||
|
# No-op on builds (GPU images, which only build llama-cpp-fallback) that don't produce the all-variants set.
|
||||||
|
if [ -d "$CURDIR/ggml-shared-libs" ]; then
|
||||||
|
echo "Bundling ggml shared backends (CPU_ALL_VARIANTS)..."
|
||||||
|
cp -avf $CURDIR/ggml-shared-libs/*.so* $CURDIR/package/lib/
|
||||||
|
fi
|
||||||
|
|
||||||
# Detect architecture and copy appropriate libraries
|
# Detect architecture and copy appropriate libraries
|
||||||
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
|
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
|
||||||
# x86_64 architecture
|
# x86_64 architecture
|
||||||
|
|||||||
@@ -12,26 +12,12 @@ grep -e "flags" /proc/cpuinfo | head -1
|
|||||||
|
|
||||||
BINARY=llama-cpp-fallback
|
BINARY=llama-cpp-fallback
|
||||||
|
|
||||||
if grep -q -e "\savx\s" /proc/cpuinfo ; then
|
# CPU images (x86, arm64, darwin) ship a single llama-cpp-cpu-all built with ggml
|
||||||
echo "CPU: AVX found OK"
|
# CPU_ALL_VARIANTS: ggml's backend registry dlopens the best libggml-cpu-*.so for this
|
||||||
if [ -e $CURDIR/llama-cpp-avx ]; then
|
# host, so no shell-side AVX probing. GPU images (cublas/sycl/vulkan/hipblas) ship only
|
||||||
BINARY=llama-cpp-avx
|
# llama-cpp-fallback (the accelerator does the compute), so fall back to it when absent.
|
||||||
fi
|
if [ -e $CURDIR/llama-cpp-cpu-all ]; then
|
||||||
fi
|
BINARY=llama-cpp-cpu-all
|
||||||
|
|
||||||
if grep -q -e "\savx2\s" /proc/cpuinfo ; then
|
|
||||||
echo "CPU: AVX2 found OK"
|
|
||||||
if [ -e $CURDIR/llama-cpp-avx2 ]; then
|
|
||||||
BINARY=llama-cpp-avx2
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Check avx 512
|
|
||||||
if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
|
|
||||||
echo "CPU: AVX512F found OK"
|
|
||||||
if [ -e $CURDIR/llama-cpp-avx512 ]; then
|
|
||||||
BINARY=llama-cpp-avx512
|
|
||||||
fi
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ -n "$LLAMACPP_GRPC_SERVERS" ]; then
|
if [ -n "$LLAMACPP_GRPC_SERVERS" ]; then
|
||||||
|
|||||||
@@ -65,6 +65,29 @@ turboquant-avx:
|
|||||||
turboquant-fallback:
|
turboquant-fallback:
|
||||||
$(call turboquant-build,fallback,-DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server)
|
$(call turboquant-build,fallback,-DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server)
|
||||||
|
|
||||||
|
# Single-build CPU backend via ggml CPU_ALL_VARIANTS (mirrors llama-cpp-cpu-all).
|
||||||
|
# turboquant reuses backend/cpp/llama-cpp's CMakeLists.txt (hw_grpc_proto STATIC) and
|
||||||
|
# Makefile (SHARED_LIBS make-var + EXTRA_CMAKE_ARGS), so this passes the same overrides
|
||||||
|
# through to the copied build: SHARED_LIBS=ON, the DL flags, and --target ggml (which
|
||||||
|
# pulls in the per-microarch libggml-cpu-*.so via ggml's add_dependencies). The .so set
|
||||||
|
# is collected for package.sh to bundle into package/lib.
|
||||||
|
turboquant-cpu-all:
|
||||||
|
rm -rf $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build
|
||||||
|
cp -rf $(LLAMA_CPP_DIR) $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build
|
||||||
|
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build purge
|
||||||
|
bash $(CURRENT_MAKEFILE_DIR)/patch-grpc-server.sh $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/grpc-server.cpp
|
||||||
|
$(info $(GREEN)I turboquant build info:cpu-all-variants$(RESET))
|
||||||
|
LLAMA_REPO=$(LLAMA_REPO) LLAMA_VERSION=$(TURBOQUANT_VERSION) \
|
||||||
|
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build llama.cpp
|
||||||
|
bash $(CURRENT_MAKEFILE_DIR)/apply-patches.sh $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/llama.cpp $(PATCHES_DIR)
|
||||||
|
SHARED_LIBS=ON EXTRA_CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON" TARGET="--target grpc-server --target ggml" \
|
||||||
|
LLAMA_REPO=$(LLAMA_REPO) LLAMA_VERSION=$(TURBOQUANT_VERSION) \
|
||||||
|
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build grpc-server
|
||||||
|
cp -rfv $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/grpc-server turboquant-cpu-all
|
||||||
|
rm -rf ggml-shared-libs && mkdir -p ggml-shared-libs
|
||||||
|
find $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/llama.cpp/build \( -name '*.so*' -o -name '*.dylib' \) -exec cp -av {} ggml-shared-libs/ \;
|
||||||
|
@echo "Collected ggml shared backends:" && ls -la ggml-shared-libs/
|
||||||
|
|
||||||
turboquant-grpc:
|
turboquant-grpc:
|
||||||
$(call turboquant-build,grpc,-DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server --target rpc-server)
|
$(call turboquant-build,grpc,-DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server --target rpc-server)
|
||||||
|
|
||||||
|
|||||||
@@ -14,6 +14,15 @@ mkdir -p $CURDIR/package/lib
|
|||||||
cp -avrf $CURDIR/turboquant-* $CURDIR/package/
|
cp -avrf $CURDIR/turboquant-* $CURDIR/package/
|
||||||
cp -rfv $CURDIR/run.sh $CURDIR/package/
|
cp -rfv $CURDIR/run.sh $CURDIR/package/
|
||||||
|
|
||||||
|
# Bundle the ggml shared backends from the CPU_ALL_VARIANTS build into package/lib. ggml
|
||||||
|
# discovers the per-microarch libggml-cpu-*.so by scanning the executable directory, which
|
||||||
|
# (via the bundled lib/ld.so that run.sh launches through) resolves to lib/. See the
|
||||||
|
# matching comment in backend/cpp/llama-cpp/package.sh. No-op on the fallback/ROCm builds.
|
||||||
|
if [ -d "$CURDIR/ggml-shared-libs" ]; then
|
||||||
|
echo "Bundling ggml shared backends (CPU_ALL_VARIANTS)..."
|
||||||
|
cp -avf $CURDIR/ggml-shared-libs/*.so* $CURDIR/package/lib/
|
||||||
|
fi
|
||||||
|
|
||||||
# Detect architecture and copy appropriate libraries
|
# Detect architecture and copy appropriate libraries
|
||||||
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
|
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
|
||||||
# x86_64 architecture
|
# x86_64 architecture
|
||||||
|
|||||||
@@ -12,26 +12,11 @@ grep -e "flags" /proc/cpuinfo | head -1
|
|||||||
|
|
||||||
BINARY=turboquant-fallback
|
BINARY=turboquant-fallback
|
||||||
|
|
||||||
if grep -q -e "\savx\s" /proc/cpuinfo ; then
|
# x86/arm64 ship a single turboquant-cpu-all built with ggml CPU_ALL_VARIANTS: ggml's
|
||||||
echo "CPU: AVX found OK"
|
# backend registry dlopens the best libggml-cpu-*.so for this host, so no shell-side
|
||||||
if [ -e $CURDIR/turboquant-avx ]; then
|
# probing. ROCm ships only turboquant-fallback, so fall back to it when cpu-all is absent.
|
||||||
BINARY=turboquant-avx
|
if [ -e $CURDIR/turboquant-cpu-all ]; then
|
||||||
fi
|
BINARY=turboquant-cpu-all
|
||||||
fi
|
|
||||||
|
|
||||||
if grep -q -e "\savx2\s" /proc/cpuinfo ; then
|
|
||||||
echo "CPU: AVX2 found OK"
|
|
||||||
if [ -e $CURDIR/turboquant-avx2 ]; then
|
|
||||||
BINARY=turboquant-avx2
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Check avx 512
|
|
||||||
if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
|
|
||||||
echo "CPU: AVX512F found OK"
|
|
||||||
if [ -e $CURDIR/turboquant-avx512 ]; then
|
|
||||||
BINARY=turboquant-avx512
|
|
||||||
fi
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ -n "$LLAMACPP_GRPC_SERVERS" ]; then
|
if [ -n "$LLAMACPP_GRPC_SERVERS" ]; then
|
||||||
|
|||||||
@@ -6,10 +6,11 @@ IMAGE_NAME="${IMAGE_NAME:-localai/llama-cpp-darwin}"
|
|||||||
|
|
||||||
pushd backend/cpp/llama-cpp
|
pushd backend/cpp/llama-cpp
|
||||||
|
|
||||||
# make llama-cpp-avx && \
|
# Single build via ggml CPU_ALL_VARIANTS: one binary plus the per-microarch Apple/arm
|
||||||
# make llama-cpp-avx2 && \
|
# loadable backends (apple_m1/m2_m3/m4, armv8.x) that ggml selects at runtime. GGML_METAL stays ON
|
||||||
# make llama-cpp-avx512 && \
|
# and --target ggml also builds ggml-metal (via add_dependencies), so the Metal GPU
|
||||||
make llama-cpp-fallback && \
|
# backend is still produced as a loadable libggml-metal.so (ggml's loadable backends use a .so suffix even on Darwin).
|
||||||
|
make llama-cpp-cpu-all && \
|
||||||
make llama-cpp-grpc && \
|
make llama-cpp-grpc && \
|
||||||
make llama-cpp-rpc-server
|
make llama-cpp-rpc-server
|
||||||
|
|
||||||
@@ -19,13 +20,24 @@ mkdir -p build/darwin
|
|||||||
mkdir -p backend-images
|
mkdir -p backend-images
|
||||||
mkdir -p build/darwin/lib
|
mkdir -p build/darwin/lib
|
||||||
|
|
||||||
# cp -rf backend/cpp/llama-cpp/llama-cpp-avx build/darwin/
|
cp -rf backend/cpp/llama-cpp/llama-cpp-cpu-all build/darwin/
|
||||||
# cp -rf backend/cpp/llama-cpp/llama-cpp-avx2 build/darwin/
|
|
||||||
# cp -rf backend/cpp/llama-cpp/llama-cpp-avx512 build/darwin/
|
|
||||||
cp -rf backend/cpp/llama-cpp/llama-cpp-fallback build/darwin/
|
|
||||||
cp -rf backend/cpp/llama-cpp/llama-cpp-grpc build/darwin/
|
cp -rf backend/cpp/llama-cpp/llama-cpp-grpc build/darwin/
|
||||||
cp -rf backend/cpp/llama-cpp/llama-cpp-rpc-server build/darwin/
|
cp -rf backend/cpp/llama-cpp/llama-cpp-rpc-server build/darwin/
|
||||||
|
|
||||||
|
# Distribute the shared ggml/llama libraries from the CPU_ALL_VARIANTS build. Unlike the
|
||||||
|
# old fully-static fallback build, these have @rpath install names, so the otool loop below
|
||||||
|
# (which only copies deps that exist on disk) will not pick them up. The split is by suffix:
|
||||||
|
# - ggml emits its loadable backends (per-microarch CPU variants, metal, blas) with a .so
|
||||||
|
# suffix EVEN ON DARWIN. These go in the package ROOT next to the binary, because darwin
|
||||||
|
# run.sh execs the binary directly (no bundled ld.so) so ggml's executable-directory
|
||||||
|
# scan looks there.
|
||||||
|
# - the core libraries (libggml-base/libggml/libllama/libllama-common/libmtmd) use the
|
||||||
|
# platform .dylib suffix and are NEEDED deps; they go in lib/, resolved at load time via
|
||||||
|
# the DYLD_LIBRARY_PATH=lib that run.sh exports. -a preserves the version symlinks.
|
||||||
|
SHLIBS=backend/cpp/llama-cpp/ggml-shared-libs
|
||||||
|
cp -a $SHLIBS/*.so build/darwin/
|
||||||
|
cp -a $SHLIBS/*.dylib build/darwin/lib/
|
||||||
|
|
||||||
# Set default additional libs only for Darwin on M chips (arm64)
|
# Set default additional libs only for Darwin on M chips (arm64)
|
||||||
if [[ "$(uname -s)" == "Darwin" && "$(uname -m)" == "arm64" ]]; then
|
if [[ "$(uname -s)" == "Darwin" && "$(uname -m)" == "arm64" ]]; then
|
||||||
ADDITIONAL_LIBS=${ADDITIONAL_LIBS:-$(ls /opt/homebrew/Cellar/protobuf/**/lib/libutf8_validity*.dylib 2>/dev/null)}
|
ADDITIONAL_LIBS=${ADDITIONAL_LIBS:-$(ls /opt/homebrew/Cellar/protobuf/**/lib/libutf8_validity*.dylib 2>/dev/null)}
|
||||||
|
|||||||
Reference in New Issue
Block a user