mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-26 09:26:55 -04:00
New backend = stock llama-cpp grpc-server + the paged patchset (forces LLAMA_PAGED=on), shipped as its own meta-backend (mirrors turboquant, simpler: no fork pin, no grpc-server patching - the paged runtime hooks already exist in grpc-server.cpp). Stock llama-cpp untouched (LLAMA_PAGED?=on retained; the de-risk flip deferred for sign-off). Gallery: qwen3.6-27b-nvfp4 (dense) + qwen3.6-35b-a3b-nvfp4 (MoE) with the benchmark run config (paged_kv, max_batch_tokens, parallel, flash_attention, f16), mudler/ GGUF uris (sha256 TODO until publish). Importer dropdown entry + tests. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
100 lines
5.4 KiB
Makefile
100 lines
5.4 KiB
Makefile
|
|
# llama-cpp-localai-paged is LocalAI's paged-attention llama.cpp variant. It is
# the SAME upstream llama.cpp pin as the stock llama-cpp backend, with the
# LocalAI paged-attention patch series (backend/cpp/llama-cpp/patches/paged/)
# applied on top (LLAMA_PAGED=on). It reuses backend/cpp/llama-cpp's
# grpc-server.cpp / CMakeLists.txt / prepare.sh sources verbatim via a thin
# wrapper, so there is nothing to keep in sync here.
#
# Differences vs the turboquant wrapper (the precedent this is modelled on):
#   - NO LLAMA_REPO / LLAMA_VERSION override: we build the SAME upstream pin as
#     stock llama-cpp (it lives in backend/cpp/llama-cpp/Makefile and is
#     auto-bumped there), so there is no bump_deps.yaml entry to maintain.
#   - NO patch-grpc-server.sh and NO apply-patches.sh: the shared
#     grpc-server.cpp already carries the (runtime-gated) paged option hooks,
#     and the paged patch series is applied by the copied llama-cpp Makefile's
#     own `llama.cpp` target whenever LLAMA_PAGED=on (which we force below).
|
|
|
|
# User-tunable knobs. `?=` assigns only when the variable is not already set,
# so each one can be supplied from the environment or the make command line.
# CMAKE_ARGS is prepended to every flavor's CMake flags by paged-build; the
# remaining knobs are not expanded by any rule visible in this file
# (presumably they are read by the copied llama-cpp Makefile -- TODO confirm).
CMAKE_ARGS ?=
BUILD_TYPE ?=
NATIVE ?= false
ONEAPI_VARS ?= /opt/intel/oneapi/setvars.sh
TARGET ?= --target grpc-server
# CPU count: try nproc first, then sysctl, and fall back to 1.
JOBS ?= $(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1)
ARCH ?= $(shell uname -m)

# Absolute directory of this Makefile ($(dir ...) leaves a trailing slash) and
# the sibling llama-cpp backend directory that every build below copies.
CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
LLAMA_CPP_DIR := $(CURRENT_MAKEFILE_DIR)/../llama-cpp

# ANSI colour escapes wrapped around the build-info message.
GREEN := \033[0;32m
RESET := \033[0m
|
|
|
|
# paged-build: recipe template shared by the per-flavor targets.
#   $(1)  flavor name; selects the llama-cpp-localai-paged-$(1)-build working
#         copy and the llama-cpp-localai-paged-$(1) output binary
#   $(2)  flavor-specific CMake flags, appended after $(CMAKE_ARGS)
#   $(3)  value handed to the copied Makefile as TARGET
#
# Each flavor target:
#   1. recreates the sibling llama-cpp-localai-paged-<flavor>-build directory
#      as a fresh copy of backend/cpp/llama-cpp/ (grpc-server.cpp + prepare.sh +
#      CMakeLists.txt + Makefile) and runs the copy's `purge` target;
#   2. clones the SAME upstream llama.cpp pin into that copy and applies the
#      base AND paged patch series via the copy's own `llama.cpp` target with
#      LLAMA_PAGED=on;
#   3. runs the copy's `grpc-server` target (LLAMA_PAGED=on) and copies the
#      produced binary up as llama-cpp-localai-paged-<flavor>.
# We patch only the *copy*, never the original under backend/cpp/llama-cpp/, so
# the stock llama-cpp build stays untouched.
#
# The banner is emitted with printf as an ordinary recipe line: $(info) prints
# the colour escapes literally and fires while make expands the recipe, i.e.
# before the first command has run.
define paged-build
	rm -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-$(1)-build
	cp -rf $(LLAMA_CPP_DIR) $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-$(1)-build
	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-$(1)-build purge
	@printf '$(GREEN)I llama-cpp-localai-paged build info:%s$(RESET)\n' '$(1)'
	LLAMA_PAGED=on $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-$(1)-build llama.cpp
	CMAKE_ARGS="$(CMAKE_ARGS) $(2)" TARGET="$(3)" LLAMA_PAGED=on \
	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-$(1)-build grpc-server
	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-$(1)-build/grpc-server llama-cpp-localai-paged-$(1)
endef
|
|
|
|
# AVX2 flavor: AVX, AVX2, FMA and F16C on; AVX512 off. The flavor name handed
# to paged-build is the target name with its llama-cpp-localai-paged- prefix
# stripped (avx2).
llama-cpp-localai-paged-avx2:
	$(call paged-build,$(patsubst llama-cpp-localai-paged-%,%,$@),-DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on,--target grpc-server)
|
|
|
|
# AVX512 flavor: AVX, AVX512, FMA and F16C on; AVX2 off. The flavor name handed
# to paged-build is the target name with its llama-cpp-localai-paged- prefix
# stripped (avx512).
llama-cpp-localai-paged-avx512:
	$(call paged-build,$(patsubst llama-cpp-localai-paged-%,%,$@),-DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on,--target grpc-server)
|
|
|
|
# AVX flavor: only AVX on; AVX2, AVX512, FMA, F16C and BMI2 off. The flavor
# name handed to paged-build is the target name with its
# llama-cpp-localai-paged- prefix stripped (avx).
llama-cpp-localai-paged-avx:
	$(call paged-build,$(patsubst llama-cpp-localai-paged-%,%,$@),-DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server)
|
|
|
|
# Fallback flavor: AVX, AVX2, AVX512, FMA, F16C and BMI2 all off. The flavor
# name handed to paged-build is the target name with its
# llama-cpp-localai-paged- prefix stripped (fallback).
llama-cpp-localai-paged-fallback:
	$(call paged-build,$(patsubst llama-cpp-localai-paged-%,%,$@),-DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server)
|
|
|
|
# Single-build CPU backend via ggml CPU_ALL_VARIANTS (mirrors llama-cpp-cpu-all).
# Reuses backend/cpp/llama-cpp's CMakeLists.txt (hw_grpc_proto STATIC) and
# Makefile (SHARED_LIBS make-var + EXTRA_CMAKE_ARGS), so this passes the same
# overrides through to the copied build: SHARED_LIBS=ON, the DL flags, and
# --target ggml (which pulls in the per-microarch libggml-cpu-*.so via ggml's
# add_dependencies). The .so set is collected for package.sh to bundle into
# package/lib.
#
# Sequence: recreate the -cpu-all-build copy of backend/cpp/llama-cpp and run
# its `purge`, `llama.cpp` and `grpc-server` targets with LLAMA_PAGED=on, copy
# the grpc-server binary up as llama-cpp-localai-paged-cpu-all, then gather
# every *.so* / *.dylib under the copy's llama.cpp/build into a freshly
# recreated ./ggml-shared-libs directory.
#
# The banner is emitted with printf as an ordinary recipe line: $(info) prints
# the colour escapes literally and fires while make expands the recipe, i.e.
# before the first command has run.
llama-cpp-localai-paged-cpu-all:
	rm -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-cpu-all-build
	cp -rf $(LLAMA_CPP_DIR) $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-cpu-all-build
	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-cpu-all-build purge
	@printf '$(GREEN)I llama-cpp-localai-paged build info:cpu-all-variants$(RESET)\n'
	LLAMA_PAGED=on $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-cpu-all-build llama.cpp
	SHARED_LIBS=ON EXTRA_CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON" TARGET="--target grpc-server --target ggml" LLAMA_PAGED=on \
	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-cpu-all-build grpc-server
	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-cpu-all-build/grpc-server llama-cpp-localai-paged-cpu-all
	rm -rf ggml-shared-libs && mkdir -p ggml-shared-libs
	find $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-cpu-all-build/llama.cpp/build \( -name '*.so*' -o -name '*.dylib' \) -exec cp -av {} ggml-shared-libs/ \;
	@echo "Collected ggml shared backends:" && ls -la ggml-shared-libs/
|
|
|
|
# grpc flavor: GGML_RPC on with AVX, AVX2, AVX512, FMA, F16C and BMI2 off;
# builds both the grpc-server and rpc-server CMake targets. The flavor name
# handed to paged-build is the target name with its llama-cpp-localai-paged-
# prefix stripped (grpc).
llama-cpp-localai-paged-grpc:
	$(call paged-build,$(patsubst llama-cpp-localai-paged-%,%,$@),-DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server --target rpc-server)
|
|
|
|
# Copies the rpc-server binary out of the grpc flavor's build tree. The
# prerequisite (llama-cpp-localai-paged-grpc, $<) is the target whose build
# requests --target rpc-server; its working copy is the sibling "$<-build"
# directory, and the destination ($@) is this target's own name.
llama-cpp-localai-paged-rpc-server: llama-cpp-localai-paged-grpc
	cp -rf $(CURRENT_MAKEFILE_DIR)/../$<-build/llama.cpp/build/bin/rpc-server $@
|
|
|
|
# Runs package.sh from the current working directory.
# Declared phony because the purge recipe deletes a `package` path from the
# working directory, which suggests packaging leaves one behind (TODO confirm
# against package.sh). Without .PHONY, an existing `package` entry would make
# `make package` report "up to date" and skip the script.
.PHONY: package
package:
	bash package.sh
|
|
|
|
# Removes what this Makefile generates: the sibling
# llama-cpp-localai-paged-*-build working copies, and from the current working
# directory the llama-cpp-localai-paged-* binaries, `package`, and the
# ggml-shared-libs directory created by the cpu-all target.
# Phony so that a stray file named `purge` cannot turn the target into a no-op.
.PHONY: purge
purge:
	rm -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-*-build
	rm -rf llama-cpp-localai-paged-* package ggml-shared-libs
|
|
|
|
# Alias for purge. Phony: it names a command, not a file to be built.
.PHONY: clean
clean: purge
|