feat(turboquant.cpp): add new backend

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2026-04-03 20:53:30 +00:00
parent 8577bdcebc
commit 6e11f882f7
12 changed files with 285 additions and 60 deletions

View File

@@ -1828,6 +1828,98 @@ jobs:
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
# llama-cpp-tq (TurboQuant fork)
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64,linux/arm64'
tag-latest: 'auto'
tag-suffix: '-cpu-llama-cpp-tq'
runs-on: 'bigger-runner'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-llama-cpp-tq'
runs-on: 'bigger-runner'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-13-llama-cpp-tq'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'false'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-cuda-13-arm64-llama-cpp-tq'
base-image: "ubuntu:24.04"
runs-on: 'ubuntu-24.04-arm'
ubuntu-version: '2404'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-rocm-hipblas-llama-cpp-tq'
runs-on: 'ubuntu-latest'
base-image: "rocm/dev-ubuntu-24.04:6.4.4"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'false'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-arm64-llama-cpp-tq'
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
runs-on: 'ubuntu-24.04-arm'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2204'
- build-type: 'vulkan'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64,linux/arm64'
tag-latest: 'auto'
tag-suffix: '-gpu-vulkan-llama-cpp-tq'
runs-on: 'bigger-runner'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
# Stablediffusion-ggml
- build-type: ''
cuda-major-version: ""

View File

@@ -14,6 +14,11 @@ jobs:
variable: "LLAMA_VERSION"
branch: "master"
file: "backend/cpp/llama-cpp/Makefile"
- repository: "TheTom/llama-cpp-turboquant"
variable: "LLAMA_VERSION"
branch: "master"
file: "backend/cpp/llama-cpp-tq/Makefile"
branch_suffix: "-tq"
- repository: "ggml-org/whisper.cpp"
variable: "WHISPER_CPP_VERSION"
branch: "master"
@@ -60,7 +65,7 @@ jobs:
push-to-fork: ci-forks/LocalAI
commit-message: ':arrow_up: Update ${{ matrix.repository }}'
title: 'chore: :arrow_up: Update ${{ matrix.repository }} to `${{ steps.bump.outputs.commit }}`'
branch: "update/${{ matrix.variable }}"
branch: "update/${{ matrix.variable }}${{ matrix.branch_suffix }}"
body: ${{ steps.bump.outputs.message }}
signoff: true

1
.gitignore vendored
View File

@@ -9,6 +9,7 @@ prepare-sources
/backend/cpp/llama-cpp/llama.cpp
/backend/cpp/llama-*
!backend/cpp/llama-cpp
!backend/cpp/llama-cpp-tq
/backends
/backend-images
/result.yaml

View File

@@ -544,8 +544,9 @@ backend-images:
mkdir -p backend-images
# Backend metadata: BACKEND_NAME | DOCKERFILE_TYPE | BUILD_CONTEXT | PROGRESS_FLAG | NEEDS_BACKEND_ARG
# llama-cpp is special - uses llama-cpp Dockerfile and doesn't need BACKEND arg
# llama-cpp and forks - use llama-cpp Dockerfile
BACKEND_LLAMA_CPP = llama-cpp|llama-cpp|.|false|false
BACKEND_LLAMA_CPP_TQ = llama-cpp-tq|llama-cpp|.|false|true
# Golang backends
BACKEND_PIPER = piper|golang|.|false|true
@@ -609,6 +610,7 @@ endef
# Generate all docker-build targets
$(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP)))
$(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP_TQ)))
$(eval $(call generate-docker-build-target,$(BACKEND_PIPER)))
$(eval $(call generate-docker-build-target,$(BACKEND_LOCAL_STORE)))
$(eval $(call generate-docker-build-target,$(BACKEND_HUGGINGFACE)))

View File

@@ -58,7 +58,9 @@ ARG CUDA_DOCKER_ARCH
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
ARG CMAKE_ARGS
ENV CMAKE_ARGS=${CMAKE_ARGS}
ARG BACKEND=rerankers
ARG BACKEND=llama-cpp
ARG LLAMA_BACKEND_DIR=${BACKEND}
ENV LLAMA_BACKEND_DIR=${LLAMA_BACKEND_DIR}
ARG BUILD_TYPE
ENV BUILD_TYPE=${BUILD_TYPE}
ARG CUDA_MAJOR_VERSION
@@ -255,32 +257,27 @@ if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
CUDA_ARCH_ESC="${CUDA_DOCKER_ARCH//;/\\;}"
export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
echo "CMAKE_ARGS(env) = ${CMAKE_ARGS}"
rm -rf /LocalAI/backend/cpp/llama-cpp-*-build
rm -rf /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR}-*-build
fi
cd /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR}
if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
cd /LocalAI/backend/cpp/llama-cpp
make llama-cpp-fallback
make llama-cpp-grpc
make llama-cpp-rpc-server
make ARCH=aarch64 build-variants
else
cd /LocalAI/backend/cpp/llama-cpp
make llama-cpp-avx
make llama-cpp-avx2
make llama-cpp-avx512
make llama-cpp-fallback
make llama-cpp-grpc
make llama-cpp-rpc-server
make build-variants
fi
EOT
# Copy libraries using a script to handle architecture differences
RUN make -BC /LocalAI/backend/cpp/llama-cpp package
RUN make -BC /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR} package
FROM scratch
ARG BACKEND=llama-cpp
ARG LLAMA_BACKEND_DIR=${BACKEND}
# Copy all available binaries (the build process only creates the appropriate ones for the target architecture)
COPY --from=builder /LocalAI/backend/cpp/llama-cpp/package/. ./
COPY --from=builder /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR}/package/. ./

View File

@@ -0,0 +1,6 @@
LLAMA_VERSION?=master
LLAMA_REPO?=https://github.com/TheTom/llama-cpp-turboquant
BACKEND_NAME?=llama-cpp-tq
SHARED_DIR?=$(CURDIR)/../llama-cpp
include ../llama-cpp/Makefile

View File

@@ -59,6 +59,11 @@ add_library(hw_grpc_proto
add_executable(${TARGET} grpc-server.cpp json.hpp httplib.h)
# Enable autoparser support if the header exists (not present in all llama.cpp forks)
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/chat-auto-parser.h")
target_compile_definitions(${TARGET} PRIVATE HAS_AUTOPARSER)
endif()
target_include_directories(${TARGET} PRIVATE ../llava)
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})

View File

@@ -1,6 +1,10 @@
LLAMA_VERSION?=a1cfb645307edc61a89e41557f290f441043d3c2
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
BACKEND_NAME?=llama-cpp
SHARED_DIR?=$(CURDIR)
GRPC_SERVER_DIR?=tools/grpc-server
SERVER_SOURCE_DIR?=tools/server
CMAKE_ARGS?=
BUILD_TYPE?=
@@ -67,6 +71,17 @@ ifeq ($(BUILD_TYPE),sycl_f32)
-DCMAKE_CXX_FLAGS="-fsycl"
endif
# Variants to build for each architecture (can be overridden by forks)
X86_64_VARIANTS ?= llama-cpp-avx llama-cpp-avx2 llama-cpp-avx512 llama-cpp-fallback llama-cpp-grpc llama-cpp-rpc-server
ARM64_VARIANTS ?= llama-cpp-fallback llama-cpp-grpc llama-cpp-rpc-server
build-variants:
ifeq ($(ARCH),aarch64)
@for v in $(ARM64_VARIANTS); do $(MAKE) $$v || exit 1; done
else
@for v in $(X86_64_VARIANTS); do $(MAKE) $$v || exit 1; done
endif
INSTALLED_PACKAGES=$(CURDIR)/../grpc/installed_packages
INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
@@ -90,42 +105,42 @@ else
endif
llama-cpp-avx2: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build purge
$(info ${GREEN}I llama-cpp build info:avx2${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx2-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build/grpc-server llama-cpp-avx2
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="$(BACKEND_NAME)-avx2-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build/grpc-server llama-cpp-avx2
llama-cpp-avx512: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build purge
$(info ${GREEN}I llama-cpp build info:avx512${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx512-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build/grpc-server llama-cpp-avx512
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="$(BACKEND_NAME)-avx512-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build/grpc-server llama-cpp-avx512
llama-cpp-avx: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build purge
$(info ${GREEN}I llama-cpp build info:avx${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-avx-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build/grpc-server llama-cpp-avx
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="$(BACKEND_NAME)-avx-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build/grpc-server llama-cpp-avx
llama-cpp-fallback: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build purge
$(info ${GREEN}I llama-cpp build info:fallback${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-fallback-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build/grpc-server llama-cpp-fallback
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="$(BACKEND_NAME)-fallback-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build/grpc-server llama-cpp-fallback
llama-cpp-grpc: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build purge
$(info ${GREEN}I llama-cpp build info:grpc${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-cpp-grpc-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/grpc-server llama-cpp-grpc
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="$(BACKEND_NAME)-grpc-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build/grpc-server llama-cpp-grpc
llama-cpp-rpc-server: llama-cpp-grpc
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server
llama.cpp:
mkdir -p llama.cpp
@@ -133,30 +148,30 @@ llama.cpp:
git init && \
git remote add origin $(LLAMA_REPO) && \
git fetch origin && \
git checkout -b build $(LLAMA_VERSION) && \
(git checkout -b build $(LLAMA_VERSION) || git checkout -b build origin/$(LLAMA_VERSION)) && \
git submodule update --init --recursive --depth 1 --single-branch
llama.cpp/tools/grpc-server: llama.cpp
mkdir -p llama.cpp/tools/grpc-server
bash prepare.sh
llama.cpp/$(GRPC_SERVER_DIR): llama.cpp
mkdir -p llama.cpp/$(GRPC_SERVER_DIR)
SHARED_DIR=$(SHARED_DIR) SERVER_SOURCE_DIR=$(SERVER_SOURCE_DIR) GRPC_SERVER_DIR=$(GRPC_SERVER_DIR) bash $(SHARED_DIR)/prepare.sh
rebuild:
bash prepare.sh
SHARED_DIR=$(SHARED_DIR) SERVER_SOURCE_DIR=$(SERVER_SOURCE_DIR) GRPC_SERVER_DIR=$(GRPC_SERVER_DIR) bash $(SHARED_DIR)/prepare.sh
rm -rf grpc-server
$(MAKE) grpc-server
package:
bash package.sh
bash $(SHARED_DIR)/package.sh
purge:
rm -rf llama.cpp/build
rm -rf llama.cpp/tools/grpc-server
rm -rf llama.cpp/$(GRPC_SERVER_DIR)
rm -rf grpc-server
clean: purge
rm -rf llama.cpp
grpc-server: llama.cpp llama.cpp/tools/grpc-server
grpc-server: llama.cpp llama.cpp/$(GRPC_SERVER_DIR)
@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+bash -c "source $(ONEAPI_VARS); \

View File

@@ -17,7 +17,9 @@
#include "backend.pb.h"
#include "backend.grpc.pb.h"
#include "common.h"
#ifdef HAS_AUTOPARSER
#include "chat-auto-parser.h"
#endif
#include <getopt.h>
#include <grpcpp/ext/proto_server_reflection_plugin.h>
#include <grpcpp/grpcpp.h>
@@ -2665,6 +2667,7 @@ public:
response->set_rendered_template(rendered_template);
#ifdef HAS_AUTOPARSER
// Run differential template analysis to detect tool format markers
if (params_base.use_jinja) {
try {
@@ -2770,6 +2773,7 @@ public:
SRV_WRN("ModelMetadata: failed to run autoparser analysis: %s\n", e.what());
}
}
#endif
return grpc::Status::OK;
}

View File

@@ -5,14 +5,21 @@
set -e
CURDIR=$(dirname "$(realpath $0)")
REPO_ROOT="${CURDIR}/../../.."
# Use working directory (not script location) so forks that share this script work correctly
CURDIR=$(pwd)
SCRIPT_DIR=$(dirname "$(realpath $0)")
REPO_ROOT="${SCRIPT_DIR}/../../.."
# Create lib directory
mkdir -p $CURDIR/package/lib
cp -avrf $CURDIR/llama-cpp-* $CURDIR/package/
cp -rfv $CURDIR/run.sh $CURDIR/package/
# Copy run.sh — prefer local copy, fall back to shared dir (script location)
if [ -f "$CURDIR/run.sh" ]; then
cp -rfv $CURDIR/run.sh $CURDIR/package/
else
cp -rfv $SCRIPT_DIR/run.sh $CURDIR/package/
fi
# Detect architecture and copy appropriate libraries
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then

View File

@@ -1,31 +1,43 @@
#!/bin/bash
## Patches
SHARED_DIR="${SHARED_DIR:-.}"
SERVER_SOURCE_DIR="${SERVER_SOURCE_DIR:-tools/server}"
GRPC_SERVER_DIR="${GRPC_SERVER_DIR:-tools/grpc-server}"
## Apply patches from the `patches` directory
if [ -d "patches" ]; then
for patch in $(ls patches); do
echo "Applying patch $patch"
patch -d llama.cpp/ -p1 < patches/$patch
done
done
fi
set -e
for file in $(ls llama.cpp/tools/server/); do
cp -rfv llama.cpp/tools/server/$file llama.cpp/tools/grpc-server/
# Copy server source files into grpc-server build directory
for file in $(ls llama.cpp/${SERVER_SOURCE_DIR}/); do
cp -rfv llama.cpp/${SERVER_SOURCE_DIR}/$file llama.cpp/${GRPC_SERVER_DIR}/
done
cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/tools/grpc-server/
cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/tools/grpc-server/
# Copy build files — prefer local overrides, fall back to SHARED_DIR
for f in CMakeLists.txt grpc-server.cpp; do
if [ -f "$f" ]; then
cp -r "$f" llama.cpp/${GRPC_SERVER_DIR}/
else
cp -r "$SHARED_DIR/$f" llama.cpp/${GRPC_SERVER_DIR}/
fi
done
cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/${GRPC_SERVER_DIR}/
cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/${GRPC_SERVER_DIR}/
# Add grpc-server subdirectory to the parent CMakeLists.txt
PARENT_CMAKELISTS="llama.cpp/$(dirname ${GRPC_SERVER_DIR})/CMakeLists.txt"
set +e
if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then
if grep -q "grpc-server" "$PARENT_CMAKELISTS"; then
echo "grpc-server already added"
else
echo "add_subdirectory(grpc-server)" >> llama.cpp/tools/CMakeLists.txt
echo "add_subdirectory(grpc-server)" >> "$PARENT_CMAKELISTS"
fi
set -e

View File

@@ -29,6 +29,34 @@
nvidia-cuda-12: "cuda12-llama-cpp"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-llama-cpp"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp"
- &llamacpp_tq
name: "llama-cpp-tq"
alias: "llama-cpp-tq"
license: mit
description: |
TurboQuant llama.cpp fork - quantization research
urls:
- https://github.com/TheTom/llama-cpp-turboquant
tags:
- text-to-text
- LLM
- CPU
- GPU
- Metal
- CUDA
- HIP
capabilities:
default: "cpu-llama-cpp-tq"
nvidia: "cuda12-llama-cpp-tq"
intel: "intel-sycl-f16-llama-cpp-tq"
amd: "rocm-llama-cpp-tq"
metal: "metal-llama-cpp-tq"
vulkan: "vulkan-llama-cpp-tq"
nvidia-l4t: "nvidia-l4t-arm64-llama-cpp-tq"
nvidia-cuda-13: "cuda13-llama-cpp-tq"
nvidia-cuda-12: "cuda12-llama-cpp-tq"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-llama-cpp-tq"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp-tq"
- &whispercpp
name: "whisper"
alias: "whisper"
@@ -1252,6 +1280,57 @@
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-llama-cpp"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-llama-cpp
# llama-cpp-tq (TurboQuant) concrete backends
- !!merge <<: *llamacpp_tq
name: "cpu-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-cpu-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "cuda12-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-12-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "cuda13-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-13-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "rocm-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-rocm-hipblas-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "intel-sycl-f16-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-intel-sycl-f16-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "intel-sycl-f32-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-intel-sycl-f32-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "vulkan-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-vulkan-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "metal-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "nvidia-l4t-arm64-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-arm64-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "cuda13-nvidia-l4t-arm64-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-llama-cpp-tq
## whisper
- !!merge <<: *whispercpp
name: "nvidia-l4t-arm64-whisper"