diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index 12dcc85f1..7bc063578 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -353,6 +353,19 @@ jobs: dockerfile: "./backend/Dockerfile.llama-cpp" context: "./" ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "8" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-12-turboquant' + runs-on: 'bigger-runner' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "turboquant" + dockerfile: "./backend/Dockerfile.turboquant" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "12" cuda-minor-version: "8" @@ -796,6 +809,19 @@ jobs: dockerfile: "./backend/Dockerfile.llama-cpp" context: "./" ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-13-turboquant' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "turboquant" + dockerfile: "./backend/Dockerfile.turboquant" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "13" cuda-minor-version: "0" @@ -809,6 +835,19 @@ jobs: backend: "llama-cpp" dockerfile: "./backend/Dockerfile.llama-cpp" context: "./" + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/arm64' + skip-drivers: 'false' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-cuda-13-arm64-turboquant' + base-image: "ubuntu:24.04" + runs-on: 'ubuntu-24.04-arm' + ubuntu-version: '2404' + backend: "turboquant" + dockerfile: "./backend/Dockerfile.turboquant" + context: "./" - build-type: 'cublas' cuda-major-version: "13" cuda-minor-version: "0" @@ -1330,6 +1369,19 @@ jobs: dockerfile: "./backend/Dockerfile.llama-cpp" context: "./" ubuntu-version: '2404' + - build-type: 'hipblas' + cuda-major-version: "" + 
cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-rocm-hipblas-turboquant' + runs-on: 'ubuntu-latest' + base-image: "rocm/dev-ubuntu-24.04:7.2.1" + skip-drivers: 'false' + backend: "turboquant" + dockerfile: "./backend/Dockerfile.turboquant" + context: "./" + ubuntu-version: '2404' - build-type: 'hipblas' cuda-major-version: "" cuda-minor-version: "" @@ -1566,6 +1618,19 @@ jobs: dockerfile: "./backend/Dockerfile.llama-cpp" context: "./" ubuntu-version: '2404' + - build-type: 'sycl_f32' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-intel-sycl-f32-turboquant' + runs-on: 'ubuntu-latest' + base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04" + skip-drivers: 'false' + backend: "turboquant" + dockerfile: "./backend/Dockerfile.turboquant" + context: "./" + ubuntu-version: '2404' - build-type: 'sycl_f16' cuda-major-version: "" cuda-minor-version: "" @@ -1579,6 +1644,19 @@ jobs: dockerfile: "./backend/Dockerfile.llama-cpp" context: "./" ubuntu-version: '2404' + - build-type: 'sycl_f16' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-intel-sycl-f16-turboquant' + runs-on: 'ubuntu-latest' + base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04" + skip-drivers: 'false' + backend: "turboquant" + dockerfile: "./backend/Dockerfile.turboquant" + context: "./" + ubuntu-version: '2404' - build-type: 'intel' cuda-major-version: "" cuda-minor-version: "" @@ -1958,6 +2036,19 @@ jobs: dockerfile: "./backend/Dockerfile.llama-cpp" context: "./" ubuntu-version: '2404' + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64,linux/arm64' + tag-latest: 'auto' + tag-suffix: '-cpu-turboquant' + runs-on: 'bigger-runner' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "turboquant" + dockerfile: "./backend/Dockerfile.turboquant" + context: "./" 
+ ubuntu-version: '2404' - build-type: '' cuda-major-version: "" cuda-minor-version: "" @@ -1984,6 +2075,19 @@ jobs: dockerfile: "./backend/Dockerfile.llama-cpp" context: "./" ubuntu-version: '2204' + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "0" + platforms: 'linux/arm64' + skip-drivers: 'false' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-arm64-turboquant' + base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0" + runs-on: 'ubuntu-24.04-arm' + backend: "turboquant" + dockerfile: "./backend/Dockerfile.turboquant" + context: "./" + ubuntu-version: '2204' - build-type: 'vulkan' cuda-major-version: "" cuda-minor-version: "" @@ -1997,6 +2101,19 @@ jobs: dockerfile: "./backend/Dockerfile.llama-cpp" context: "./" ubuntu-version: '2404' + - build-type: 'vulkan' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64,linux/arm64' + tag-latest: 'auto' + tag-suffix: '-gpu-vulkan-turboquant' + runs-on: 'bigger-runner' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "turboquant" + dockerfile: "./backend/Dockerfile.turboquant" + context: "./" + ubuntu-version: '2404' # Stablediffusion-ggml - build-type: '' cuda-major-version: "" diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml index 0e3dd8d96..7117950e0 100644 --- a/.github/workflows/bump_deps.yaml +++ b/.github/workflows/bump_deps.yaml @@ -18,6 +18,10 @@ jobs: variable: "IK_LLAMA_VERSION" branch: "main" file: "backend/cpp/ik-llama-cpp/Makefile" + - repository: "TheTom/llama-cpp-turboquant" + variable: "TURBOQUANT_VERSION" + branch: "feature/turboquant-kv-cache" + file: "backend/cpp/turboquant/Makefile" - repository: "ggml-org/whisper.cpp" variable: "WHISPER_CPP_VERSION" branch: "master" diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml index 62a1fa326..761fda665 100644 --- a/.github/workflows/test-extra.yml +++ b/.github/workflows/test-extra.yml @@ -31,6 +31,7 @@ jobs: llama-cpp-quantization: ${{ 
steps.detect.outputs.llama-cpp-quantization }} llama-cpp: ${{ steps.detect.outputs.llama-cpp }} ik-llama-cpp: ${{ steps.detect.outputs.ik-llama-cpp }} + turboquant: ${{ steps.detect.outputs.turboquant }} vllm: ${{ steps.detect.outputs.vllm }} acestep-cpp: ${{ steps.detect.outputs.acestep-cpp }} qwen3-tts-cpp: ${{ steps.detect.outputs.qwen3-tts-cpp }} @@ -519,6 +520,29 @@ jobs: - name: Build ik-llama-cpp backend image and run gRPC e2e tests run: | make test-extra-backend-ik-llama-cpp + tests-turboquant-grpc: + needs: detect-changes + if: needs.detect-changes.outputs.turboquant == 'true' || needs.detect-changes.outputs.run-all == 'true' + runs-on: ubuntu-latest + timeout-minutes: 90 + steps: + - name: Clone + uses: actions/checkout@v6 + with: + submodules: true + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version: '1.25.4' + # Exercises the turboquant (llama.cpp fork) backend with KV-cache + # quantization enabled. The convenience target sets + # BACKEND_TEST_CACHE_TYPE_K / _V=turbo3, which are plumbed into the + # ModelOptions.CacheTypeKey/Value gRPC fields. LoadModel-success + + # backend stdout/stderr (captured by the Ginkgo suite) prove the + # cache-type config path reaches the fork's KV-cache init. + - name: Build turboquant backend image and run gRPC e2e tests + run: | + make test-extra-backend-turboquant # tests-vllm-grpc is currently disabled in CI.
# # The prebuilt vllm CPU wheel is compiled with AVX-512 VNNI/BF16 diff --git a/Makefile b/Makefile index 14cb1b0ee..133133086 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Disable parallel execution for backend builds -.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp +.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros 
backends/sam3-cpp backends/qwen3-tts-cpp GOCMD=go GOTEST=$(GOCMD) test @@ -502,6 +502,8 @@ test-extra-backend: protogen-go BACKEND_TEST_OPTIONS="$$BACKEND_TEST_OPTIONS" \ BACKEND_TEST_TOOL_PROMPT="$$BACKEND_TEST_TOOL_PROMPT" \ BACKEND_TEST_TOOL_NAME="$$BACKEND_TEST_TOOL_NAME" \ + BACKEND_TEST_CACHE_TYPE_K="$$BACKEND_TEST_CACHE_TYPE_K" \ + BACKEND_TEST_CACHE_TYPE_V="$$BACKEND_TEST_CACHE_TYPE_V" \ go test -v -timeout 30m ./tests/e2e-backends/... ## Convenience wrappers: build the image, then exercise it. @@ -511,6 +513,18 @@ test-extra-backend-llama-cpp: docker-build-llama-cpp test-extra-backend-ik-llama-cpp: docker-build-ik-llama-cpp BACKEND_IMAGE=local-ai-backend:ik-llama-cpp $(MAKE) test-extra-backend +## turboquant: exercises the llama.cpp-fork backend with the fork's +## *TurboQuant-specific* KV-cache types (turbo3 for both K and V). turbo3 +## is what makes this backend distinct from stock llama-cpp — picking q8_0 +## here would only test the standard llama.cpp code path that the upstream +## llama-cpp backend already covers. The fork auto-enables flash_attention +## when turbo3/turbo4 are active, so we don't need to set it explicitly. +test-extra-backend-turboquant: docker-build-turboquant + BACKEND_IMAGE=local-ai-backend:turboquant \ + BACKEND_TEST_CACHE_TYPE_K=turbo3 \ + BACKEND_TEST_CACHE_TYPE_V=turbo3 \ + $(MAKE) test-extra-backend + ## Audio transcription wrapper for the llama-cpp backend. ## Drives the new AudioTranscription / AudioTranscriptionStream RPCs against ## ggml-org/Qwen3-ASR-0.6B-GGUF (a small ASR model that requires its mmproj @@ -647,6 +661,9 @@ backend-images: BACKEND_LLAMA_CPP = llama-cpp|llama-cpp|.|false|false # ik-llama-cpp is a fork of llama.cpp with superior CPU performance BACKEND_IK_LLAMA_CPP = ik-llama-cpp|ik-llama-cpp|.|false|false +# turboquant is a llama.cpp fork with TurboQuant KV-cache quantization. +# Reuses backend/cpp/llama-cpp grpc-server sources via a thin wrapper Makefile.
+BACKEND_TURBOQUANT = turboquant|turboquant|.|false|false # Golang backends BACKEND_PIPER = piper|golang|.|false|true @@ -721,6 +738,7 @@ endef # Generate all docker-build targets $(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP))) $(eval $(call generate-docker-build-target,$(BACKEND_IK_LLAMA_CPP))) +$(eval $(call generate-docker-build-target,$(BACKEND_TURBOQUANT))) $(eval $(call generate-docker-build-target,$(BACKEND_PIPER))) $(eval $(call generate-docker-build-target,$(BACKEND_LOCAL_STORE))) $(eval $(call generate-docker-build-target,$(BACKEND_HUGGINGFACE))) @@ -767,7 +785,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_SAM3_CPP))) docker-save-%: backend-images docker save local-ai-backend:$* -o backend-images/$*.tar -docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp +docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr 
docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp ######################################################## ### Mock Backend for E2E Tests diff --git a/backend/Dockerfile.turboquant b/backend/Dockerfile.turboquant new file mode 100644 index 000000000..70251eadd --- /dev/null +++ b/backend/Dockerfile.turboquant @@ -0,0 +1,290 @@ +ARG BASE_IMAGE=ubuntu:24.04 +ARG GRPC_BASE_IMAGE=${BASE_IMAGE} + + +# The grpc target does one thing, it builds and installs GRPC. This is in it's own layer so that it can be effectively cached by CI. +# You probably don't need to change anything here, and if you do, make sure that CI is adjusted so that the cache continues to work. +FROM ${GRPC_BASE_IMAGE} AS grpc + +# This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI +ARG GRPC_MAKEFLAGS="-j4 -Otarget" +ARG GRPC_VERSION=v1.65.0 +ARG CMAKE_FROM_SOURCE=false +# CUDA Toolkit 13.x compatibility: CMake 3.31.9+ fixes toolchain detection/arch table issues +ARG CMAKE_VERSION=3.31.10 + +ENV MAKEFLAGS=${GRPC_MAKEFLAGS} + +WORKDIR /build + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + build-essential curl libssl-dev \ + git wget && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Install CMake (the version in 22.04 is too old) +RUN </dev/null || ls /opt/rocm*/lib64/rocblas/library/Kernels* 2>/dev/null) | grep -oP 'gfx[0-9a-z+-]+' | sort -u || \ + echo "WARNING: No rocBLAS kernel data found" \ + ; fi + +RUN echo "TARGETARCH: $TARGETARCH" + +# We need protoc installed, and the version in 22.04 is too old. We will create one as part installing the GRPC build below +# but that will also being in a newer version of absl which stablediffusion cannot compile with. 
This version of protoc is only +# here so that we can generate the grpc code for the stablediffusion build +RUN </dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1) +ARCH?=$(shell uname -m) + +CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +LLAMA_CPP_DIR := $(CURRENT_MAKEFILE_DIR)/../llama-cpp + +GREEN := \033[0;32m +RESET := \033[0m + +# turboquant is a llama.cpp fork. Rather than duplicating grpc-server.cpp / CMakeLists.txt / +# prepare.sh we reuse the ones in backend/cpp/llama-cpp, and only swap which repo+sha the +# fetch step pulls. Each flavor target copies ../llama-cpp into a sibling ../turboquant--build +# directory, then invokes llama-cpp's own build-llama-cpp-grpc-server with LLAMA_REPO/LLAMA_VERSION +# overridden to point at the fork. +PATCHES_DIR := $(CURRENT_MAKEFILE_DIR)/patches + +# Each flavor target: +# 1. copies backend/cpp/llama-cpp/ (grpc-server.cpp + prepare.sh + CMakeLists.txt + Makefile) +# into a sibling turboquant--build directory; +# 2. clones the turboquant fork into turboquant--build/llama.cpp via the copy's +# own `llama.cpp` target, overriding LLAMA_REPO/LLAMA_VERSION; +# 3. applies patches from backend/cpp/turboquant/patches/ to the cloned fork sources +# (needed until the fork catches up with upstream server-context.cpp changes); +# 4. runs the copy's `grpc-server` target, which produces the binary we copy up as +# turboquant-. +define turboquant-build + rm -rf $(CURRENT_MAKEFILE_DIR)/../turboquant-$(1)-build + cp -rf $(LLAMA_CPP_DIR) $(CURRENT_MAKEFILE_DIR)/../turboquant-$(1)-build + $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-$(1)-build purge + # Augment the copied grpc-server.cpp's KV-cache allow-list with the + # fork's turbo2/turbo3/turbo4 types. We patch the *copy*, never the + # original under backend/cpp/llama-cpp/, so the stock llama-cpp build + # stays compiling against vanilla upstream. 
+ bash $(CURRENT_MAKEFILE_DIR)/patch-grpc-server.sh $(CURRENT_MAKEFILE_DIR)/../turboquant-$(1)-build/grpc-server.cpp + $(info $(GREEN)I turboquant build info:$(1)$(RESET)) + LLAMA_REPO=$(LLAMA_REPO) LLAMA_VERSION=$(TURBOQUANT_VERSION) \ + $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-$(1)-build llama.cpp + bash $(CURRENT_MAKEFILE_DIR)/apply-patches.sh $(CURRENT_MAKEFILE_DIR)/../turboquant-$(1)-build/llama.cpp $(PATCHES_DIR) + CMAKE_ARGS="$(CMAKE_ARGS) $(2)" TARGET="$(3)" \ + LLAMA_REPO=$(LLAMA_REPO) LLAMA_VERSION=$(TURBOQUANT_VERSION) \ + $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-$(1)-build grpc-server + cp -rfv $(CURRENT_MAKEFILE_DIR)/../turboquant-$(1)-build/grpc-server turboquant-$(1) +endef + +turboquant-avx2: + $(call turboquant-build,avx2,-DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on,--target grpc-server) + +turboquant-avx512: + $(call turboquant-build,avx512,-DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on,--target grpc-server) + +turboquant-avx: + $(call turboquant-build,avx,-DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server) + +turboquant-fallback: + $(call turboquant-build,fallback,-DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server) + +turboquant-grpc: + $(call turboquant-build,grpc,-DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server --target rpc-server) + +turboquant-rpc-server: turboquant-grpc + cp -rf $(CURRENT_MAKEFILE_DIR)/../turboquant-grpc-build/llama.cpp/build/bin/rpc-server turboquant-rpc-server + +package: + bash package.sh + +purge: + rm -rf $(CURRENT_MAKEFILE_DIR)/../turboquant-*-build + rm -rf turboquant-* package + +clean: purge diff --git a/backend/cpp/turboquant/apply-patches.sh b/backend/cpp/turboquant/apply-patches.sh new file mode 100755 index 000000000..c6aea396e 
--- /dev/null +++ b/backend/cpp/turboquant/apply-patches.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# Apply the turboquant patch series to a cloned llama-cpp-turboquant checkout. +# +# The turboquant fork branched from upstream llama.cpp before a few API changes +# that the shared backend/cpp/llama-cpp/grpc-server.cpp depends on. We carry +# those upstream commits as patch files under backend/cpp/turboquant/patches/ +# and apply them here so the reused grpc-server source compiles against the +# fork unmodified. +# +# Drop the corresponding patch from patches/ whenever the fork catches up with +# upstream — the build will fail fast if a patch stops applying, which is the +# signal to retire it. + +set -euo pipefail + +if [[ $# -ne 2 ]]; then + echo "usage: $0 " >&2 + exit 2 +fi + +SRC_DIR=$1 +PATCHES_DIR=$2 + +if [[ ! -d "$SRC_DIR" ]]; then + echo "source dir does not exist: $SRC_DIR" >&2 + exit 2 +fi + +if [[ ! -d "$PATCHES_DIR" ]]; then + echo "no patches dir at $PATCHES_DIR, nothing to apply" + exit 0 +fi + +shopt -s nullglob +patches=("$PATCHES_DIR"/*.patch) +shopt -u nullglob + +if [[ ${#patches[@]} -eq 0 ]]; then + echo "no .patch files in $PATCHES_DIR, nothing to apply" + exit 0 +fi + +cd "$SRC_DIR" + +for patch in "${patches[@]}"; do + echo "==> applying $patch" + git apply --verbose "$patch" +done + +echo "all turboquant patches applied successfully" diff --git a/backend/cpp/turboquant/package.sh b/backend/cpp/turboquant/package.sh new file mode 100755 index 000000000..d5402fc31 --- /dev/null +++ b/backend/cpp/turboquant/package.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +# Script to copy the appropriate libraries based on architecture +# This script is used in the final stage of the Dockerfile + +set -e + +CURDIR=$(dirname "$(realpath $0)") +REPO_ROOT="${CURDIR}/../../.." 
+ +# Create lib directory +mkdir -p $CURDIR/package/lib + +cp -avrf $CURDIR/turboquant-* $CURDIR/package/ +cp -rfv $CURDIR/run.sh $CURDIR/package/ + +# Detect architecture and copy appropriate libraries +if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then + # x86_64 architecture + echo "Detected x86_64 architecture, copying x86_64 libraries..." + cp -arfLv /lib64/ld-linux-x86-64.so.2 $CURDIR/package/lib/ld.so + cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6 + cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1 + cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6 + cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6 + cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1 + cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2 + cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1 + cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0 +elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then + # ARM64 architecture + echo "Detected ARM64 architecture, copying ARM64 libraries..." 
+ cp -arfLv /lib/ld-linux-aarch64.so.1 $CURDIR/package/lib/ld.so + cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6 + cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1 + cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6 + cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6 + cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1 + cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2 + cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1 + cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0 +else + echo "Error: Could not detect architecture" + exit 1 +fi + +# Package GPU libraries based on BUILD_TYPE +GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh" +if [ -f "$GPU_LIB_SCRIPT" ]; then + echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..." + source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib" + package_gpu_libs +fi + +echo "Packaging completed successfully" +ls -liah $CURDIR/package/ +ls -liah $CURDIR/package/lib/ diff --git a/backend/cpp/turboquant/patch-grpc-server.sh b/backend/cpp/turboquant/patch-grpc-server.sh new file mode 100755 index 000000000..5b534ece1 --- /dev/null +++ b/backend/cpp/turboquant/patch-grpc-server.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Augment the shared backend/cpp/llama-cpp/grpc-server.cpp allow-list of KV-cache +# types so the gRPC `LoadModel` call accepts the TurboQuant-specific +# `turbo2` / `turbo3` / `turbo4` cache types. +# +# We do this on the *copy* sitting in turboquant--build/, never on the +# original under backend/cpp/llama-cpp/, so the stock llama-cpp build keeps +# compiling against vanilla upstream which does not know about GGML_TYPE_TURBO*. +# +# Idempotent: skips the insertion if the marker is already present (so re-runs +# of the same build dir don't double-insert). 
+ +set -euo pipefail + +if [[ $# -ne 1 ]]; then + echo "usage: $0 " >&2 + exit 2 +fi + +SRC=$1 + +if [[ ! -f "$SRC" ]]; then + echo "grpc-server.cpp not found at $SRC" >&2 + exit 2 +fi + +if grep -q 'GGML_TYPE_TURBO2_0' "$SRC"; then + echo "==> $SRC already has TurboQuant cache types, skipping" + exit 0 +fi + +echo "==> patching $SRC to allow turbo2/turbo3/turbo4 KV-cache types" + +# Insert the three TURBO entries right after the first ` GGML_TYPE_Q5_1,` +# line (the kv_cache_types[] allow-list). Using awk because the builder image +# does not ship python3, and GNU sed's multi-line `a\` quoting is awkward. +awk ' + /^ GGML_TYPE_Q5_1,$/ && !done { + print + print " // turboquant fork extras — added by patch-grpc-server.sh" + print " GGML_TYPE_TURBO2_0," + print " GGML_TYPE_TURBO3_0," + print " GGML_TYPE_TURBO4_0," + done = 1 + next + } + { print } + END { + if (!done) { + print "patch-grpc-server.sh: anchor ` GGML_TYPE_Q5_1,` not found" > "/dev/stderr" + exit 1 + } + } +' "$SRC" > "$SRC.tmp" +mv "$SRC.tmp" "$SRC" + +echo "==> patched OK" diff --git a/backend/cpp/turboquant/patches/0001-server-respect-the-ignore-eos-flag.patch b/backend/cpp/turboquant/patches/0001-server-respect-the-ignore-eos-flag.patch new file mode 100644 index 000000000..0f1feed88 --- /dev/null +++ b/backend/cpp/turboquant/patches/0001-server-respect-the-ignore-eos-flag.patch @@ -0,0 +1,83 @@ +From 660600081fb7b9b769ded5c805a2d39a419f0a0d Mon Sep 17 00:00:00 2001 +From: Yuri Khrustalev +Date: Wed, 8 Apr 2026 11:12:15 -0400 +Subject: [PATCH] server: respect the ignore eos flag (#21203) + +--- + tools/server/server-context.cpp | 3 +++ + tools/server/server-context.h | 3 +++ + tools/server/server-task.cpp | 3 ++- + tools/server/server-task.h | 1 + + 4 files changed, 9 insertions(+), 1 deletion(-) + +diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp +index 9d3ac538..b31981c5 100644 +--- a/tools/server/server-context.cpp ++++ b/tools/server/server-context.cpp +@@ -3033,6 
+3033,8 @@ server_context_meta server_context::get_meta() const { + /* fim_rep_token */ llama_vocab_fim_rep(impl->vocab), + /* fim_sep_token */ llama_vocab_fim_sep(impl->vocab), + ++ /* logit_bias_eog */ impl->params_base.sampling.logit_bias_eog, ++ + /* model_vocab_type */ llama_vocab_type(impl->vocab), + /* model_vocab_n_tokens */ llama_vocab_n_tokens(impl->vocab), + /* model_n_ctx_train */ llama_model_n_ctx_train(impl->model), +@@ -3117,6 +3119,7 @@ std::unique_ptr server_routes::handle_completions_impl( + ctx_server.vocab, + params, + meta->slot_n_ctx, ++ meta->logit_bias_eog, + data); + task.id_slot = json_value(data, "id_slot", -1); + +diff --git a/tools/server/server-context.h b/tools/server/server-context.h +index d7ce8735..6ea9afc0 100644 +--- a/tools/server/server-context.h ++++ b/tools/server/server-context.h +@@ -39,6 +39,9 @@ struct server_context_meta { + llama_token fim_rep_token; + llama_token fim_sep_token; + ++ // sampling ++ std::vector logit_bias_eog; ++ + // model meta + enum llama_vocab_type model_vocab_type; + int32_t model_vocab_n_tokens; +diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp +index 4cc87bc5..856b3f0e 100644 +--- a/tools/server/server-task.cpp ++++ b/tools/server/server-task.cpp +@@ -239,6 +239,7 @@ task_params server_task::params_from_json_cmpl( + const llama_vocab * vocab, + const common_params & params_base, + const int n_ctx_slot, ++ const std::vector & logit_bias_eog, + const json & data) { + task_params params; + +@@ -562,7 +563,7 @@ task_params server_task::params_from_json_cmpl( + if (params.sampling.ignore_eos) { + params.sampling.logit_bias.insert( + params.sampling.logit_bias.end(), +- defaults.sampling.logit_bias_eog.begin(), defaults.sampling.logit_bias_eog.end()); ++ logit_bias_eog.begin(), logit_bias_eog.end()); + } + } + +diff --git a/tools/server/server-task.h b/tools/server/server-task.h +index d855bf08..243e47a8 100644 +--- a/tools/server/server-task.h ++++ b/tools/server/server-task.h 
+@@ -209,6 +209,7 @@ struct server_task { + const llama_vocab * vocab, + const common_params & params_base, + const int n_ctx_slot, ++ const std::vector & logit_bias_eog, + const json & data); + + // utility function +-- +2.43.0 + diff --git a/backend/cpp/turboquant/patches/0002-ggml-rpc-bump-op-count-to-97.patch b/backend/cpp/turboquant/patches/0002-ggml-rpc-bump-op-count-to-97.patch new file mode 100644 index 000000000..19bf07c59 --- /dev/null +++ b/backend/cpp/turboquant/patches/0002-ggml-rpc-bump-op-count-to-97.patch @@ -0,0 +1,13 @@ +diff --git a/ggml/include/ggml-rpc.h b/ggml/include/ggml-rpc.h +index 1c11495..31af239 100644 +--- a/ggml/include/ggml-rpc.h ++++ b/ggml/include/ggml-rpc.h +@@ -11,7 +11,7 @@ extern "C" { + #define RPC_PROTO_PATCH_VERSION 1 + + #ifdef __cplusplus +-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION"); ++static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION"); + #endif + + #define GGML_RPC_MAX_SERVERS 16 diff --git a/backend/cpp/turboquant/run.sh b/backend/cpp/turboquant/run.sh new file mode 100755 index 000000000..b0239e237 --- /dev/null +++ b/backend/cpp/turboquant/run.sh @@ -0,0 +1,65 @@ +#!/bin/bash +set -ex + +# Get the absolute current dir where the script is located +CURDIR=$(dirname "$(realpath $0)") + +cd / + +echo "CPU info:" +grep -e "model\sname" /proc/cpuinfo | head -1 +grep -e "flags" /proc/cpuinfo | head -1 + +BINARY=turboquant-fallback + +if grep -q -e "\savx\s" /proc/cpuinfo ; then + echo "CPU: AVX found OK" + if [ -e $CURDIR/turboquant-avx ]; then + BINARY=turboquant-avx + fi +fi + +if grep -q -e "\savx2\s" /proc/cpuinfo ; then + echo "CPU: AVX2 found OK" + if [ -e $CURDIR/turboquant-avx2 ]; then + BINARY=turboquant-avx2 + fi +fi + +# Check avx 512 +if grep -q -e "\savx512f\s" /proc/cpuinfo ; then + echo "CPU: AVX512F found OK" + if [ -e $CURDIR/turboquant-avx512 ]; then + BINARY=turboquant-avx512 + fi +fi + +if [ -n 
"$LLAMACPP_GRPC_SERVERS" ]; then + if [ -e $CURDIR/turboquant-grpc ]; then + BINARY=turboquant-grpc + fi +fi + +# Extend ld library path with the dir where this script is located/lib +if [ "$(uname)" == "Darwin" ]; then + export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH +else + export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH + # Tell rocBLAS where to find TensileLibrary data (GPU kernel tuning files) + if [ -d "$CURDIR/lib/rocblas/library" ]; then + export ROCBLAS_TENSILE_LIBPATH=$CURDIR/lib/rocblas/library + fi +fi + +# If there is a lib/ld.so, use it +if [ -f $CURDIR/lib/ld.so ]; then + echo "Using lib/ld.so" + echo "Using binary: $BINARY" + exec $CURDIR/lib/ld.so $CURDIR/$BINARY "$@" +fi + +echo "Using binary: $BINARY" +exec $CURDIR/$BINARY "$@" + +# We should never reach this point, however just in case we do, run fallback +exec $CURDIR/turboquant-fallback "$@" diff --git a/backend/index.yaml b/backend/index.yaml index d0f75a4ca..c29ca5fb8 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -43,6 +43,35 @@ - CPU capabilities: default: "cpu-ik-llama-cpp" +- &turboquant + name: "turboquant" + alias: "turboquant" + license: mit + description: | + Fork of llama.cpp adding the TurboQuant KV-cache quantization scheme. + Reuses the LocalAI llama.cpp gRPC server sources against the fork's libllama. 
+ urls: + - https://github.com/TheTom/llama-cpp-turboquant + tags: + - text-to-text + - LLM + - CPU + - GPU + - CUDA + - HIP + - turboquant + - kv-cache + capabilities: + default: "cpu-turboquant" + nvidia: "cuda12-turboquant" + intel: "intel-sycl-f16-turboquant" + amd: "rocm-turboquant" + vulkan: "vulkan-turboquant" + nvidia-l4t: "nvidia-l4t-arm64-turboquant" + nvidia-cuda-13: "cuda13-turboquant" + nvidia-cuda-12: "cuda12-turboquant" + nvidia-l4t-cuda-12: "nvidia-l4t-arm64-turboquant" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-turboquant" - &whispercpp name: "whisper" alias: "whisper" @@ -916,6 +945,19 @@ name: "ik-llama-cpp-development" capabilities: default: "cpu-ik-llama-cpp-development" +- !!merge <<: *turboquant + name: "turboquant-development" + capabilities: + default: "cpu-turboquant-development" + nvidia: "cuda12-turboquant-development" + intel: "intel-sycl-f16-turboquant-development" + amd: "rocm-turboquant-development" + vulkan: "vulkan-turboquant-development" + nvidia-l4t: "nvidia-l4t-arm64-turboquant-development" + nvidia-cuda-13: "cuda13-turboquant-development" + nvidia-cuda-12: "cuda12-turboquant-development" + nvidia-l4t-cuda-12: "nvidia-l4t-arm64-turboquant-development" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-turboquant-development" - !!merge <<: *neutts name: "cpu-neutts" uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-neutts" @@ -1357,6 +1399,97 @@ uri: "quay.io/go-skynet/local-ai-backends:master-cpu-ik-llama-cpp" mirrors: - localai/localai-backends:master-cpu-ik-llama-cpp +## turboquant +- !!merge <<: *turboquant + name: "cpu-turboquant" + uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-turboquant" + mirrors: + - localai/localai-backends:latest-cpu-turboquant +- !!merge <<: *turboquant + name: "cpu-turboquant-development" + uri: "quay.io/go-skynet/local-ai-backends:master-cpu-turboquant" + mirrors: + - localai/localai-backends:master-cpu-turboquant +- !!merge <<: *turboquant + name: "cuda12-turboquant" + uri: 
"quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-turboquant" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-12-turboquant +- !!merge <<: *turboquant + name: "cuda12-turboquant-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-turboquant" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-12-turboquant +- !!merge <<: *turboquant + name: "cuda13-turboquant" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-turboquant" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-13-turboquant +- !!merge <<: *turboquant + name: "cuda13-turboquant-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-turboquant" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-13-turboquant +- !!merge <<: *turboquant + name: "rocm-turboquant" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-turboquant" + mirrors: + - localai/localai-backends:latest-gpu-rocm-hipblas-turboquant +- !!merge <<: *turboquant + name: "rocm-turboquant-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-turboquant" + mirrors: + - localai/localai-backends:master-gpu-rocm-hipblas-turboquant +- !!merge <<: *turboquant + name: "intel-sycl-f32-turboquant" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-turboquant" + mirrors: + - localai/localai-backends:latest-gpu-intel-sycl-f32-turboquant +- !!merge <<: *turboquant + name: "intel-sycl-f32-turboquant-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-turboquant" + mirrors: + - localai/localai-backends:master-gpu-intel-sycl-f32-turboquant +- !!merge <<: *turboquant + name: "intel-sycl-f16-turboquant" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-turboquant" + mirrors: + - localai/localai-backends:latest-gpu-intel-sycl-f16-turboquant +- !!merge <<: *turboquant + name: "intel-sycl-f16-turboquant-development" 
+ uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-turboquant" + mirrors: + - localai/localai-backends:master-gpu-intel-sycl-f16-turboquant +- !!merge <<: *turboquant + name: "vulkan-turboquant" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-turboquant" + mirrors: + - localai/localai-backends:latest-gpu-vulkan-turboquant +- !!merge <<: *turboquant + name: "vulkan-turboquant-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-turboquant" + mirrors: + - localai/localai-backends:master-gpu-vulkan-turboquant +- !!merge <<: *turboquant + name: "nvidia-l4t-arm64-turboquant" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-turboquant" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-arm64-turboquant +- !!merge <<: *turboquant + name: "nvidia-l4t-arm64-turboquant-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-turboquant" + mirrors: + - localai/localai-backends:master-nvidia-l4t-arm64-turboquant +- !!merge <<: *turboquant + name: "cuda13-nvidia-l4t-arm64-turboquant" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-turboquant" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-turboquant +- !!merge <<: *turboquant + name: "cuda13-nvidia-l4t-arm64-turboquant-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-turboquant" + mirrors: + - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-turboquant ## whisper - !!merge <<: *whispercpp name: "nvidia-l4t-arm64-whisper" diff --git a/docs/content/advanced/model-configuration.md b/docs/content/advanced/model-configuration.md index 57e36322a..172e50b65 100644 --- a/docs/content/advanced/model-configuration.md +++ b/docs/content/advanced/model-configuration.md @@ -320,8 +320,8 @@ These options apply when using the `vllm` backend: | `disable_log_stats` | bool | Disable logging statistics | | `dtype` | string | Data type (e.g., 
`float16`, `bfloat16`) | | `flash_attention` | string | Flash attention configuration | -| `cache_type_k` | string | Key cache type | -| `cache_type_v` | string | Value cache type | +| `cache_type_k` | string | Key cache quantization type. Maps to llama.cpp's `-ctk`. Accepted values for llama.cpp-family backends (`llama-cpp`, `ik-llama-cpp`, `turboquant`): `f16`, `f32`, `q8_0`, `q4_0`, `q4_1`, `q5_0`, `q5_1`. The `turboquant` backend additionally accepts `turbo2`, `turbo3`, `turbo4` — the fork's TurboQuant KV-cache schemes. `turbo3`/`turbo4` auto-enable flash_attention. | +| `cache_type_v` | string | Value cache quantization type. Maps to llama.cpp's `-ctv`. Same accepted values as `cache_type_k`. Note: any quantized V cache requires flash_attention to be enabled. | | `limit_mm_per_prompt` | object | Limit multimodal content per prompt: `{image: int, video: int, audio: int}` | ## Template Configuration diff --git a/docs/content/features/text-generation.md b/docs/content/features/text-generation.md index bfcbf650a..05efcc8fe 100644 --- a/docs/content/features/text-generation.md +++ b/docs/content/features/text-generation.md @@ -580,6 +580,57 @@ The aliases `ik-llama` and `ik_llama` are also accepted. - [ik_llama.cpp](https://github.com/ikawrakow/ik_llama.cpp) +### turboquant (llama.cpp fork with TurboQuant KV-cache) + +[llama-cpp-turboquant](https://github.com/TheTom/llama-cpp-turboquant) is a `llama.cpp` fork that adds the **TurboQuant KV-cache** quantization scheme. It reuses the upstream `llama.cpp` codebase and ships as a drop-in alternative backend inside LocalAI, sharing the same gRPC server sources as the stock `llama-cpp` backend — so any GGUF model that runs on `llama-cpp` also runs on `turboquant`. 
+ +You would pick `turboquant` when you want **smaller KV-cache memory pressure** (longer contexts on the same VRAM) or to experiment with the fork's quantized KV representations on top of the standard `cache_type_k` / `cache_type_v` knobs already supported by upstream `llama.cpp`. + +#### Features + +- Drop-in GGUF compatibility with upstream `llama.cpp`. +- TurboQuant KV-cache quantization (see fork README for the current set of accepted `cache_type_k` / `cache_type_v` values). +- Same feature surface as the `llama-cpp` backend: text generation, embeddings, tool calls, multimodal via mmproj. +- Available on CPU (AVX/AVX2/AVX512/fallback), NVIDIA CUDA 12/13, AMD ROCm/HIP, Intel SYCL f32/f16, Vulkan, and NVIDIA L4T. + +#### Setup + +`turboquant` ships as a separate container image in the LocalAI backend gallery. Install it like any other backend: + +```bash +local-ai backends install turboquant +``` + +Or pick a specific flavor for your hardware (example tags: `cpu-turboquant`, `cuda12-turboquant`, `cuda13-turboquant`, `rocm-turboquant`, `intel-sycl-f16-turboquant`, `vulkan-turboquant`). + +#### YAML configuration + +To run a model with `turboquant`, set the backend in your model YAML and optionally pick quantized KV-cache types: + +```yaml +name: my-model +backend: turboquant +parameters: + # Relative to the models path + model: file.gguf +# Use TurboQuant's own KV-cache quantization schemes. The fork accepts +# the standard llama.cpp types (f16, f32, q8_0, q4_0, q4_1, q5_0, q5_1) +# and adds three TurboQuant-specific ones: turbo2, turbo3, turbo4. +# turbo3 / turbo4 auto-enable flash_attention (required for turbo K/V) +# and offer progressively more aggressive compression. +cache_type_k: turbo3 +cache_type_v: turbo3 +context_size: 8192 +``` + +The `cache_type_k` / `cache_type_v` fields map to llama.cpp's `-ctk` / `-ctv` flags. 
The stock `llama-cpp` backend only accepts the standard llama.cpp types — to use `turbo2` / `turbo3` / `turbo4` you need this `turboquant` backend, which is where the fork's TurboQuant code paths actually take effect. Pick `q8_0` here and you're just running stock llama.cpp KV quantization; pick `turbo*` and you're running TurboQuant. + +#### Reference + +- [llama-cpp-turboquant](https://github.com/TheTom/llama-cpp-turboquant) +- [Tracked branch: `feature/turboquant-kv-cache`](https://github.com/TheTom/llama-cpp-turboquant/tree/feature/turboquant-kv-cache) + + ### vLLM [vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference. diff --git a/scripts/changed-backends.js b/scripts/changed-backends.js index 4ef8b1874..7ad770af9 100644 --- a/scripts/changed-backends.js +++ b/scripts/changed-backends.js @@ -27,6 +27,11 @@ function inferBackendPath(item) { if (item.dockerfile.endsWith("ik-llama-cpp")) { return `backend/cpp/ik-llama-cpp/`; } + if (item.dockerfile.endsWith("turboquant")) { + // turboquant is a llama.cpp fork that reuses backend/cpp/llama-cpp sources + // via a thin wrapper Makefile. Changes to either dir should retrigger it. + return `backend/cpp/turboquant/`; + } if (item.dockerfile.endsWith("llama-cpp")) { return `backend/cpp/llama-cpp/`; } @@ -132,7 +137,12 @@ async function getChangedFiles() { // Per-backend boolean outputs for (const [backend, pathPrefix] of allBackendPaths) { - const changed = changedFiles.some(file => file.startsWith(pathPrefix)); + let changed = changedFiles.some(file => file.startsWith(pathPrefix)); + // turboquant reuses backend/cpp/llama-cpp sources via a thin wrapper; + // changes to either directory should retrigger its pipeline. + if (backend === "turboquant" && !changed) { + changed = changedFiles.some(file => file.startsWith("backend/cpp/llama-cpp/")); + } fs.appendFileSync(process.env.GITHUB_OUTPUT, `${backend}=${changed ? 
'true' : 'false'}\n`); } })(); diff --git a/tests/e2e-backends/backend_test.go b/tests/e2e-backends/backend_test.go index 1350b4ae7..6b1eb49ec 100644 --- a/tests/e2e-backends/backend_test.go +++ b/tests/e2e-backends/backend_test.go @@ -56,6 +56,9 @@ import ( // BACKEND_TEST_THREADS Override Threads passed to LoadModel (default 4). // BACKEND_TEST_OPTIONS Comma-separated Options[] entries passed to LoadModel, // e.g. "tool_parser:hermes,reasoning_parser:qwen3". +// BACKEND_TEST_CACHE_TYPE_K Sets ModelOptions.CacheTypeKey (llama.cpp -ctk), +// e.g. "q8_0" — exercises KV-cache quantization code paths. +// BACKEND_TEST_CACHE_TYPE_V Sets ModelOptions.CacheTypeValue (llama.cpp -ctv). // BACKEND_TEST_TOOL_PROMPT Override the user prompt for the tools spec // (default: "What's the weather like in Paris, France?"). // BACKEND_TEST_TOOL_NAME Override the function name expected in the tool call @@ -265,15 +268,17 @@ var _ = Describe("Backend container", Ordered, func() { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) defer cancel() res, err := client.LoadModel(ctx, &pb.ModelOptions{ - Model: modelRef, - ModelFile: modelPath, - ContextSize: ctxSize, - Threads: threads, - NGPULayers: 0, - MMap: true, - NBatch: 128, - Options: options, - MMProj: mmprojFile, + Model: modelRef, + ModelFile: modelPath, + ContextSize: ctxSize, + Threads: threads, + NGPULayers: 0, + MMap: true, + NBatch: 128, + Options: options, + MMProj: mmprojFile, + CacheTypeKey: os.Getenv("BACKEND_TEST_CACHE_TYPE_K"), + CacheTypeValue: os.Getenv("BACKEND_TEST_CACHE_TYPE_V"), }) Expect(err).NotTo(HaveOccurred()) Expect(res.GetSuccess()).To(BeTrue(), "LoadModel failed: %s", res.GetMessage())