From 9ca03cf9ccd6ae86795a216c4f377da2290204ca Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 12 Apr 2026 13:51:28 +0200 Subject: [PATCH] feat(backends): add ik-llama-cpp (#9326) * feat(backends): add ik-llama-cpp Signed-off-by: Ettore Di Giacinto * chore: add grpc e2e suite, hook to CI, update README Signed-off-by: Ettore Di Giacinto * Apply suggestion from @mudler Signed-off-by: Ettore Di Giacinto * Apply suggestion from @mudler Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Ettore Di Giacinto Signed-off-by: Ettore Di Giacinto --- .github/workflows/backend.yml | 13 + .github/workflows/bump_deps.yaml | 4 + .github/workflows/test-extra.yml | 36 + Makefile | 46 +- backend/Dockerfile.ik-llama-cpp | 281 ++ backend/cpp/ik-llama-cpp/CMakeLists.txt | 78 + backend/cpp/ik-llama-cpp/Makefile | 167 ++ backend/cpp/ik-llama-cpp/grpc-server.cpp | 2652 +++++++++++++++++ backend/cpp/ik-llama-cpp/package.sh | 58 + .../0001-fix-missing-cstdint-include.patch | 10 + backend/cpp/ik-llama-cpp/prepare.sh | 49 + backend/cpp/ik-llama-cpp/run.sh | 40 + backend/cpp/ik-llama-cpp/utils.hpp | 483 +++ backend/index.yaml | 29 + docs/content/features/text-generation.md | 41 + docs/content/reference/compatibility-table.md | 1 + pkg/model/initializers.go | 5 +- scripts/changed-backends.js | 3 + tests/e2e-backends/backend_test.go | 342 +++ tests/e2e-backends/suite_test.go | 24 + 20 files changed, 4360 insertions(+), 2 deletions(-) create mode 100644 backend/Dockerfile.ik-llama-cpp create mode 100644 backend/cpp/ik-llama-cpp/CMakeLists.txt create mode 100644 backend/cpp/ik-llama-cpp/Makefile create mode 100644 backend/cpp/ik-llama-cpp/grpc-server.cpp create mode 100644 backend/cpp/ik-llama-cpp/package.sh create mode 100644 backend/cpp/ik-llama-cpp/patches/0001-fix-missing-cstdint-include.patch create mode 100644 backend/cpp/ik-llama-cpp/prepare.sh create mode 100644 backend/cpp/ik-llama-cpp/run.sh create mode 100644 backend/cpp/ik-llama-cpp/utils.hpp create mode 100644 
tests/e2e-backends/backend_test.go create mode 100644 tests/e2e-backends/suite_test.go diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index e88495d0b..d89ee06bf 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -1945,6 +1945,19 @@ jobs: dockerfile: "./backend/Dockerfile.llama-cpp" context: "./" ubuntu-version: '2404' + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-cpu-ik-llama-cpp' + runs-on: 'bigger-runner' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "ik-llama-cpp" + dockerfile: "./backend/Dockerfile.ik-llama-cpp" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "12" cuda-minor-version: "0" diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml index 211d3e4ab..0e3dd8d96 100644 --- a/.github/workflows/bump_deps.yaml +++ b/.github/workflows/bump_deps.yaml @@ -14,6 +14,10 @@ jobs: variable: "LLAMA_VERSION" branch: "master" file: "backend/cpp/llama-cpp/Makefile" + - repository: "ikawrakow/ik_llama.cpp" + variable: "IK_LLAMA_VERSION" + branch: "main" + file: "backend/cpp/ik-llama-cpp/Makefile" - repository: "ggml-org/whisper.cpp" variable: "WHISPER_CPP_VERSION" branch: "master" diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml index 0992dfdd9..6b590d156 100644 --- a/.github/workflows/test-extra.yml +++ b/.github/workflows/test-extra.yml @@ -29,6 +29,8 @@ jobs: nemo: ${{ steps.detect.outputs.nemo }} voxcpm: ${{ steps.detect.outputs.voxcpm }} llama-cpp-quantization: ${{ steps.detect.outputs.llama-cpp-quantization }} + llama-cpp: ${{ steps.detect.outputs.llama-cpp }} + ik-llama-cpp: ${{ steps.detect.outputs.ik-llama-cpp }} acestep-cpp: ${{ steps.detect.outputs.acestep-cpp }} qwen3-tts-cpp: ${{ steps.detect.outputs.qwen3-tts-cpp }} voxtral: ${{ steps.detect.outputs.voxtral }} @@ -465,6 +467,40 @@ jobs: - name: Test 
llama-cpp-quantization run: | make --jobs=5 --output-sync=target -C backend/python/llama-cpp-quantization test + tests-llama-cpp-grpc: + needs: detect-changes + if: needs.detect-changes.outputs.llama-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true' + runs-on: ubuntu-latest + timeout-minutes: 90 + steps: + - name: Clone + uses: actions/checkout@v6 + with: + submodules: true + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version: '1.25.4' + - name: Build llama-cpp backend image and run gRPC e2e tests + run: | + make test-extra-backend-llama-cpp + tests-ik-llama-cpp-grpc: + needs: detect-changes + if: needs.detect-changes.outputs.ik-llama-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true' + runs-on: ubuntu-latest + timeout-minutes: 90 + steps: + - name: Clone + uses: actions/checkout@v6 + with: + submodules: true + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version: '1.25.4' + - name: Build ik-llama-cpp backend image and run gRPC e2e tests + run: | + make test-extra-backend-ik-llama-cpp tests-acestep-cpp: needs: detect-changes if: needs.detect-changes.outputs.acestep-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true' diff --git a/Makefile b/Makefile index 53457b531..6dce83efd 100644 --- a/Makefile +++ b/Makefile @@ -456,6 +456,47 @@ test-extra: prepare-test-extra $(MAKE) -C backend/python/trl test $(MAKE) -C backend/rust/kokoros test +## +## End-to-end gRPC tests that exercise a built backend container image. +## +## The test suite in tests/e2e-backends is backend-agnostic. You drive it via env +## vars (see tests/e2e-backends/backend_test.go for the full list) and the +## capability-driven harness picks which gRPC RPCs to exercise: +## +## BACKEND_IMAGE Required. Docker image to test, e.g. local-ai-backend:llama-cpp. +## BACKEND_TEST_MODEL_URL URL of a model file to download and load. +## BACKEND_TEST_MODEL_FILE Path to an already-downloaded model (skips download). 
+## BACKEND_TEST_CAPS Comma-separated capabilities, default "health,load,predict,stream". +## BACKEND_TEST_PROMPT Override the prompt used in predict/stream specs. +## +## Direct usage (image already built, no docker-build-* dependency): +## +## make test-extra-backend BACKEND_IMAGE=local-ai-backend:llama-cpp \ +## BACKEND_TEST_MODEL_URL=https://.../model.gguf +## +## Convenience wrappers below build a specific backend image first, then run the +## suite against it. +## +BACKEND_TEST_MODEL_URL?=https://huggingface.co/Qwen/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q8_0.gguf + +## Generic target — runs the suite against whatever BACKEND_IMAGE points at. +## Depends on protogen-go so pkg/grpc/proto is generated before `go test`. +test-extra-backend: protogen-go + @test -n "$$BACKEND_IMAGE" || { echo "BACKEND_IMAGE must be set" >&2; exit 1; } + BACKEND_IMAGE="$$BACKEND_IMAGE" \ + BACKEND_TEST_MODEL_URL="$${BACKEND_TEST_MODEL_URL:-$(BACKEND_TEST_MODEL_URL)}" \ + BACKEND_TEST_MODEL_FILE="$$BACKEND_TEST_MODEL_FILE" \ + BACKEND_TEST_CAPS="$$BACKEND_TEST_CAPS" \ + BACKEND_TEST_PROMPT="$$BACKEND_TEST_PROMPT" \ + go test -v -timeout 15m ./tests/e2e-backends/... + +## Convenience wrappers: build the image, then exercise it. 
+test-extra-backend-llama-cpp: docker-build-llama-cpp + BACKEND_IMAGE=local-ai-backend:llama-cpp $(MAKE) test-extra-backend + +test-extra-backend-ik-llama-cpp: docker-build-ik-llama-cpp + BACKEND_IMAGE=local-ai-backend:ik-llama-cpp $(MAKE) test-extra-backend + DOCKER_IMAGE?=local-ai IMAGE_TYPE?=core BASE_IMAGE?=ubuntu:24.04 @@ -549,6 +590,8 @@ backend-images: # Backend metadata: BACKEND_NAME | DOCKERFILE_TYPE | BUILD_CONTEXT | PROGRESS_FLAG | NEEDS_BACKEND_ARG # llama-cpp is special - uses llama-cpp Dockerfile and doesn't need BACKEND arg BACKEND_LLAMA_CPP = llama-cpp|llama-cpp|.|false|false +# ik-llama-cpp is a fork of llama.cpp with superior CPU performance +BACKEND_IK_LLAMA_CPP = ik-llama-cpp|ik-llama-cpp|.|false|false # Golang backends BACKEND_PIPER = piper|golang|.|false|true @@ -619,6 +662,7 @@ endef # Generate all docker-build targets $(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP))) +$(eval $(call generate-docker-build-target,$(BACKEND_IK_LLAMA_CPP))) $(eval $(call generate-docker-build-target,$(BACKEND_PIPER))) $(eval $(call generate-docker-build-target,$(BACKEND_LOCAL_STORE))) $(eval $(call generate-docker-build-target,$(BACKEND_HUGGINGFACE))) @@ -663,7 +707,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_SAM3_CPP))) docker-save-%: backend-images docker save local-ai-backend:$* -o backend-images/$*.tar -docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl 
docker-build-llama-cpp-quantization docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp +docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp ######################################################## ### Mock Backend for E2E Tests diff --git a/backend/Dockerfile.ik-llama-cpp b/backend/Dockerfile.ik-llama-cpp new file mode 100644 index 000000000..62ae52841 --- /dev/null +++ b/backend/Dockerfile.ik-llama-cpp @@ -0,0 +1,281 @@ +ARG BASE_IMAGE=ubuntu:24.04 +ARG GRPC_BASE_IMAGE=${BASE_IMAGE} + + +# The grpc target does one thing, it builds and installs GRPC. This is in it's own layer so that it can be effectively cached by CI. +# You probably don't need to change anything here, and if you do, make sure that CI is adjusted so that the cache continues to work. 
+FROM ${GRPC_BASE_IMAGE} AS grpc + +# This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI +ARG GRPC_MAKEFLAGS="-j4 -Otarget" +ARG GRPC_VERSION=v1.65.0 +ARG CMAKE_FROM_SOURCE=false +# CUDA Toolkit 13.x compatibility: CMake 3.31.9+ fixes toolchain detection/arch table issues +ARG CMAKE_VERSION=3.31.10 + +ENV MAKEFLAGS=${GRPC_MAKEFLAGS} + +WORKDIR /build + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + build-essential curl libssl-dev \ + git wget && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Install CMake (the version in 22.04 is too old) +RUN </dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1) +ARCH?=$(shell uname -m) + +# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static +CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF + +CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +ifeq ($(NATIVE),false) + CMAKE_ARGS+=-DGGML_NATIVE=OFF -DLLAMA_OPENSSL=OFF +endif +# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically +ifeq ($(BUILD_TYPE),cublas) + CMAKE_ARGS+=-DGGML_CUDA=ON +# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS +# to CMAKE_ARGS automatically +else ifeq ($(BUILD_TYPE),openblas) + CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS +# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path +else ifeq ($(BUILD_TYPE),clblas) + CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path +# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ +else ifeq ($(BUILD_TYPE),hipblas) + ROCM_HOME ?= /opt/rocm + ROCM_PATH ?= /opt/rocm + export CXX=$(ROCM_HOME)/llvm/bin/clang++ + export CC=$(ROCM_HOME)/llvm/bin/clang + AMDGPU_TARGETS?=gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201 + CMAKE_ARGS+=-DGGML_HIP=ON 
-DAMDGPU_TARGETS=$(AMDGPU_TARGETS) +else ifeq ($(BUILD_TYPE),vulkan) + CMAKE_ARGS+=-DGGML_VULKAN=1 +else ifeq ($(OS),Darwin) + ifeq ($(BUILD_TYPE),) + BUILD_TYPE=metal + endif + ifneq ($(BUILD_TYPE),metal) + CMAKE_ARGS+=-DGGML_METAL=OFF + else + CMAKE_ARGS+=-DGGML_METAL=ON + CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON + CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON + CMAKE_ARGS+=-DGGML_OPENMP=OFF + endif + TARGET+=--target ggml-metal +endif + +ifeq ($(BUILD_TYPE),sycl_f16) + CMAKE_ARGS+=-DGGML_SYCL=ON \ + -DCMAKE_C_COMPILER=icx \ + -DCMAKE_CXX_COMPILER=icpx \ + -DCMAKE_CXX_FLAGS="-fsycl" \ + -DGGML_SYCL_F16=ON +endif + +ifeq ($(BUILD_TYPE),sycl_f32) + CMAKE_ARGS+=-DGGML_SYCL=ON \ + -DCMAKE_C_COMPILER=icx \ + -DCMAKE_CXX_COMPILER=icpx \ + -DCMAKE_CXX_FLAGS="-fsycl" +endif + +INSTALLED_PACKAGES=$(CURDIR)/../grpc/installed_packages +INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake +ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \ + -DProtobuf_DIR=${INSTALLED_LIB_CMAKE}/protobuf \ + -Dutf8_range_DIR=${INSTALLED_LIB_CMAKE}/utf8_range \ + -DgRPC_DIR=${INSTALLED_LIB_CMAKE}/grpc \ + -DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=${INSTALLED_PACKAGES}/include +build-ik-llama-cpp-grpc-server: +# Conditionally build grpc for the backend to use if needed +ifdef BUILD_GRPC_FOR_BACKEND_LLAMA + $(MAKE) -C ../../grpc build + _PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/proto \ + _GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin \ + PATH="${INSTALLED_PACKAGES}/bin:${PATH}" \ + CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" \ + IK_LLAMA_VERSION=$(IK_LLAMA_VERSION) \ + $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(VARIANT) grpc-server +else + echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined." 
+ IK_LLAMA_VERSION=$(IK_LLAMA_VERSION) $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(VARIANT) grpc-server +endif + +ik-llama-cpp-avx2: llama.cpp + cp -rf $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx2-build + $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx2-build purge + $(info ${GREEN}I ik-llama-cpp build info:avx2${RESET}) + CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="ik-llama-cpp-avx2-build" build-ik-llama-cpp-grpc-server + cp -rfv $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx2-build/grpc-server ik-llama-cpp-avx2 + +ik-llama-cpp-avx512: llama.cpp + cp -rf $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx512-build + $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx512-build purge + $(info ${GREEN}I ik-llama-cpp build info:avx512${RESET}) + CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="ik-llama-cpp-avx512-build" build-ik-llama-cpp-grpc-server + cp -rfv $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx512-build/grpc-server ik-llama-cpp-avx512 + +ik-llama-cpp-avx: llama.cpp + cp -rf $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx-build + $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx-build purge + $(info ${GREEN}I ik-llama-cpp build info:avx${RESET}) + CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="ik-llama-cpp-avx-build" build-ik-llama-cpp-grpc-server + cp -rfv $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx-build/grpc-server ik-llama-cpp-avx + +ik-llama-cpp-fallback: llama.cpp + cp -rf $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-fallback-build + $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-fallback-build purge + $(info ${GREEN}I ik-llama-cpp build info:fallback${RESET}) + 
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="ik-llama-cpp-fallback-build" build-ik-llama-cpp-grpc-server + cp -rfv $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-fallback-build/grpc-server ik-llama-cpp-fallback + +ik-llama-cpp-grpc: llama.cpp + cp -rf $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-grpc-build + $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-grpc-build purge + $(info ${GREEN}I ik-llama-cpp build info:grpc${RESET}) + CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="ik-llama-cpp-grpc-build" build-ik-llama-cpp-grpc-server + cp -rfv $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-grpc-build/grpc-server ik-llama-cpp-grpc + +ik-llama-cpp-rpc-server: ik-llama-cpp-grpc + cp -rf $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-grpc-build/llama.cpp/build/bin/rpc-server ik-llama-cpp-rpc-server + +llama.cpp: + mkdir -p llama.cpp + cd llama.cpp && \ + git init && \ + git remote add origin $(LLAMA_REPO) && \ + git fetch origin && \ + git checkout -b build $(IK_LLAMA_VERSION) && \ + git submodule update --init --recursive --depth 1 --single-branch + +llama.cpp/examples/grpc-server: llama.cpp + mkdir -p llama.cpp/examples/grpc-server + bash prepare.sh + +rebuild: + bash prepare.sh + rm -rf grpc-server + $(MAKE) grpc-server + +package: + bash package.sh + +purge: + rm -rf llama.cpp/build + rm -rf llama.cpp/examples/grpc-server + rm -rf grpc-server + +clean: purge + rm -rf llama.cpp + +grpc-server: llama.cpp llama.cpp/examples/grpc-server + @echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)" +ifneq (,$(findstring sycl,$(BUILD_TYPE))) + +bash -c "source $(ONEAPI_VARS); \ + cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . 
--config Release -j $(JOBS) $(TARGET)" +else + +cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release -j $(JOBS) $(TARGET) +endif + cp llama.cpp/build/bin/grpc-server . diff --git a/backend/cpp/ik-llama-cpp/grpc-server.cpp b/backend/cpp/ik-llama-cpp/grpc-server.cpp new file mode 100644 index 000000000..3e88022dc --- /dev/null +++ b/backend/cpp/ik-llama-cpp/grpc-server.cpp @@ -0,0 +1,2652 @@ +// ik_llama.cpp gRPC C++ backend server +// +// Ettore Di Giacinto and llama.cpp authors +// +// This is a gRPC server for ik_llama.cpp compatible with the LocalAI proto +// Note: this is a re-adaptation of the original llama.cpp example/server.cpp for HTTP (https://github.com/ggerganov/llama.cpp/tree/master/examples/server), +// but modified to work with gRPC +// + +#include +#include +#include +#include +#include "clip.h" +#include "llava.h" +#include "log.h" +#include "common.h" +#include "json.hpp" +#include "llama.h" +#include "backend.pb.h" +#include "backend.grpc.pb.h" +#include "utils.hpp" +#include "sampling.h" +// include std::regex +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using grpc::Server; +using grpc::ServerBuilder; +using grpc::ServerContext; +using grpc::Status; + + +using backend::HealthMessage; + + +///// LLAMA.CPP server code below + +using json = nlohmann::json; + +struct server_params +{ + std::string hostname = "127.0.0.1"; + std::vector api_keys; + std::string public_path = "examples/server/public"; + std::string chat_template = ""; + int32_t port = 8080; + int32_t read_timeout = 600; + int32_t write_timeout = 600; + bool slots_endpoint = true; + bool metrics_endpoint = false; +}; + +bool server_verbose = false; +bool server_log_json = true; + +static size_t common_part(const std::vector &a, const std::vector &b) +{ + size_t i; + for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) + { + } + return i; +} + +enum stop_type +{ 
+ STOP_FULL, + STOP_PARTIAL, +}; + +static bool ends_with(const std::string &str, const std::string &suffix) +{ + return str.size() >= suffix.size() && + 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); +} + +static size_t find_partial_stop_string(const std::string &stop, + const std::string &text) +{ + if (!text.empty() && !stop.empty()) + { + const char text_last_char = text.back(); + for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) + { + if (stop[char_index] == text_last_char) + { + const std::string current_partial = stop.substr(0, char_index + 1); + if (ends_with(text, current_partial)) + { + return text.size() - char_index - 1; + } + } + } + } + return std::string::npos; +} + +// TODO: reuse llama_detokenize +template +static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) +{ + std::string ret; + for (; begin != end; ++begin) + { + ret += common_token_to_piece(ctx, *begin); + } + return ret; +} + +// format incomplete utf-8 multibyte character for output +static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) +{ + std::string out = token == -1 ? "" : common_token_to_piece(ctx, token); + // if the size is 1 and first bit is 1, meaning it's a partial character + // (size > 1 meaning it's already a known token) + if (out.size() == 1 && (out[0] & 0x80) == 0x80) + { + std::stringstream ss; + ss << std::hex << (out[0] & 0xff); + std::string res(ss.str()); + out = "byte: \\x" + res; + } + return out; +} + +// Adds an RPC server +// NOTE: RPC device API is not available in ik_llama.cpp -- this function is a no-op stub. 
+static void add_rpc_devices(std::string servers) { + LOG_WARNING("RPC devices are not supported in ik_llama.cpp, ignoring LLAMACPP_GRPC_SERVERS", {}); +} + +// convert a vector of completion_token_output to json +static json probs_vector_to_json(const llama_context *ctx, const std::vector &probs) +{ + json out = json::array(); + for (const auto &prob : probs) + { + json probs_for_token = json::array(); + for (const auto &p : prob.probs) + { + std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); + probs_for_token.push_back(json + { + {"tok_str", tok_str}, + {"prob", p.prob}, + }); + } + std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); + out.push_back(json{ + {"content", tok_str}, + {"probs", probs_for_token}, + }); + } + return out; +} + +struct llama_client_slot +{ + int id; + int task_id = -1; + + struct slot_params params; + + slot_state state = IDLE; + slot_command command = NONE; + + // used to determine the slot that has been used the longest + int64_t t_last_used = -1; + + // generation props + int32_t n_ctx = 0; // context size per slot + int32_t n_past = 0; + int32_t n_decoded = 0; + int32_t n_remaining = -1; + int32_t i_batch = -1; + int32_t n_predict = -1; + + int32_t num_prompt_tokens = 0; + int32_t num_prompt_tokens_processed = 0; + + json prompt; + std::string generated_text; + llama_token sampled; + std::vector cache_tokens; + std::vector generated_token_probs; + + bool infill = false; + bool embedding = false; + bool has_next_token = true; + bool truncated = false; + bool stopped_eos = false; + bool stopped_word = false; + bool stopped_limit = false; + + bool oaicompat = false; + std::string oaicompat_model; + + std::string stopping_word; + + // sampling + struct common_params_sampling sparams; + common_sampler *ctx_sampling = nullptr; + + int32_t ga_i = 0; // group-attention state + int32_t ga_n = 1; // group-attention factor + int32_t ga_w = 512; // group-attention width + + int32_t n_past_se = 0; // 
self-extend + + // multimodal + std::vector images; + + // stats + size_t sent_count = 0; + size_t sent_token_probs_index = 0; + + int64_t t_start_process_prompt; + int64_t t_start_genereration; + + double t_prompt_processing; // ms + double t_token_generation; // ms + + // multitasks + int multitask_id = -1; + + void reset() { + num_prompt_tokens = 0; + generated_text = ""; + truncated = false; + stopped_eos = false; + stopped_word = false; + stopped_limit = false; + stopping_word = ""; + n_past = 0; + sent_count = 0; + sent_token_probs_index = 0; + infill = false; + ga_i = 0; + n_past_se = 0; + + generated_token_probs.clear(); + + for (slot_image & img : images) + { + free(img.image_embedding); + if (img.img_data) { + clip_image_u8_free(img.img_data); + } + img.prefix_prompt = ""; + } + + images.clear(); + } + + bool has_budget(gpt_params &global_params) { + if (params.n_predict == -1 && global_params.n_predict == -1) + { + return true; // limitless + } + + n_remaining = -1; + + if (params.n_predict != -1) + { + n_remaining = params.n_predict - n_decoded; + } + else if (global_params.n_predict != -1) + { + n_remaining = global_params.n_predict - n_decoded; + } + + return n_remaining > 0; // no budget + } + + bool available() const { + return state == IDLE && command == NONE; + } + + bool is_processing() const { + return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING; + } + + void add_token_string(const completion_token_output &token) { + if (command == RELEASE) + { + return; + } + cache_tokens.push_back(token.tok); + generated_token_probs.push_back(token); + } + + void release() { + if (state == PROCESSING) + { + t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3; + command = RELEASE; + } + } + + json get_formated_timings() { + return json + { + {"prompt_n", num_prompt_tokens_processed}, + {"prompt_ms", t_prompt_processing}, + {"prompt_per_token_ms", t_prompt_processing / num_prompt_tokens_processed}, + {"prompt_per_second", 1e3 
/ t_prompt_processing * num_prompt_tokens_processed}, + + {"predicted_n", n_decoded}, + {"predicted_ms", t_token_generation}, + {"predicted_per_token_ms", t_token_generation / n_decoded}, + {"predicted_per_second", 1e3 / t_token_generation * n_decoded}, + }; + } + + void print_timings() const { + char buffer[512]; + double t_token = t_prompt_processing / num_prompt_tokens_processed; + double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed; + sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)", + t_prompt_processing, num_prompt_tokens_processed, + t_token, n_tokens_second); + LOG_INFO(buffer, { + {"slot_id", id}, + {"task_id", task_id}, + {"t_prompt_processing", t_prompt_processing}, + {"num_prompt_tokens_processed", num_prompt_tokens_processed}, + {"t_token", t_token}, + {"n_tokens_second", n_tokens_second}, + }); + + t_token = t_token_generation / n_decoded; + n_tokens_second = 1e3 / t_token_generation * n_decoded; + sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)", + t_token_generation, n_decoded, + t_token, n_tokens_second); + LOG_INFO(buffer, { + {"slot_id", id}, + {"task_id", task_id}, + {"t_token_generation", t_token_generation}, + {"n_decoded", n_decoded}, + {"t_token", t_token}, + {"n_tokens_second", n_tokens_second}, + }); + + sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation); + LOG_INFO(buffer, { + {"slot_id", id}, + {"task_id", task_id}, + {"t_prompt_processing", t_prompt_processing}, + {"t_token_generation", t_token_generation}, + {"t_total", t_prompt_processing + t_token_generation}, + }); + } +}; + +struct llama_metrics { + uint64_t n_prompt_tokens_processed_total = 0; + uint64_t n_tokens_predicted_total = 0; + + uint64_t n_prompt_tokens_processed = 0; + uint64_t t_prompt_processing = 0; + + uint64_t n_tokens_predicted = 0; + uint64_t t_tokens_generation = 0; + + + void 
on_prompt_eval(const llama_client_slot &slot) { + n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed; + + n_prompt_tokens_processed += slot.num_prompt_tokens_processed; + t_prompt_processing += slot.t_prompt_processing; + } + + void on_prediction(const llama_client_slot &slot) { + n_tokens_predicted_total += slot.n_decoded; + + n_tokens_predicted += slot.n_decoded; + t_tokens_generation += slot.t_token_generation; + } + + void reset_bucket() { + n_prompt_tokens_processed = 0; + t_prompt_processing = 0; + n_tokens_predicted = 0; + t_tokens_generation = 0; + } +}; + +struct llava_embd_batch { + std::vector pos; + std::vector n_seq_id; + std::vector seq_id_0; + std::vector seq_ids; + std::vector logits; + llama_batch batch; + llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { + pos .resize(n_tokens); + n_seq_id.resize(n_tokens); + seq_ids .resize(n_tokens + 1); + logits .resize(n_tokens); + seq_id_0.resize(1); + seq_id_0[0] = seq_id; + seq_ids [n_tokens] = nullptr; + batch = { + /*n_tokens =*/ n_tokens, + /*tokens =*/ nullptr, + /*embd =*/ embd, + /*pos =*/ pos.data(), + /*n_seq_id =*/ n_seq_id.data(), + /*seq_id =*/ seq_ids.data(), + /*logits =*/ logits.data(), + }; + for (int i = 0; i < n_tokens; i++) { + batch.pos [i] = pos_0 + i; + batch.n_seq_id[i] = 1; + batch.seq_id [i] = seq_id_0.data(); + batch.logits [i] = false; + } + } +}; + +struct llama_server_context +{ + llama_model *model = nullptr; + llama_context *ctx = nullptr; + const llama_vocab * vocab = nullptr; + + clip_ctx *clp_ctx = nullptr; + + gpt_params params; + + llama_batch batch; + + bool multimodal = false; + bool clean_kv_cache = true; + bool all_slots_are_idle = false; + bool add_bos_token = true; + bool has_eos_token = true; + bool has_gpu = false; + + bool grammar_lazy = false; + std::vector grammar_triggers; + + int32_t n_ctx; // total context for all clients / slots + + // system prompt + bool system_need_update = false; + + std::string 
system_prompt; + std::vector system_tokens; + + std::string name_user; // this should be the antiprompt + std::string name_assistant; + + // slots / clients + std::vector slots; + json default_generation_settings_for_props; + + llama_server_queue queue_tasks; + llama_server_response queue_results; + + llama_metrics metrics; + + ~llama_server_context() + { + if (ctx) + { + llama_free(ctx); + ctx = nullptr; + } + if (model) + { + llama_free_model(model); + model = nullptr; + } + } + + bool load_model(const gpt_params ¶ms_) + { + params = params_; + if (!params.mmproj.path.empty()) { + multimodal = true; + LOG_INFO("Multi Modal Mode Enabled", {}); + clp_ctx = clip_model_load(params.mmproj.path.c_str(), /*verbosity=*/ 1); + if(clp_ctx == nullptr) { + LOG_ERR("unable to load clip model: %s", params.mmproj.path.c_str()); + return false; + } + + if (params.n_ctx < 2048) { // request larger context for the image embedding + params.n_ctx = 2048; + } + } + + llama_init_result init_result = llama_init_from_gpt_params(params); + model = init_result.model; + ctx = init_result.context; + if (model == nullptr) + { + LOG_ERR("unable to load model: %s", params.model.c_str()); + return false; + } + + if (multimodal) { + const int n_embd_clip = clip_n_mmproj_embd(clp_ctx); + const int n_embd_llm = llama_model_n_embd(model); + if (n_embd_clip != n_embd_llm) { + LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). 
Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm); + llama_free(ctx); + llama_free_model(model); + return false; + } + } + + vocab = llama_model_get_vocab(model); + n_ctx = llama_n_ctx(ctx); + + add_bos_token = llama_vocab_get_add_bos(vocab); + has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL; + + return true; + } + + llama_client_slot* get_active_slot() { + for (llama_client_slot& slot : slots) { + // Check if the slot is currently processing + if (slot.is_processing()) { + return &slot; // Return the active slot + } + } + return nullptr; // No active slot found + } + + void initialize() { + // create slots + all_slots_are_idle = true; + + const int32_t n_ctx_slot = n_ctx / params.n_parallel; + + LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}}); + for (int i = 0; i < params.n_parallel; i++) + { + llama_client_slot slot; + + slot.id = i; + slot.n_ctx = n_ctx_slot; + slot.n_predict = params.n_predict; + + LOG_INFO("new slot", { + {"slot_id", slot.id}, + {"n_ctx_slot", slot.n_ctx} + }); + + const int ga_n = params.grp_attn_n; + const int ga_w = params.grp_attn_w; + + if (ga_n != 1) { + GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT + GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT + //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT + //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT + + LOG_INFO("slot self-extend", { + {"slot_id", slot.id}, + {"ga_n", ga_n}, + {"ga_w", ga_w} + }); + } + + slot.ga_i = 0; + slot.ga_n = ga_n; + slot.ga_w = ga_w; + + slot.reset(); + + slots.push_back(slot); + } + + default_generation_settings_for_props = get_formated_generation(slots.front()); + default_generation_settings_for_props["seed"] = -1; + + batch = llama_batch_init(n_ctx, 0, params.n_parallel); + } + + std::vector tokenize(const json & json_prompt, bool add_bos) const + { + // 
TODO: currently, we tokenize using special tokens by default + // this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216) + // but it's better compared to completely ignoring ChatML and other chat templates + const bool TMP_FORCE_SPECIAL = true; + + // If `add_bos` is true, we only add BOS, when json_prompt is a string, + // or the first element of the json_prompt array is a string. + std::vector prompt_tokens; + + if (json_prompt.is_array()) + { + bool first = true; + for (const auto& p : json_prompt) + { + if (p.is_string()) + { + auto s = p.template get(); + std::vector p; + if (first) + { + p = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL); + first = false; + } + else + { + p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL); + } + prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); + } + else + { + if (first) + { + first = false; + } + prompt_tokens.push_back(p.template get()); + } + } + } + else + { + auto s = json_prompt.template get(); + prompt_tokens = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL); + } + + return prompt_tokens; + } + + llama_client_slot* get_slot(int id) { + int64_t t_last = ggml_time_us(); + llama_client_slot *last_used = nullptr; + + for (llama_client_slot & slot : slots) + { + if (slot.id == id && slot.available()) + { + return &slot; + } + + if (slot.available() && slot.t_last_used < t_last) + { + last_used = &slot; + t_last = slot.t_last_used; + } + } + + return last_used; + } + + bool launch_slot_with_data(llama_client_slot* &slot, json data) { + slot_params default_params; + common_params_sampling default_sparams; + + slot->params.stream = json_value(data, "stream", false); + slot->params.cache_prompt = json_value(data, "cache_prompt", false); + slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict); + slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k); + slot->sparams.top_p = json_value(data, "top_p", 
default_sparams.top_p); + slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p); + slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p); + slot->sparams.temp = json_value(data, "temperature", default_sparams.temp); + slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range); + slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent); + slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n); + slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat); + slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq); + slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present); + slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); + slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); + slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); + slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); + slot->sparams.seed = json_value(data, "seed", default_sparams.seed); + slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); + slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); + slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); + slot->sparams.grammar_triggers = grammar_triggers; + slot->sparams.grammar_lazy = grammar_lazy; + + if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) { + // Might be better to reject the request with a 400 ? 
+ LOG_WARNING("Max tokens to predict exceeds server configuration", { + {"params.n_predict", slot->params.n_predict}, + {"slot.n_predict", slot->n_predict}, + }); + slot->params.n_predict = slot->n_predict; + } + + // infill + if (data.count("input_prefix") != 0) + { + slot->params.input_prefix = data["input_prefix"]; + } + else + { + slot->params.input_prefix = ""; + } + + + if (data.count("input_suffix") != 0) + { + slot->params.input_suffix = data["input_suffix"]; + } + else + { + slot->params.input_suffix = ""; + } + + if (data.count("prompt") != 0) + { + slot->prompt = data["prompt"]; + } + else + { + slot->prompt = ""; + } + + if (json_value(data, "ignore_eos", false) && has_eos_token) { + slot->sparams.logit_bias[llama_vocab_eos(vocab)] = -INFINITY; + } + /* + slot->sparams.penalty_prompt_tokens.clear(); + slot->sparams.use_penalty_prompt_tokens = false; + const auto &penalty_prompt = data.find("penalty_prompt"); + if (penalty_prompt != data.end()) + { + if (penalty_prompt->is_string()) + { + const auto penalty_prompt_string = penalty_prompt->get(); + auto penalty_tokens = llama_tokenize(model, penalty_prompt_string, false); + slot->sparams.penalty_prompt_tokens.swap(penalty_tokens); + if (slot->params.n_predict > 0) + { + slot->sparams.penalty_prompt_tokens.reserve(slot->sparams.penalty_prompt_tokens.size() + slot->params.n_predict); + } + slot->sparams.use_penalty_prompt_tokens = true; + } + else if (penalty_prompt->is_array()) + { + const auto n_tokens = penalty_prompt->size(); + slot->sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot->params.n_predict)); + const int n_vocab = llama_n_vocab(model); + for (const auto &penalty_token : *penalty_prompt) + { + if (penalty_token.is_number_integer()) + { + const auto tok = penalty_token.get(); + if (tok >= 0 && tok < n_vocab) + { + slot->sparams.penalty_prompt_tokens.push_back(tok); + } + } + } + slot->sparams.use_penalty_prompt_tokens = true; + } + } + */ + slot->sparams.logit_bias.clear(); + + 
const auto &logit_bias = data.find("logit_bias"); + if (logit_bias != data.end() && logit_bias->is_array()) + { + const llama_vocab * vocab = llama_model_get_vocab(model); + const int n_vocab = llama_vocab_n_tokens(vocab); + for (const auto &el : *logit_bias) + { + if (el.is_array() && el.size() == 2) + { + float bias; + if (el[1].is_number()) + { + bias = el[1].get(); + } + else if (el[1].is_boolean() && !el[1].get()) + { + bias = -INFINITY; + } + else + { + continue; + } + + if (el[0].is_number_integer()) + { + llama_token tok = el[0].get(); + if (tok >= 0 && tok < n_vocab) + { + slot->sparams.logit_bias[tok] = bias; + } + } + else if (el[0].is_string()) + { + auto toks = common_tokenize(ctx, el[0].get(), false, false); + for (auto tok : toks) + { + slot->sparams.logit_bias[tok] = bias; + } + } + } + } + } + + slot->params.antiprompt.clear(); + + const auto &stop = data.find("stop"); + if (stop != data.end() && stop->is_array()) + { + for (const auto &word : *stop) + { + if (!word.empty()) + { + slot->params.antiprompt.push_back(word); + } + } + } + + const auto & samplers = data.find("samplers"); + if (samplers != data.end() && samplers->is_array()) { + std::vector sampler_names; + for (const auto & name : *samplers) { + if (name.is_string()) { + sampler_names.emplace_back(name); + } + } + slot->sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false); + } + else + { + slot->sparams.samplers_sequence = default_sparams.samplers_sequence; + } + + + if (multimodal) + { + const auto &images_data = data.find("image_data"); + if (images_data != data.end() && images_data->is_array()) + { + for (const auto &img : *images_data) + { + const std::vector image_buffer = base64_decode(img["data"].get()); + + slot_image img_sl; + img_sl.id = img.count("id") != 0 ? 
img["id"].get() : slot->images.size(); + img_sl.img_data = clip_image_u8_init(); + if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data)) + { + LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d", + __func__, + slot->id, + img_sl.id + ); + return false; + } + LOG_VERBOSE("image loaded", { + {"slot_id", slot->id}, + {"img_sl_id", img_sl.id} + }); + img_sl.request_encode_image = true; + slot->images.push_back(img_sl); + } + // process prompt + // example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]} + if (slot->images.size() > 0 && !slot->prompt.is_array()) + { + std::string prompt = slot->prompt.get(); + size_t pos = 0, begin_prefix = 0; + std::string pattern = "[img-"; + while ((pos = prompt.find(pattern, pos)) != std::string::npos) { + size_t end_prefix = pos; + pos += pattern.length(); + size_t end_pos = prompt.find(']', pos); + if (end_pos != std::string::npos) + { + std::string image_id = prompt.substr(pos, end_pos - pos); + try + { + int img_id = std::stoi(image_id); + bool found = false; + for (slot_image &img : slot->images) + { + if (img.id == img_id) { + found = true; + img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix); + begin_prefix = end_pos + 1; + break; + } + } + if (!found) { + LOG("ERROR: Image with id: %i, not found.\n", img_id); + slot->images.clear(); + return false; + } + } catch (const std::invalid_argument& e) { + LOG("Invalid image number id in prompt\n"); + slot->images.clear(); + return false; + } + } + } + slot->prompt = ""; + slot->params.input_suffix = prompt.substr(begin_prefix); + slot->params.cache_prompt = false; // multimodal doesn't support cache prompt + } + } + } + + if (slot->ctx_sampling != nullptr) + { + common_sampler_free(slot->ctx_sampling); + } + slot->ctx_sampling = common_sampler_init(model, slot->sparams); + //llama_set_rng_seed(ctx, 
slot->params.seed); + slot->command = LOAD_PROMPT; + + all_slots_are_idle = false; + + LOG_INFO("slot is processing task", { + {"slot_id", slot->id}, + {"task_id", slot->task_id}, + }); + + // LOG("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str()); + + return true; + } + + void kv_cache_clear() { + // clear the entire KV cache + llama_kv_cache_clear(ctx); + clean_kv_cache = false; + } + + void update_system_prompt() { + kv_cache_clear(); + system_tokens.clear(); + + if (!system_prompt.empty()) { + system_tokens = common_tokenize(ctx, system_prompt, add_bos_token); + + common_batch_clear(batch); + + for (int i = 0; i < (int)system_tokens.size(); ++i) + { + common_batch_add(batch, system_tokens[i], i, { 0 }, false); + } + + for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch) + { + const int32_t n_tokens = std::min(params.n_batch, (int32_t) (batch.n_tokens - i)); + llama_batch batch_view = { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, + }; + if (llama_decode(ctx, batch_view) != 0) + { + LOG("%s: llama_decode() failed\n", __func__); + return; + } + } + + // assign the system KV cache to all parallel sequences + for (int32_t i = 1; i < params.n_parallel; ++i) + { + llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size()); + } + } + + LOG("system prompt updated\n"); + system_need_update = false; + } + + void notify_system_prompt_changed() { + // release all slots + for (llama_client_slot &slot : slots) + { + slot.release(); + } + + system_need_update = true; + } + + void process_system_prompt_data(const json &sys_props) { + system_prompt = sys_props.value("prompt", ""); + name_user = sys_props.value("anti_prompt", ""); + name_assistant = sys_props.value("assistant_name", ""); + + + notify_system_prompt_changed(); + } + + static size_t find_stopping_strings(const std::string &text, const size_t last_token_size, + const stop_type type, llama_client_slot &slot) + { 
+ size_t stop_pos = std::string::npos; + + for (const std::string &word : slot.params.antiprompt) + { + size_t pos; + if (type == STOP_FULL) + { + const size_t tmp = word.size() + last_token_size; + const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; + pos = text.find(word, from_pos); + } + else + { + pos = find_partial_stop_string(word, text); + } + if (pos != std::string::npos && + (stop_pos == std::string::npos || pos < stop_pos)) + { + if (type == STOP_FULL) + { + slot.stopped_word = true; + slot.stopping_word = word; + slot.has_next_token = false; + } + stop_pos = pos; + } + } + + return stop_pos; + } + + bool process_token(completion_token_output &result, llama_client_slot &slot) { + // remember which tokens were sampled - used for repetition penalties during sampling + const std::string token_str = common_token_to_piece(ctx, result.tok); + slot.sampled = result.tok; + + // search stop word and delete it + slot.generated_text += token_str; + slot.has_next_token = true; + +/* + if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1) + { + // we can change penalty_prompt_tokens because it is always created from scratch each request + slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok); + } + */ + + // check if there is incomplete UTF-8 character at the end + bool incomplete = false; + for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i) + { + unsigned char c = slot.generated_text[slot.generated_text.size() - i]; + if ((c & 0xC0) == 0x80) + { + // continuation byte: 10xxxxxx + continue; + } + if ((c & 0xE0) == 0xC0) + { + // 2-byte character: 110xxxxx ... + incomplete = i < 2; + } + else if ((c & 0xF0) == 0xE0) + { + // 3-byte character: 1110xxxx ... + incomplete = i < 3; + } + else if ((c & 0xF8) == 0xF0) + { + // 4-byte character: 11110xxx ... 
+ incomplete = i < 4; + } + // else 1-byte character or invalid byte + break; + } + + if (!incomplete) + { + size_t pos = std::min(slot.sent_count, slot.generated_text.size()); + const std::string str_test = slot.generated_text.substr(pos); + bool is_stop_full = false; + size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot); + if (stop_pos != std::string::npos) + { + is_stop_full = true; + slot.generated_text.erase( + slot.generated_text.begin() + pos + stop_pos, + slot.generated_text.end()); + pos = std::min(slot.sent_count, slot.generated_text.size()); + } + else + { + is_stop_full = false; + stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot); + } + + // check if there is any token to predict + if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) + { + // no send the stop word in the response + result.text_to_send = slot.generated_text.substr(pos, std::string::npos); + slot.sent_count += result.text_to_send.size(); + // add the token to slot queue and cache + } + slot.add_token_string(result); + if (slot.params.stream) + { + send_partial_response(slot, result); + } + } + + if (incomplete) + { + slot.has_next_token = true; + } + + // check the limits + if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params)) + { + slot.stopped_limit = true; + slot.has_next_token = false; + } + + if (slot.n_past >= slot.n_ctx) { + slot.truncated = true; + slot.stopped_limit = true; + slot.has_next_token = false; + + LOG_VERBOSE("stopped due to running out of context capacity", {}); + } + + if (result.tok == llama_vocab_eos(vocab) || llama_vocab_is_eog(vocab, result.tok)) + { + slot.stopped_eos = true; + slot.has_next_token = false; + LOG_VERBOSE("eos token found", {}); + } + + LOG_VERBOSE("next token", { + {"token", result.tok}, + {"token_text", tokens_to_output_formatted_string(ctx, result.tok)}, + {"has_next_token", slot.has_next_token}, + {"n_remain", 
slot.n_remaining}, + {"num_tokens_predicted", slot.n_decoded}, + {"stopped_eos", slot.stopped_eos}, + {"stopped_word", slot.stopped_word}, + {"stopped_limit", slot.stopped_limit}, + {"stopping_word", slot.stopping_word}, + }); + + return slot.has_next_token; // continue + } + + bool process_images(llama_client_slot &slot) const + { + for (slot_image &img : slot.images) + { + if (!img.request_encode_image) + { + continue; + } + + if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) { + LOG("Error processing the given image"); + return false; + } + + img.request_encode_image = false; + } + + return slot.images.size() > 0; + } + + void send_error(task_server& task, const std::string &error) + { + LOG("task %i - error: %s\n", task.id, error.c_str()); + task_result res; + res.id = task.id; + res.multitask_id = task.multitask_id; + res.stop = false; + res.error = true; + res.result_json = { { "content", error } }; + queue_results.send(res); + } + + json get_formated_generation(llama_client_slot &slot) + { + std::vector samplers; + samplers.reserve(slot.sparams.samplers_sequence.size()); + for (const auto & sampler : slot.sparams.samplers_sequence) + { + samplers.emplace_back(llama_sampling_type_to_str(sampler)); + } + + return json { + {"n_ctx", slot.n_ctx}, + {"n_predict", slot.n_predict}, + {"model", params.model_alias}, + {"seed", slot.params.seed}, + {"temperature", slot.sparams.temp}, + {"dynatemp_range", slot.sparams.dynatemp_range}, + {"dynatemp_exponent", slot.sparams.dynatemp_exponent}, + {"top_k", slot.sparams.top_k}, + {"top_p", slot.sparams.top_p}, + {"min_p", slot.sparams.min_p}, + {"typical_p", slot.sparams.typical_p}, + {"repeat_last_n", slot.sparams.penalty_last_n}, + {"repeat_penalty", slot.sparams.penalty_repeat}, + {"presence_penalty", slot.sparams.penalty_present}, + {"frequency_penalty", slot.sparams.penalty_freq}, + {"mirostat", slot.sparams.mirostat}, + {"mirostat_tau", 
slot.sparams.mirostat_tau}, + {"mirostat_eta", slot.sparams.mirostat_eta}, + {"stop", slot.params.antiprompt}, + {"n_predict", slot.params.n_predict}, + {"n_keep", params.n_keep}, + {"stream", slot.params.stream}, + // {"logit_bias", slot.sparams.logit_bias}, + {"n_probs", slot.sparams.n_probs}, + {"min_keep", slot.sparams.min_keep}, + {"grammar", slot.sparams.grammar}, + {"samplers", samplers} + }; + } + + void send_partial_response(llama_client_slot &slot, completion_token_output tkn) + { + task_result res; + res.id = slot.task_id; + res.multitask_id = slot.multitask_id; + res.error = false; + res.stop = false; + + res.result_json = json + { + {"content", tkn.text_to_send}, + {"stop", false}, + {"slot_id", slot.id}, + {"multimodal", multimodal} + }; + + if (slot.sparams.n_probs > 0) + { + std::vector probs_output = {}; + const std::vector to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); + size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size()); + size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size()); + if (probs_pos < probs_stop_pos) + { + probs_output = std::vector(slot.generated_token_probs.begin() + probs_pos, slot.generated_token_probs.begin() + probs_stop_pos); + } + slot.sent_token_probs_index = probs_stop_pos; + res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output); + } + + if (slot.oaicompat) + { + res.result_json["oaicompat_token_ctr"] = slot.n_decoded; + res.result_json["model"] = slot.oaicompat_model; + } + + queue_results.send(res); + } + + void send_final_response(llama_client_slot &slot) + { + task_result res; + res.id = slot.task_id; + res.multitask_id = slot.multitask_id; + res.error = false; + res.stop = true; + + res.result_json = json + { + {"content", !slot.params.stream ? 
slot.generated_text : ""}, + {"slot_id", slot.id}, + {"stop", true}, + {"model", params.model_alias}, + {"tokens_predicted", slot.n_decoded}, + {"tokens_evaluated", slot.num_prompt_tokens}, + {"generation_settings", get_formated_generation(slot)}, + {"prompt", slot.prompt}, + {"truncated", slot.truncated}, + {"stopped_eos", slot.stopped_eos}, + {"stopped_word", slot.stopped_word}, + {"stopped_limit", slot.stopped_limit}, + {"stopping_word", slot.stopping_word}, + {"tokens_cached", slot.n_past}, + {"timings", slot.get_formated_timings()} + }; + + if (slot.sparams.n_probs > 0) + { + std::vector probs = {}; + if (!slot.params.stream && slot.stopped_word) + { + const std::vector stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); + probs = std::vector(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size()); + } + else + { + probs = std::vector( + slot.generated_token_probs.begin(), + slot.generated_token_probs.end()); + } + res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs); + } + + if (slot.oaicompat) + { + res.result_json["oaicompat_token_ctr"] = slot.n_decoded; + res.result_json["model"] = slot.oaicompat_model; + } + + queue_results.send(res); + } + + void send_embedding(llama_client_slot &slot, const llama_batch & batch) + { + task_result res; + res.id = slot.task_id; + res.multitask_id = slot.multitask_id; + res.error = false; + res.stop = true; + + const int n_embd = llama_model_n_embd(model); + if (!params.embedding) + { + LOG_WARNING("embedding disabled", { + {"params.embedding", params.embedding}, + }); + res.result_json = json + { + {"embedding", std::vector(n_embd, 0.0f)}, + }; + } + else + { + const float *data = llama_get_embeddings(ctx); + std::vector embd_res(n_embd, 0.0f); + std::vector> embedding; + for (int i = 0; i < batch.n_tokens; ++i) { + if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { + continue; + } + + const float * embd = llama_get_embeddings_seq(ctx, 
batch.seq_id[i][0]); + if (embd == NULL) { + embd = llama_get_embeddings_ith(ctx, i); + } + + if (embd == NULL) { + LOG("failed to get embeddings"); + + continue; + } + + // normalize only when there is pooling + // TODO: configurable + if (llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_NONE) { + common_embd_normalize(embd, embd_res.data(), n_embd, 2); + embedding.push_back(embd_res); + } else { + embedding.push_back({ embd, embd + n_embd }); + } + } + + // OAI compat + res.result_json = json + { + {"embedding", embedding[0] }, + }; + } + queue_results.send(res); + } + + void request_completion(int task_id, json data, bool infill, bool embedding, int multitask_id) + { + task_server task; + task.id = task_id; + task.target_id = 0; + task.data = std::move(data); + task.infill_mode = infill; + task.embedding_mode = embedding; + task.type = TASK_TYPE_COMPLETION; + task.multitask_id = multitask_id; + + // when a completion task's prompt array is not a singleton, we split it into multiple requests + // otherwise, it's a single-prompt task, we actually queue it + // if there's numbers in the prompt array it will be treated as an array of tokens + if (task.data.count("prompt") != 0 && task.data.at("prompt").size() > 1) { + bool numbers = false; + for (const auto& e : task.data.at("prompt")) { + if (e.is_number()) { + numbers = true; + break; + } + } + + // NOTE: split_multiprompt_task() does not handle a mix of strings and numbers, + // it will completely stall the server. I don't know where the bug for this is. + // + // if there are numbers, it needs to be treated like a single prompt, + // queue_tasks handles a mix of strings and numbers just fine. 
+ if (numbers) { + queue_tasks.post(task); + } else { + split_multiprompt_task(task_id, task); + } + } else { + queue_tasks.post(task); + } + } + + // for multiple images processing + bool ingest_images(llama_client_slot &slot, int n_batch) + { + int image_idx = 0; + + while (image_idx < (int) slot.images.size()) + { + slot_image &img = slot.images[image_idx]; + + // process prefix prompt + for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) + { + const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); + llama_batch batch_view = { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, + }; + if (llama_decode(ctx, batch_view)) + { + LOG("%s : failed to eval\n", __func__); + return false; + } + } + + // process image with llm + for (int i = 0; i < img.image_tokens; i += n_batch) + { + int n_eval = img.image_tokens - i; + if (n_eval > n_batch) + { + n_eval = n_batch; + } + + const int n_embd = llama_model_n_embd(model); + float * embd = img.image_embedding + i * n_embd; + llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, slot.n_past, 0); + if (llama_decode(ctx, llava_batch.batch)) + { + LOG("%s : failed to eval image\n", __func__); + return false; + } + slot.n_past += n_eval; + } + image_idx++; + + common_batch_clear(batch); + + // append prefix of next image + const auto json_prompt = (image_idx >= (int) slot.images.size()) ? 
+ slot.params.input_suffix : // no more images, then process suffix prompt + (json)(slot.images[image_idx].prefix_prompt); + + std::vector append_tokens = tokenize(json_prompt, false); // has next image + for (int i = 0; i < (int) append_tokens.size(); ++i) + { + common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true); + slot.n_past += 1; + } + } + + return true; + } + + void request_cancel(int task_id) + { + task_server task; + task.type = TASK_TYPE_CANCEL; + task.target_id = task_id; + queue_tasks.post(task); + } + + void split_multiprompt_task(int multitask_id, task_server& multiprompt_task) + { + int prompt_count = multiprompt_task.data.at("prompt").size(); + if (prompt_count <= 1) { + send_error(multiprompt_task, "error while handling multiple prompts"); + return; + } + + // generate all the ID for subtask + std::vector subtask_ids(prompt_count); + for (int i = 0; i < prompt_count; i++) + { + subtask_ids[i] = queue_tasks.get_new_id(); + } + + // queue up the multitask so we can track its subtask progression + queue_tasks.add_multitask(multitask_id, subtask_ids); + + // add subtasks + for (int i = 0; i < prompt_count; i++) + { + json subtask_data = multiprompt_task.data; + subtask_data["prompt"] = subtask_data["prompt"][i]; + + // subtasks inherit everything else (infill mode, embedding mode, etc.) 
+ request_completion(subtask_ids[i], subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id); + } + } + + void process_single_task(task_server& task) + { + switch (task.type) + { + case TASK_TYPE_COMPLETION: { + llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1)); + if (slot == nullptr) + { + // if no slot is available, we defer this task for processing later + LOG_VERBOSE("no slot is available", {{"task_id", task.id}}); + queue_tasks.defer(task); + break; + } + + if (task.data.contains("system_prompt")) + { + if (!all_slots_are_idle) { + send_error(task, "system prompt can only be updated when all slots are idle"); + break; + } + process_system_prompt_data(task.data["system_prompt"]); + + // reset cache_tokens for all slots + for (llama_client_slot &slot : slots) + { + slot.cache_tokens.clear(); + slot.n_past = 0; + slot.n_past_se = 0; + } + } + + slot->reset(); + + slot->infill = task.infill_mode; + slot->embedding = task.embedding_mode; + slot->task_id = task.id; + slot->multitask_id = task.multitask_id; + + if (!launch_slot_with_data(slot, task.data)) + { + // send error result + send_error(task, "internal_error"); + break; + } + } break; + case TASK_TYPE_CANCEL: { // release slot linked with the task id + for (auto & slot : slots) + { + if (slot.task_id == task.target_id) + { + slot.release(); + break; + } + } + } break; + case TASK_TYPE_NEXT_RESPONSE: { + // do nothing + } break; + } + } + + void on_finish_multitask(task_multi& multitask) + { + // all subtasks done == multitask is done + task_result result; + result.id = multitask.id; + result.stop = true; + result.error = false; + + // collect json results into one json result + std::vector result_jsons; + for (auto& subres : multitask.results) + { + result_jsons.push_back(subres.result_json); + result.error = result.error && subres.error; + } + result.result_json = json{ { "results", result_jsons } }; + queue_results.send(result); + } + + bool 
update_slots() { + if (system_need_update) + { + LOG_INFO("updating system prompt", {}); + update_system_prompt(); + } + + common_batch_clear(batch); + + if (all_slots_are_idle) + { + if (system_prompt.empty() && clean_kv_cache) + { + LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {}); + kv_cache_clear(); + } + return true; + } + + LOG_VERBOSE("posting NEXT_RESPONSE", {}); + task_server task; + task.type = TASK_TYPE_NEXT_RESPONSE; + task.target_id = -1; + queue_tasks.post(task); + + for (llama_client_slot &slot : slots) + { + if (slot.ga_n == 1) + { + if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx) + { + // this check is redundant (for good) + // we should never get here, because generation should already stopped in process_token() + + // START LOCALAI changes + // Temporary disable context-shifting as it can lead to infinite loops (issue: https://github.com/ggerganov/llama.cpp/issues/3969) + // See: https://github.com/mudler/LocalAI/issues/1333 + // Context is exhausted, release the slot + slot.release(); + send_final_response(slot); + slot.has_next_token = false; + LOG_ERROR("context is exhausted, release the slot", {}); + + continue; + // END LOCALAI changes + } + } + } + + // decode any currently ongoing sequences + LOG_VERBOSE("decoding ongoing sequences", {}); + for (auto & slot : slots) + { + // release the slot + if (slot.command == RELEASE) + { + slot.state = IDLE; + slot.command = NONE; + slot.t_last_used = ggml_time_us(); + + LOG_INFO("slot released", { + {"slot_id", slot.id}, + {"task_id", slot.task_id}, + {"n_ctx", n_ctx}, + {"n_past", slot.n_past}, + {"n_system_tokens", system_tokens.size()}, + {"n_cache_tokens", slot.cache_tokens.size()}, + {"truncated", slot.truncated} + }); + queue_tasks.notify_slot_changed(); + + continue; + } + + if (slot.state == IDLE) + { + continue; + } + + slot.i_batch = batch.n_tokens; + + const int32_t slot_npast = slot.n_past_se > 0 ? 
slot.n_past_se : slot.n_past; + + // TODO: we always have to take into account the "system_tokens" + // this is not great and needs to be improved somehow + common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true); + slot.n_past += 1; + } + + // process in chunks of params.n_batch + int32_t n_batch = params.n_batch; + + // assign workload to the slots + if (params.cont_batching || batch.n_tokens == 0) + { + for (auto & slot : slots) + { + const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get().empty()) || !slot.images.empty(); + + // empty prompt passed -> release the slot and send empty response + // note: infill mode allows empty prompt + if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt && !slot.infill) + { + slot.release(); + slot.print_timings(); + send_final_response(slot); + continue; + } + + // need process the prompt + if (slot.state == IDLE && slot.command == LOAD_PROMPT) + { + slot.state = PROCESSING; + slot.command = NONE; + std::vector prompt_tokens; + slot.t_start_process_prompt = ggml_time_us(); + slot.t_start_genereration = 0; + + if (slot.infill) + { + bool suff_rm_leading_spc = true; + if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) + { + params.input_suffix.erase(0, 1); + suff_rm_leading_spc = false; + } + auto prefix_tokens = tokenize(slot.params.input_prefix, false); + auto suffix_tokens = tokenize(slot.params.input_suffix, false); + + const int space_token = 29871; // TODO: this should not be hardcoded + if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) { + suffix_tokens.erase(suffix_tokens.begin()); + } + + prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model)); + prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_bos(vocab)); // always add BOS + prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model)); + prefix_tokens.insert(prefix_tokens.end(), 
suffix_tokens.begin(), suffix_tokens.end()); + prefix_tokens.push_back(llama_token_middle(model)); + prompt_tokens = prefix_tokens; + } + else + { + prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt + } + + slot.num_prompt_tokens = prompt_tokens.size(); + + if (slot.params.n_keep < 0) + { + slot.params.n_keep = slot.num_prompt_tokens; + } + slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); + + // if input prompt is too big, truncate it + if (slot.num_prompt_tokens >= slot.n_ctx) + { + const int n_left = slot.n_ctx - slot.params.n_keep; + const int n_block_size = n_left / 2; + const int erased_blocks = (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size; + + std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep); + new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end()); + + LOG_VERBOSE("input truncated", { + {"n_ctx", slot.n_ctx}, + {"n_keep", slot.params.n_keep}, + {"n_left", n_left}, + {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, + }); + slot.truncated = true; + prompt_tokens = new_tokens; + + slot.num_prompt_tokens = prompt_tokens.size(); + GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx); + } + + if (!slot.params.cache_prompt) + { + common_sampler_reset(slot.ctx_sampling); + + slot.n_past = 0; + slot.n_past_se = 0; + slot.ga_i = 0; + slot.num_prompt_tokens_processed = slot.num_prompt_tokens; + } + else + { + // push the prompt into the sampling context (do not apply grammar) + for (auto &token : prompt_tokens) + { + common_sampler_accept(slot.ctx_sampling, ctx, token, false); + } + + slot.n_past = common_part(slot.cache_tokens, prompt_tokens); + + // the last token of the cache is not in the KV cache until the next call to llama_decode + // (it was sampled, pushed into the "cache_tokens", but not yet put in the 
context) + if (slot.n_past > 0 && slot.n_past == (int32_t) slot.cache_tokens.size()) + { + slot.n_past -= 1; + } + + slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past; + + if (slot.ga_n != 1) + { + int ga_i = 0; + int32_t ga_n = slot.ga_n; + int32_t ga_w = slot.ga_w; + int32_t slot_npast = 0; + for (int k = 0; k < slot.n_past; ++k) + { + while (slot_npast >= ga_i + ga_w) { + const int bd = (ga_w/ga_n)*(ga_n - 1); + slot_npast -= bd; + ga_i += ga_w/ga_n; + } + slot_npast++; + } + slot.n_past_se = slot_npast; + slot.ga_i = ga_i; + } + + LOG_INFO("slot progression", { + { "slot_id", slot.id }, + { "task_id", slot.task_id }, + { "n_past", slot.n_past }, + { "num_prompt_tokens_processed", slot.num_prompt_tokens_processed } + }); + } + + slot.cache_tokens = prompt_tokens; + + if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0) + { + // we have to evaluate at least 1 token to generate logits. + LOG_INFO("we have to evaluate at least 1 token to generate logits", { + { "slot_id", slot.id }, + { "task_id", slot.task_id } + }); + slot.n_past--; + if (slot.ga_i > 0) + { + slot.n_past_se--; + } + } + + int p0 = (int) system_tokens.size() + slot.n_past; + LOG_INFO("kv cache rm [p0, end)", { + { "slot_id", slot.id }, + { "task_id", slot.task_id }, + { "p0", p0 } + }); + llama_kv_cache_seq_rm(ctx, slot.id, p0, -1); + + LOG_VERBOSE("prompt ingested", { + {"n_past", slot.n_past}, + {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)}, + {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())}, + }); + + const bool has_images = process_images(slot); + + // process the prefix of first image + std::vector prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens; + + int32_t slot_npast = slot.n_past_se > 0 ? 
slot.n_past_se : slot.n_past; + + int32_t ga_i = slot.ga_i; + int32_t ga_n = slot.ga_n; + int32_t ga_w = slot.ga_w; + + for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past) + { + if (slot.ga_n != 1) + { + while (slot_npast >= ga_i + ga_w) { + const int bd = (ga_w/ga_n)*(ga_n - 1); + slot_npast -= bd; + ga_i += ga_w/ga_n; + } + } + common_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false); + slot_npast++; + } + + if (has_images && !ingest_images(slot, n_batch)) + { + LOG_ERR("%s: failed processing images Slot id : %d, Task id: %d", + __func__, + slot.id, + slot.task_id + ); + // FIXME @phymbert: to be properly tested + // early returning without changing the slot state will block the slot for ever + // no one at the moment is checking the return value + return false; + } + + // extract the logits only for the last token + if (batch.n_tokens > 0) + { + batch.logits[batch.n_tokens - 1] = true; + } + + slot.n_decoded = 0; + slot.i_batch = batch.n_tokens - 1; + } + } + } + + if (batch.n_tokens == 0) + { + all_slots_are_idle = true; + return true; + } + + for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) + { + const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); + + for (auto & slot : slots) + { + if (slot.ga_n != 1) + { + // context extension via Self-Extend + while (slot.n_past_se >= slot.ga_i + slot.ga_w) + { + const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w; + const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1); + const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w; + + LOG("\n"); + LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); + LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); + LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", 
slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); + + llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd); + llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n); + llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd); + + slot.n_past_se -= bd; + + slot.ga_i += slot.ga_w / slot.ga_n; + + LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i); + } + slot.n_past_se += n_tokens; + } + } + + llama_batch batch_view = + { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, + }; + + const int ret = llama_decode(ctx, batch_view); + + if (ret != 0) + { + if (n_batch == 1 || ret < 0) + { + // if you get here, it means the KV cache is full - try increasing it via the context size + LOG("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); + return false; + } + + LOG("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2); + + // retry with half the batch size to try to find a free slot in the KV cache + n_batch /= 2; + i -= n_batch; + continue; + } + + for (auto & slot : slots) + { + if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) + { + continue; + } + + // prompt evaluated for embedding + if (slot.embedding) + { + send_embedding(slot, batch_view); + slot.release(); + slot.i_batch = -1; + continue; + } + + completion_token_output result; + const llama_token id = common_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i); + + common_sampler_accept(slot.ctx_sampling, ctx, id, true); + + slot.n_decoded += 1; + if (slot.n_decoded == 1) + { + slot.t_start_genereration = ggml_time_us(); + slot.t_prompt_processing = (slot.t_start_genereration - 
slot.t_start_process_prompt) / 1e3; + metrics.on_prompt_eval(slot); + } + + result.tok = id; + const auto * cur_p = common_sampler_get_candidates(slot.ctx_sampling); + + for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) { + result.probs.push_back({ + cur_p->data[i].id, + i >= cur_p->size ? 0.0f : cur_p->data[i].p, + }); + } + + if (!process_token(result, slot)) + { + slot.release(); + slot.print_timings(); + send_final_response(slot); + metrics.on_prediction(slot); + } + + slot.i_batch = -1; + } + } + + LOG_VERBOSE("slots updated", {}); + return true; + } + + void run_on_all_tasks_finished() { + update_slots(); + } +}; + +/* llama.cpp completion api semantics */ +static json format_partial_response( + llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector &probs +) { + json res = json + { + {"content", content }, + {"stop", false}, + {"slot_id", slot->id }, + {"multimodal", llama.multimodal } + }; + + if (slot->sparams.n_probs > 0) + { + res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); + } + + return res; +} + +struct token_translator +{ + llama_context * ctx; + std::string operator()(llama_token tok) const { return common_token_to_piece(ctx, tok); } + std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); } +}; + +static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, llama_client_slot *slot) +{ + auto & gtps = slot->generated_token_probs; + auto translator = token_translator{llama.ctx}; + auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); }; + const size_t len = std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen); + if (slot->generated_text.capacity() < slot->generated_text.size() + len) + { + slot->generated_text.reserve(slot->generated_text.size() + len); + } + for (const completion_token_output & cto : gtps) + { + slot->generated_text += 
translator(cto); + } +} + +std::function shutdown_handler; + +inline void signal_handler(int signal) { + exit(1); +} + + +///////////////////////////////// +//////////////////////////////// +//////// LOCALAI code starts below here +///////////////////////////////// +//////////////////////////////// + +bool loaded_model; // TODO: add a mutex for this, but happens only once loading the model + +// The class has a llama instance that is shared across all RPCs +llama_server_context llama; + +static void start_llama_server() { + // Wait for model to be loaded first + while (!loaded_model) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + + llama.queue_tasks.on_new_task(std::bind( + &llama_server_context::process_single_task, &llama, std::placeholders::_1)); + llama.queue_tasks.on_finish_multitask(std::bind( + &llama_server_context::on_finish_multitask, &llama, std::placeholders::_1)); + llama.queue_tasks.on_all_tasks_finished(std::bind( + &llama_server_context::run_on_all_tasks_finished, &llama)); + llama.queue_results.on_multitask_update(std::bind( + &llama_server_queue::update_multitask, + &llama.queue_tasks, + std::placeholders::_1, + std::placeholders::_2, + std::placeholders::_3 + )); + llama.queue_tasks.start_loop(); +} + +json parse_options(bool streaming, const backend::PredictOptions* predict, llama_server_context &llama) +{ + + // This is for example a slot data from the json data + // slot->params.stream = json_value(data, "stream", false); + // slot->params.cache_prompt = json_value(data, "cache_prompt", false); + // slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict); + // slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k); + // slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p); + // slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p); + // slot->sparams.temp = json_value(data, "temperature", default_sparams.temp); + // 
slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n); + // slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat); + // slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq); + // slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present); + // slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); + // slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); + // slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); + // slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); + // slot->params.seed = json_value(data, "seed", default_params.seed); + // slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); + // slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); + + // Create now a json data from the prediction options instead + // + json data; + data["stream"] = streaming; + data["cache_prompt"] = predict->promptcacheall(); + data["n_predict"] = predict->tokens() == 0 ? 
-1 : predict->tokens(); + data["top_k"] = predict->topk(); + data["top_p"] = predict->topp(); + data["typical_p"] = predict->typicalp(); + data["temperature"] = predict->temperature(); + data["repeat_last_n"] = predict->repeat(); + data["repeat_penalty"] = predict->penalty(); + data["frequency_penalty"] = predict->frequencypenalty(); + data["presence_penalty"] = predict->presencepenalty(); + data["mirostat"] = predict->mirostat(); + data["mirostat_tau"] = predict->mirostattau(); + data["mirostat_eta"] = predict->mirostateta(); + data["n_keep"] = predict->nkeep(); + data["seed"] = predict->seed(); + data["grammar"] = predict->grammar(); + data["prompt"] = predict->prompt(); + data["ignore_eos"] = predict->ignoreeos(); + data["embeddings"] = predict->embeddings(); + + // Add the correlationid to json data + data["correlation_id"] = predict->correlationid(); + + // for each image in the request, add the image data + // + for (int i = 0; i < predict->images_size(); i++) { + data["image_data"].push_back(json + { + {"id", i}, + {"data", predict->images(i)}, + }); + } + + data["stop"] = predict->stopprompts(); + // data["n_probs"] = predict->nprobs(); + //TODO: images, + + return data; +} + +// static void parse_options_completion(bool streaming,const backend::PredictOptions* predict, llama_server_context &llama) +// { +// // https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L673 +// gpt_params default_params; + +// llama.stream = streaming; +// llama.params.n_predict = predict->tokens() == 0 ? 
-1 : predict->tokens(); +// llama.params.sparams.top_k = predict->topk(); +// llama.params.sparams.top_p = predict->topp(); +// llama.params.sparams.typical_p = predict->typicalp(); +// llama.params.sparams.penalty_last_n = predict->repeat(); +// llama.params.sparams.temp = predict->temperature(); +// llama.params.sparams.penalty_repeat = predict->penalty(); +// llama.params.sparams.penalty_present = predict->presencepenalty(); +// llama.params.sparams.penalty_freq = predict->frequencypenalty(); +// llama.params.sparams.mirostat = predict->mirostat(); +// llama.params.sparams.mirostat_tau = predict->mirostattau(); +// llama.params.sparams.mirostat_eta = predict->mirostateta(); +// llama.params.n_keep = predict->nkeep(); +// llama.params.seed = predict->seed(); +// llama.params.sparams.grammar = predict->grammar(); +// // llama.params.n_probs = predict-> +// llama.params.prompt = predict->prompt(); + +// llama.params.sparams.logit_bias.clear(); + +// if (predict->ignoreeos()) +// { +// llama.params.sparams.logit_bias[llama_token_eos(llama.model)] = -INFINITY; +// } + +// // const auto &logit_bias = body.find("logit_bias"); +// // if (logit_bias != body.end() && logit_bias->is_array()) +// // { +// // const int n_vocab = llama_n_vocab(llama.model); +// // for (const auto &el : *logit_bias) +// // { +// // if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) +// // { +// // llama_token tok = el[0].get(); +// // if (tok >= 0 && tok < n_vocab) +// // { +// // if (el[1].is_number()) +// // { +// // llama.params.logit_bias[tok] = el[1].get(); +// // } +// // else if (el[1].is_boolean() && !el[1].get()) +// // { +// // llama.params.logit_bias[tok] = -INFINITY; +// // } +// // } +// // } +// // } +// // } + +// llama.params.antiprompt.clear(); +// for (const std::string& stopPrompt : predict->stopprompts()) { +// if (!stopPrompt.empty()) +// { +// llama.params.antiprompt.push_back(stopPrompt); +// } +// } +// } + +static void params_parse(const 
backend::ModelOptions* request, + gpt_params & params, llama_server_context &llama) { + + // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809 + + params.model = request->modelfile(); + if (!request->mmproj().empty()) { + // get the directory of modelfile + std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\")); + params.mmproj.path = model_dir + "/"+ request->mmproj(); + } + // params.model_alias ?? + params.model_alias = request->modelfile(); + if (!request->cachetypekey().empty()) { + params.cache_type_k = request->cachetypekey(); + } + if (!request->cachetypevalue().empty()) { + params.cache_type_v = request->cachetypevalue(); + } + params.n_ctx = request->contextsize(); + //params.memory_f16 = request->f16memory(); + params.n_threads = request->threads(); + params.n_gpu_layers = request->ngpulayers(); + params.n_batch = request->nbatch(); + // Set params.n_parallel by environment variable (LLAMA_PARALLEL), defaults to 1 + //params.n_parallel = 1; + const char *env_parallel = std::getenv("LLAMACPP_PARALLEL"); + if (env_parallel != NULL) { + params.n_parallel = std::stoi(env_parallel); + params.cont_batching = true; + } else { + params.n_parallel = 1; + } + + const char *llama_grpc_servers = std::getenv("LLAMACPP_GRPC_SERVERS"); + if (llama_grpc_servers != NULL) { + add_rpc_devices(std::string(llama_grpc_servers)); + } + + // decode options. Options are in form optname:optvale, or if booleans only optname. 
+ for (int i = 0; i < request->options_size(); i++) { + std::string opt = request->options(i); + char *optname = strtok(&opt[0], ":"); + char *optval = strtok(NULL, ":"); + if (optval == NULL) { + optval = "true"; + } + + if (!strcmp(optname, "gpu")) { + llama.has_gpu = true; + } + } + + // TODO: Add yarn + + if (!request->tensorsplit().empty()) { + std::string arg_next = request->tensorsplit(); + + // split string by , and / + const std::regex regex{ R"([,/]+)" }; + std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; + std::vector split_arg{ it, {} }; + + GGML_ASSERT(split_arg.size() <= llama_max_devices()); + + for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) { + if (i_device < split_arg.size()) { + params.tensor_split[i_device] = std::stof(split_arg[i_device]); + } + else { + params.tensor_split[i_device] = 0.0f; + } + } + } + + if (!request->maingpu().empty()) { + params.main_gpu = std::stoi(request->maingpu()); + } + if (!request->loraadapter().empty() && !request->lorabase().empty()) { + float scale_factor = 1.0f; + if (request->lorascale() != 0.0f) { + scale_factor = request->lorascale(); + } + // get the directory of modelfile + std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\")); + params.lora_adapters.push_back({ model_dir + "/"+request->loraadapter(), scale_factor }); + } + params.use_mlock = request->mlock(); + params.use_mmap = request->mmap(); + params.flash_attn = (request->flashattention() == "true" || request->flashattention() == "1"); + params.no_kv_offload = request->nokvoffload(); + params.ctx_shift = false; // We control context-shifting in any case (and we disable it as it could just lead to infinite loops) + + params.embedding = request->embeddings(); + + if (request->ropescaling() == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (request->ropescaling() == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } + else { 
params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + if ( request->yarnextfactor() != 0.0f ) { + params.yarn_ext_factor = request->yarnextfactor(); + } + if ( request->yarnattnfactor() != 0.0f ) { + params.yarn_attn_factor = request->yarnattnfactor(); + } + if ( request->yarnbetafast() != 0.0f ) { + params.yarn_beta_fast = request->yarnbetafast(); + } + if ( request->yarnbetaslow() != 0.0f ) { + params.yarn_beta_slow = request->yarnbetaslow(); + } + if ( request->ropefreqbase() != 0.0f ) { + params.rope_freq_base = request->ropefreqbase(); + } + if ( request->ropefreqscale() != 0.0f ) { + params.rope_freq_scale = request->ropefreqscale(); + } + + if (request->grammartriggers_size() > 0) { + LOG_INFO("configuring grammar triggers", {}); + llama.grammar_lazy = true; + for (int i = 0; i < request->grammartriggers_size(); i++) { + common_grammar_trigger trigger; + trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_WORD; + trigger.value = request->grammartriggers(i).word(); + // trigger.at_start = request->grammartriggers(i).at_start(); + llama.grammar_triggers.push_back(trigger); + LOG_INFO("grammar trigger", { + { "word", trigger.value }, + }); + } + } +} + + +// GRPC Server start +class BackendServiceImpl final : public backend::Backend::Service { +public: + grpc::Status Health(ServerContext* context, const backend::HealthMessage* request, backend::Reply* reply) { + // Implement Health RPC + reply->set_message("OK"); + return Status::OK; + } + + grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) { + // Implement LoadModel RPC + gpt_params params; + params_parse(request, params, llama); + + llama_backend_init(); + llama_numa_init(params.numa); + + // load the model + if (!llama.load_model(params)) + { + result->set_message("Failed loading model"); + result->set_success(false); + return Status::CANCELLED; + } + llama.initialize(); + result->set_message("Loading succeeded"); + result->set_success(true); + 
loaded_model = true; + return Status::OK; + } + grpc::Status PredictStream(grpc::ServerContext* context, const backend::PredictOptions* request, grpc::ServerWriter* writer) override { + json data = parse_options(true, request, llama); + const int task_id = llama.queue_tasks.get_new_id(); + llama.queue_results.add_waiting_task_id(task_id); + llama.request_completion(task_id, data, false, false, -1); + while (true) + { + task_result result = llama.queue_results.recv(task_id); + if (!result.error) { + const std::string str = + "data: " + + result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; + LOG_VERBOSE("data stream", { + { "to_send", str } + }); + + backend::Reply reply; + // print it + std::string completion_text = result.result_json.value("content", ""); + + reply.set_message(completion_text); + int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0); + reply.set_tokens(tokens_predicted); + int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0); + reply.set_prompt_tokens(tokens_evaluated); + + if (result.result_json.contains("timings")) { + double timing_prompt_processing = result.result_json.at("timings").value("prompt_ms", 0.0); + reply.set_timing_prompt_processing(timing_prompt_processing); + double timing_token_generation = result.result_json.at("timings").value("predicted_ms", 0.0); + reply.set_timing_token_generation(timing_token_generation); + } + + // Log Request Correlation Id + LOG_VERBOSE("correlation:", { + { "id", data["correlation_id"] } + }); + + // Send the reply + writer->Write(reply); + + if (result.stop) { + break; + } + } else { + break; + } + } + + return grpc::Status::OK; + } + + + grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) { + json data = parse_options(false, request, llama); + const int task_id = llama.queue_tasks.get_new_id(); + llama.queue_results.add_waiting_task_id(task_id); + 
llama.request_completion(task_id, data, false, false, -1); + std::string completion_text; + task_result result = llama.queue_results.recv(task_id); + if (!result.error && result.stop) { + + // Log Request Correlation Id + LOG_VERBOSE("correlation:", { + { "id", data["correlation_id"] } + }); + + completion_text = result.result_json.value("content", ""); + int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0); + int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0); + reply->set_prompt_tokens(tokens_evaluated); + reply->set_tokens(tokens_predicted); + reply->set_message(completion_text); + + if (result.result_json.contains("timings")) { + double timing_prompt_processing = result.result_json.at("timings").value("prompt_ms", 0.0); + reply->set_timing_prompt_processing(timing_prompt_processing); + double timing_token_generation = result.result_json.at("timings").value("predicted_ms", 0.0); + reply->set_timing_token_generation(timing_token_generation); + } + } + else + { + return grpc::Status::OK; + } + + return grpc::Status::OK; + } + + /// https://github.com/ggerganov/llama.cpp/blob/aa2341298924ac89778252015efcb792f2df1e20/examples/server/server.cpp#L2969 + grpc::Status Embedding(ServerContext* context, const backend::PredictOptions* request, backend::EmbeddingResult* embeddingResult) { + json data = parse_options(false, request, llama); + const int task_id = llama.queue_tasks.get_new_id(); + llama.queue_results.add_waiting_task_id(task_id); + llama.request_completion(task_id, { {"prompt", data["embeddings"]}, { "n_predict", 0}, {"image_data", ""} }, false, true, -1); + // get the result + task_result result = llama.queue_results.recv(task_id); + //std::cout << "Embedding result JSON" << result.result_json.dump() << std::endl; + llama.queue_results.remove_waiting_task_id(task_id); + if (!result.error && result.stop) { + std::vector embeddings = result.result_json.value("embedding", std::vector()); + // loop the vector and set 
the embeddings results + for (int i = 0; i < embeddings.size(); i++) { + embeddingResult->add_embeddings(embeddings[i]); + } + } + else + { + return grpc::Status::OK; + } + + return grpc::Status::OK; + } + + grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response){ + json data = parse_options(false, request, llama); + + std::vector tokens = llama.tokenize(data["prompt"],false); + + for (int i=0 ; i< tokens.size(); i++){ + response->add_tokens(tokens[i]); + } + + return grpc::Status::OK; + } + + grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) { + llama_client_slot* active_slot = llama.get_active_slot(); + + if (active_slot != nullptr) { + // Calculate the tokens per second using existing logic + double tokens_per_second = 1e3 / active_slot->t_token_generation * active_slot->n_decoded; + + // Populate the response with metrics + response->set_slot_id(active_slot->id); + response->set_prompt_json_for_slot(active_slot->prompt.dump()); + response->set_tokens_per_second(tokens_per_second); + response->set_tokens_generated(active_slot->n_decoded); + response->set_prompt_tokens_processed(active_slot->num_prompt_tokens_processed); + } else { + // Handle case when no active slot exists + response->set_slot_id(0); + response->set_prompt_json_for_slot(""); + response->set_tokens_per_second(0); + response->set_tokens_generated(0); + response->set_prompt_tokens_processed(0); + } + + return grpc::Status::OK; + } +}; + +void RunServer(const std::string& server_address) { + BackendServiceImpl service; + + ServerBuilder builder; + builder.AddListeningPort(server_address, grpc::InsecureServerCredentials()); + builder.RegisterService(&service); + + std::unique_ptr server(builder.BuildAndStart()); + std::cout << "Server listening on " << server_address << std::endl; + server->Wait(); +} + +int main(int argc, char** argv) { + std::string 
server_address("localhost:50051"); + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = signal_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); + sigaction(SIGTERM, &sigint_action, NULL); +#elif defined (_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); +#endif + + // Define long and short options + struct option long_options[] = { + {"addr", required_argument, nullptr, 'a'}, + {nullptr, 0, nullptr, 0} + }; + + // Parse command-line arguments + int option; + int option_index = 0; + while ((option = getopt_long(argc, argv, "a:", long_options, &option_index)) != -1) { + switch (option) { + case 'a': + server_address = optarg; + break; + default: + std::cerr << "Usage: " << argv[0] << " [--addr=
] or [-a
]" << std::endl; + return 1; + } + } + + // run the HTTP server in a thread - see comment below + std::thread t([&]() + { + RunServer(server_address); + return 0; + }); + + + //); + start_llama_server(); + std::cout << "stopping" << std::endl; + + t.join(); + + llama_backend_free(); + return 0; +} diff --git a/backend/cpp/ik-llama-cpp/package.sh b/backend/cpp/ik-llama-cpp/package.sh new file mode 100644 index 000000000..56d430563 --- /dev/null +++ b/backend/cpp/ik-llama-cpp/package.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Script to copy the appropriate libraries based on architecture +# This script is used in the final stage of the Dockerfile + +set -e + +CURDIR=$(dirname "$(realpath $0)") +REPO_ROOT="${CURDIR}/../../.." + +# Create lib directory +mkdir -p $CURDIR/package/lib + +cp -avrf $CURDIR/ik-llama-cpp-* $CURDIR/package/ +cp -rfv $CURDIR/run.sh $CURDIR/package/ + +# Detect architecture and copy appropriate libraries +if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then + # x86_64 architecture + echo "Detected x86_64 architecture, copying x86_64 libraries..." + cp -arfLv /lib64/ld-linux-x86-64.so.2 $CURDIR/package/lib/ld.so + cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6 + cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1 + cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6 + cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6 + cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1 + cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2 + cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1 + cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0 +elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then + # ARM64 architecture + echo "Detected ARM64 architecture, copying ARM64 libraries..." 
+ cp -arfLv /lib/ld-linux-aarch64.so.1 $CURDIR/package/lib/ld.so + cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6 + cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1 + cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6 + cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6 + cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1 + cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2 + cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1 + cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0 +else + echo "Error: Could not detect architecture" + exit 1 +fi + +# Package GPU libraries based on BUILD_TYPE +# The GPU library packaging script will detect BUILD_TYPE and copy appropriate GPU libraries +GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh" +if [ -f "$GPU_LIB_SCRIPT" ]; then + echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..." 
+ source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib" + package_gpu_libs +fi + +echo "Packaging completed successfully" +ls -liah $CURDIR/package/ +ls -liah $CURDIR/package/lib/ diff --git a/backend/cpp/ik-llama-cpp/patches/0001-fix-missing-cstdint-include.patch b/backend/cpp/ik-llama-cpp/patches/0001-fix-missing-cstdint-include.patch new file mode 100644 index 000000000..ef5710fc4 --- /dev/null +++ b/backend/cpp/ik-llama-cpp/patches/0001-fix-missing-cstdint-include.patch @@ -0,0 +1,10 @@ +--- a/ggml/src/iqk/iqk_common.h ++++ b/ggml/src/iqk/iqk_common.h +@@ -9,6 +9,7 @@ + #pragma once + + #include "iqk_config.h" ++#include + + #if defined IQK_IMPLEMENT + diff --git a/backend/cpp/ik-llama-cpp/prepare.sh b/backend/cpp/ik-llama-cpp/prepare.sh new file mode 100644 index 000000000..fb0ba7624 --- /dev/null +++ b/backend/cpp/ik-llama-cpp/prepare.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +## Patches + +## Apply patches from the `patches` directory +if [ -d "patches" ]; then + for patch in $(ls patches); do + echo "Applying patch $patch" + patch -d llama.cpp/ -p1 < patches/$patch + done +fi + +set -e + +cp -r CMakeLists.txt llama.cpp/examples/grpc-server/ +cp -r grpc-server.cpp llama.cpp/examples/grpc-server/ +cp -r utils.hpp llama.cpp/examples/grpc-server/ +cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/examples/grpc-server/ + +## Copy clip/llava files for multimodal support (built as myclip library) +cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h +cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp +cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp +# Prepend llama.h include to llava.h +echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h +cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h +# Copy clip-impl.h if it exists +if [ -f llama.cpp/examples/llava/clip-impl.h ]; then + cp -rfv llama.cpp/examples/llava/clip-impl.h 
llama.cpp/examples/grpc-server/clip-impl.h +fi +# Copy stb_image.h +if [ -f llama.cpp/vendor/stb/stb_image.h ]; then + cp -rfv llama.cpp/vendor/stb/stb_image.h llama.cpp/examples/grpc-server/stb_image.h +elif [ -f llama.cpp/common/stb_image.h ]; then + cp -rfv llama.cpp/common/stb_image.h llama.cpp/examples/grpc-server/stb_image.h +fi + +## Fix API compatibility in llava.cpp (llama_n_embd -> llama_model_n_embd) +if [ -f llama.cpp/examples/grpc-server/llava.cpp ]; then + sed -i 's/llama_n_embd(/llama_model_n_embd(/g' llama.cpp/examples/grpc-server/llava.cpp +fi + +set +e +if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then + echo "grpc-server already added" +else + echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt +fi +set -e diff --git a/backend/cpp/ik-llama-cpp/run.sh b/backend/cpp/ik-llama-cpp/run.sh new file mode 100644 index 000000000..1c4ee2a69 --- /dev/null +++ b/backend/cpp/ik-llama-cpp/run.sh @@ -0,0 +1,40 @@ +#!/bin/bash +set -ex + +# Get the absolute current dir where the script is located +CURDIR=$(dirname "$(realpath $0)") + +cd / + +echo "CPU info:" +grep -e "model\sname" /proc/cpuinfo | head -1 +grep -e "flags" /proc/cpuinfo | head -1 + +# ik_llama.cpp requires AVX2 — default to avx2 binary +BINARY=ik-llama-cpp-avx2 + +if [ -e $CURDIR/ik-llama-cpp-fallback ] && ! 
grep -q -e "\savx2\s" /proc/cpuinfo ; then
+    echo "CPU: AVX2 NOT found, using fallback"
+    BINARY=ik-llama-cpp-fallback
+fi
+
+# Extend ld library path with the dir where this script is located/lib
+if [ "$(uname)" == "Darwin" ]; then
+    export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
+    #export DYLD_FALLBACK_LIBRARY_PATH=$CURDIR/lib:$DYLD_FALLBACK_LIBRARY_PATH
+else
+    export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
+fi
+
+# If there is a lib/ld.so, use it
+if [ -f $CURDIR/lib/ld.so ]; then
+    echo "Using lib/ld.so"
+    echo "Using binary: $BINARY"
+    exec $CURDIR/lib/ld.so $CURDIR/$BINARY "$@"
+fi
+
+echo "Using binary: $BINARY"
+exec $CURDIR/$BINARY "$@"
+
+# We should never reach this point, however just in case we do, run fallback
+exec $CURDIR/ik-llama-cpp-fallback "$@"
diff --git a/backend/cpp/ik-llama-cpp/utils.hpp b/backend/cpp/ik-llama-cpp/utils.hpp
new file mode 100644
index 000000000..e5cf2a009
--- /dev/null
+++ b/backend/cpp/ik-llama-cpp/utils.hpp
@@ -0,0 +1,483 @@
+// https://github.com/ggerganov/llama.cpp/blob/master/examples/server/utils.hpp
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <set>
+#include <mutex>
+#include <condition_variable>
+#include <unordered_map>
+
+#include "json.hpp"
+
+#include "clip.h"
+
+using json = nlohmann::json;
+
+extern bool server_verbose;
+
+#ifndef SERVER_VERBOSE
+#define SERVER_VERBOSE 1
+#endif
+
+#if SERVER_VERBOSE != 1
+#define LOG_VERBOSE(MSG, ...)
+#else
+#define LOG_VERBOSE(MSG, ...)                                            \
+    do                                                                   \
+    {                                                                    \
+        if (server_verbose)                                              \
+        {                                                                \
+            server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
+        }                                                                \
+    } while (0)
+#endif
+
+#define LOG_ERROR(  MSG, ...) server_log("ERROR",   __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_INFO(   MSG, ...)
server_log("INFO",    __func__, __LINE__, MSG, __VA_ARGS__)
+
+//
+// parallel
+//
+
+enum server_state {
+    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
+    SERVER_STATE_READY,          // Server is ready and model is loaded
+    SERVER_STATE_ERROR           // An error occurred, load_model failed
+};
+
+enum task_type {
+    TASK_TYPE_COMPLETION,
+    TASK_TYPE_CANCEL,
+    TASK_TYPE_NEXT_RESPONSE
+};
+
+struct task_server {
+    int id = -1; // to be filled by llama_server_queue
+    int target_id;
+    task_type type;
+    json data;
+    bool infill_mode = false;
+    bool embedding_mode = false;
+    int multitask_id = -1;
+};
+
+struct task_result {
+    int id;
+    int multitask_id = -1;
+    bool stop;
+    bool error;
+    json result_json;
+};
+
+struct task_multi {
+    int id;
+    std::set<int> subtasks_remaining{};
+    std::vector<task_result> results{};
+};
+
+// TODO: can become bool if we can't find use of more states
+enum slot_state
+{
+    IDLE,
+    PROCESSING,
+};
+
+enum slot_command
+{
+    NONE,
+    LOAD_PROMPT,
+    RELEASE,
+};
+
+struct slot_params
+{
+    bool stream = true;
+    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
+
+    uint32_t seed = -1; // RNG seed
+    int32_t n_keep = 0; // number of tokens to keep from initial prompt
+    int32_t n_predict = -1; // new tokens to predict
+
+    std::vector<std::string> antiprompt;
+
+    json input_prefix;
+    json input_suffix;
+};
+
+struct slot_image
+{
+    int32_t id;
+
+    bool request_encode_image = false;
+    float * image_embedding = nullptr;
+    int32_t image_tokens = 0;
+
+    clip_image_u8 * img_data;
+
+    std::string prefix_prompt; // before of this image
+};
+
+// completion token output with probabilities
+struct completion_token_output
+{
+    struct token_prob
+    {
+        llama_token tok;
+        float prob;
+    };
+
+    std::vector<token_prob> probs;
+    llama_token tok;
+    std::string text_to_send;
+};
+
+static inline void server_log(const char *level, const char *function, int line,
+                              const char *message, const nlohmann::ordered_json &extra)
+{
+    nlohmann::ordered_json log
+    {
{"timestamp", time(nullptr)},
+        {"level",     level},
+        {"function",  function},
+        {"line",      line},
+        {"message",   message},
+    };
+
+    if (!extra.empty())
+    {
+        log.merge_patch(extra);
+    }
+
+    const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
+    printf("%.*s\n", (int)str.size(), str.data());
+    fflush(stdout);
+}
+
+//
+// server utils
+//
+
+template <typename T>
+static T json_value(const json &body, const std::string &key, const T &default_value)
+{
+    // Fallback null to default value
+    return body.contains(key) && !body.at(key).is_null()
+               ? body.value(key, default_value)
+               : default_value;
+}
+
+inline std::string format_chatml(std::vector<json> messages)
+{
+    std::ostringstream chatml_msgs;
+
+    for (auto it = messages.begin(); it != messages.end(); ++it) {
+        chatml_msgs << "<|im_start|>"
+                    << json_value(*it, "role", std::string("user")) << '\n';
+        chatml_msgs << json_value(*it, "content", std::string(""))
+                    << "<|im_end|>\n";
+    }
+
+    chatml_msgs << "<|im_start|>assistant" << '\n';
+
+    return chatml_msgs.str();
+}
+
+//
+// work queue utils
+//
+
+struct llama_server_queue {
+    int id = 0;
+    std::mutex mutex_tasks;
+    // queues
+    std::vector<task_server> queue_tasks;
+    std::vector<task_server> queue_tasks_deferred;
+    std::vector<task_multi> queue_multitasks;
+    std::condition_variable condition_tasks;
+    // callback functions
+    std::function<void(task_server&)> callback_new_task;
+    std::function<void(task_multi&)> callback_finish_multitask;
+    std::function<void(void)> callback_all_task_finished;
+
+    // Add a new task to the end of the queue
+    int post(task_server task) {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        if (task.id == -1) {
+            task.id = id++;
+        }
+        queue_tasks.push_back(std::move(task));
+        condition_tasks.notify_one();
+        return task.id;
+    }
+
+    // Add a new task, but defer until one slot is available
+    void defer(task_server task) {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        queue_tasks_deferred.push_back(std::move(task));
+    }
+
+    // Get the next id for creating a new task
+    int get_new_id() {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        return id++;
+    }
+
+    // Register function to process a new task
+    void on_new_task(std::function<void(task_server&)> callback) {
+        callback_new_task = callback;
+    }
+
+    // Register function to process a multitask
+    void on_finish_multitask(std::function<void(task_multi&)> callback) {
+        callback_finish_multitask = callback;
+    }
+
+    // Register the function to be called when the batch of tasks is finished
+    void on_all_tasks_finished(std::function<void(void)> callback) {
+        callback_all_task_finished = callback;
+    }
+
+    // Call when the state of one slot is changed
+    void notify_slot_changed() {
+        // move deferred tasks back to main loop
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        for (auto & task : queue_tasks_deferred) {
+            queue_tasks.push_back(std::move(task));
+        }
+        queue_tasks_deferred.clear();
+    }
+
+    // Start the main loop. This call is blocking
+    [[noreturn]]
+    void start_loop() {
+        while (true) {
+            // new task arrived
+            LOG_VERBOSE("have new task", {});
+            {
+                while (true)
+                {
+                    std::unique_lock<std::mutex> lock(mutex_tasks);
+                    if (queue_tasks.empty()) {
+                        lock.unlock();
+                        break;
+                    }
+                    task_server task = queue_tasks.front();
+                    queue_tasks.erase(queue_tasks.begin());
+                    lock.unlock();
+                    LOG_VERBOSE("callback_new_task", {});
+                    callback_new_task(task);
+                }
+                LOG_VERBOSE("callback_all_task_finished", {});
+                // process and update all the multitasks
+                auto queue_iterator = queue_multitasks.begin();
+                while (queue_iterator != queue_multitasks.end())
+                {
+                    if (queue_iterator->subtasks_remaining.empty())
+                    {
+                        // all subtasks done == multitask is done
+                        task_multi current_multitask = *queue_iterator;
+                        callback_finish_multitask(current_multitask);
+                        // remove this multitask
+                        queue_iterator = queue_multitasks.erase(queue_iterator);
+                    }
+                    else
+                    {
+                        ++queue_iterator;
+                    }
+                }
+                // all tasks in the current loop is finished
+                callback_all_task_finished();
+            }
+            LOG_VERBOSE("wait for new task", {});
+            // wait for new task
+            {
+                std::unique_lock<std::mutex> lock(mutex_tasks);
+                if (queue_tasks.empty()) {
+                    condition_tasks.wait(lock, [&]{
+                        return
!queue_tasks.empty();
+                    });
+                }
+            }
+        }
+    }
+
+    //
+    // functions to manage multitasks
+    //
+
+    // add a multitask by specifying the id of all subtask (subtask is a task_server)
+    void add_multitask(int multitask_id, std::vector<int>& sub_ids)
+    {
+        std::lock_guard<std::mutex> lock(mutex_tasks);
+        task_multi multi;
+        multi.id = multitask_id;
+        std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
+        queue_multitasks.push_back(multi);
+    }
+
+    // update the remaining subtasks, while appending results to multitask
+    void update_multitask(int multitask_id, int subtask_id, task_result& result)
+    {
+        std::lock_guard<std::mutex> lock(mutex_tasks);
+        for (auto& multitask : queue_multitasks)
+        {
+            if (multitask.id == multitask_id)
+            {
+                multitask.subtasks_remaining.erase(subtask_id);
+                multitask.results.push_back(result);
+            }
+        }
+    }
+};
+
+struct llama_server_response {
+    typedef std::function<void(int, int, task_result&)> callback_multitask_t;
+    callback_multitask_t callback_update_multitask;
+    // for keeping track of all tasks waiting for the result
+    std::set<int> waiting_task_ids;
+    // the main result queue
+    std::vector<task_result> queue_results;
+    std::mutex mutex_results;
+    std::condition_variable condition_results;
+
+    void add_waiting_task_id(int task_id) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+        waiting_task_ids.insert(task_id);
+    }
+
+    void remove_waiting_task_id(int task_id) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+        waiting_task_ids.erase(task_id);
+    }
+
+    // This function blocks the thread until there is a response for this task_id
+    task_result recv(int task_id) {
+        while (true)
+        {
+            std::unique_lock<std::mutex> lock(mutex_results);
+            condition_results.wait(lock, [&]{
+                return !queue_results.empty();
+            });
+            LOG_VERBOSE("condition_results unblock", {});
+
+            for (int i = 0; i < (int) queue_results.size(); i++)
+            {
+                if (queue_results[i].id == task_id)
+                {
+                    assert(queue_results[i].multitask_id == -1);
+                    task_result res = queue_results[i];
+                    queue_results.erase(queue_results.begin()
+ i);
+                    return res;
+                }
+            }
+        }
+
+        // should never reach here
+    }
+
+    // Register the function to update multitask
+    void on_multitask_update(callback_multitask_t callback) {
+        callback_update_multitask = callback;
+    }
+
+    // Send a new result to a waiting task_id
+    void send(task_result result) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+        LOG_VERBOSE("send new result", {});
+        for (auto& task_id : waiting_task_ids) {
+            // LOG_TEE("waiting task id %i \n", task_id);
+            // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
+            if (result.multitask_id == task_id)
+            {
+                LOG_VERBOSE("callback_update_multitask", {});
+                callback_update_multitask(task_id, result.id, result);
+                continue;
+            }
+
+            if (result.id == task_id)
+            {
+                LOG_VERBOSE("queue_results.push_back", {});
+                queue_results.push_back(result);
+                condition_results.notify_one();
+                return;
+            }
+        }
+    }
+};
+
+//
+// base64 utils (TODO: move to common in the future)
+//
+
+static const std::string base64_chars =
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+    "abcdefghijklmnopqrstuvwxyz"
+    "0123456789+/";
+
+static inline bool is_base64(uint8_t c)
+{
+    return (isalnum(c) || (c == '+') || (c == '/'));
+}
+
+static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
+{
+    int i = 0;
+    int j = 0;
+    int in_ = 0;
+
+    int in_len = encoded_string.size();
+
+    uint8_t char_array_4[4];
+    uint8_t char_array_3[3];
+
+    std::vector<uint8_t> ret;
+
+    while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
+    {
+        char_array_4[i++] = encoded_string[in_]; in_++;
+        if (i == 4)
+        {
+            for (i = 0; i <4; i++)
+            {
+                char_array_4[i] = base64_chars.find(char_array_4[i]);
+            }
+
+            char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
+            char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+            char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
+
+            for (i = 0; (i < 3); i++)
+            {
+                ret.push_back(char_array_3[i]);
+ } + i = 0; + } + } + + if (i) + { + for (j = i; j <4; j++) + { + char_array_4[j] = 0; + } + + for (j = 0; j <4; j++) + { + char_array_4[j] = base64_chars.find(char_array_4[j]); + } + + char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + + for (j = 0; (j < i - 1); j++) + { + ret.push_back(char_array_3[j]); + } + } + + return ret; +} \ No newline at end of file diff --git a/backend/index.yaml b/backend/index.yaml index 1546c1af3..a1f5688a8 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -29,6 +29,20 @@ nvidia-cuda-12: "cuda12-llama-cpp" nvidia-l4t-cuda-12: "nvidia-l4t-arm64-llama-cpp" nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp" +- &ikllamacpp + name: "ik-llama-cpp" + alias: "ik-llama-cpp" + license: mit + description: | + Fork of llama.cpp optimized for CPU performance by ikawrakow + urls: + - https://github.com/ikawrakow/ik_llama.cpp + tags: + - text-to-text + - LLM + - CPU + capabilities: + default: "cpu-ik-llama-cpp" - &whispercpp name: "whisper" alias: "whisper" @@ -897,6 +911,10 @@ nvidia-cuda-12: "cuda12-llama-cpp-development" nvidia-l4t-cuda-12: "nvidia-l4t-arm64-llama-cpp-development" nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp-development" +- !!merge <<: *ikllamacpp + name: "ik-llama-cpp-development" + capabilities: + default: "cpu-ik-llama-cpp-development" - !!merge <<: *neutts name: "cpu-neutts" uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-neutts" @@ -1327,6 +1345,17 @@ uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-llama-cpp" mirrors: - localai/localai-backends:master-gpu-nvidia-cuda-13-llama-cpp +## ik-llama-cpp +- !!merge <<: *ikllamacpp + name: "cpu-ik-llama-cpp" + uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-ik-llama-cpp" + mirrors: + - localai/localai-backends:latest-cpu-ik-llama-cpp +- !!merge 
<<: *ikllamacpp + name: "cpu-ik-llama-cpp-development" + uri: "quay.io/go-skynet/local-ai-backends:master-cpu-ik-llama-cpp" + mirrors: + - localai/localai-backends:master-cpu-ik-llama-cpp ## whisper - !!merge <<: *whispercpp name: "nvidia-l4t-arm64-whisper" diff --git a/docs/content/features/text-generation.md b/docs/content/features/text-generation.md index aa8608ec3..bfcbf650a 100644 --- a/docs/content/features/text-generation.md +++ b/docs/content/features/text-generation.md @@ -539,6 +539,47 @@ options: - [llama](https://github.com/ggerganov/llama.cpp) +### ik_llama.cpp + +[ik_llama.cpp](https://github.com/ikawrakow/ik_llama.cpp) is a hard fork of `llama.cpp` by Iwan Kawrakow that focuses on superior CPU and hybrid GPU/CPU performance. It ships additional quantization types (IQK quants), custom quantization mixes, Multi-head Latent Attention (MLA) for DeepSeek models, and fine-grained tensor offload controls — particularly useful for running very large models on commodity CPU hardware. + +{{% notice note %}} + +The `ik-llama-cpp` backend requires a CPU with **AVX2** support. The IQK kernels are not compatible with older CPUs. + +{{% /notice %}} + +#### Features + +The `ik-llama-cpp` backend supports the following features: +- [📖 Text generation (GPT)]({{%relref "features/text-generation" %}}) +- [🧠 Embeddings]({{%relref "features/embeddings" %}}) +- IQK quantization types for better CPU inference performance +- Multimodal models (via clip/llava) + +#### Setup + +The backend is distributed as a separate container image and can be installed from the LocalAI backend gallery, or specified directly in a model configuration. GGUF models loaded with this backend benefit from ik_llama.cpp's optimized CPU kernels — especially useful for MoE models and large quantized models that would otherwise be GPU-bound. 
+ +#### YAML configuration + +To use the `ik-llama-cpp` backend, specify it as the backend in the YAML file: + +```yaml +name: my-model +backend: ik-llama-cpp +parameters: + # Relative to the models path + model: file.gguf +``` + +The aliases `ik-llama` and `ik_llama` are also accepted. + +#### Reference + +- [ik_llama.cpp](https://github.com/ikawrakow/ik_llama.cpp) + + ### vLLM [vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference. diff --git a/docs/content/reference/compatibility-table.md b/docs/content/reference/compatibility-table.md index 80cf4e781..5a2ce0cf2 100644 --- a/docs/content/reference/compatibility-table.md +++ b/docs/content/reference/compatibility-table.md @@ -19,6 +19,7 @@ LocalAI will attempt to automatically load models which are not explicitly confi | Backend | Description | Capability | Embeddings | Streaming | Acceleration | |---------|-------------|------------|------------|-----------|-------------| | [llama.cpp](https://github.com/ggerganov/llama.cpp) | LLM inference in C/C++. 
Supports LLaMA, Mamba, RWKV, Falcon, Starcoder, GPT-2, [and many others](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description) | GPT, Functions | yes | yes | CPU, CUDA 12/13, ROCm, Intel SYCL, Vulkan, Metal, Jetson L4T | +| [ik_llama.cpp](https://github.com/ikawrakow/ik_llama.cpp) | Hard fork of llama.cpp optimized for CPU/hybrid CPU+GPU with IQK quants, custom quant mixes, and MLA for DeepSeek | GPT | yes | yes | CPU (AVX2+) | | [vLLM](https://github.com/vllm-project/vllm) | Fast LLM serving with PagedAttention | GPT | no | no | CUDA 12, ROCm, Intel | | [vLLM Omni](https://github.com/vllm-project/vllm) | Unified multimodal generation (text, image, video, audio) | Multimodal GPT | no | no | CUDA 12, ROCm | | [transformers](https://github.com/huggingface/transformers) | HuggingFace Transformers framework | GPT, Embeddings, Multimodal | yes | yes* | CPU, CUDA 12/13, ROCm, Intel, Metal | diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index 2c75a2245..a3de78dd4 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -14,12 +14,15 @@ import ( ) const ( - LLamaCPP = "llama-cpp" + LLamaCPP = "llama-cpp" + IKLLamaCPP = "ik-llama-cpp" ) var Aliases = map[string]string{ "go-llama": LLamaCPP, "llama": LLamaCPP, + "ik_llama": IKLLamaCPP, + "ik-llama": IKLLamaCPP, "embedded-store": LocalStoreBackend, "huggingface-embeddings": TransformersBackend, "transformers-musicgen": TransformersBackend, diff --git a/scripts/changed-backends.js b/scripts/changed-backends.js index da2486300..4ef8b1874 100644 --- a/scripts/changed-backends.js +++ b/scripts/changed-backends.js @@ -24,6 +24,9 @@ function inferBackendPath(item) { if (item.dockerfile.endsWith("rust")) { return `backend/rust/${item.backend}/`; } + if (item.dockerfile.endsWith("ik-llama-cpp")) { + return `backend/cpp/ik-llama-cpp/`; + } if (item.dockerfile.endsWith("llama-cpp")) { return `backend/cpp/llama-cpp/`; } diff --git a/tests/e2e-backends/backend_test.go 
b/tests/e2e-backends/backend_test.go new file mode 100644 index 000000000..a800a7ab5 --- /dev/null +++ b/tests/e2e-backends/backend_test.go @@ -0,0 +1,342 @@ +package e2ebackends_test + +import ( + "context" + "fmt" + "io" + "net" + "os" + "os/exec" + "path/filepath" + "strings" + "time" + + pb "github.com/mudler/LocalAI/pkg/grpc/proto" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/phayes/freeport" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" +) + +// Environment variables consumed by the suite. +// +// Required (one of): +// +// BACKEND_IMAGE Docker image tag to test (e.g. local-ai-backend:llama-cpp). +// +// Required model source (one of): +// +// BACKEND_TEST_MODEL_URL HTTP(S) URL of a model file to download before the test. +// BACKEND_TEST_MODEL_FILE Path to an already-available model file (skips download). +// +// Optional: +// +// BACKEND_TEST_CAPS Comma-separated list of capabilities to exercise. +// Supported values: health, load, predict, stream, embeddings. +// Defaults to "health,load,predict,stream". +// A backend that only does embeddings would set this to +// "health,load,embeddings"; an image/TTS backend that cannot +// be driven by a text prompt can set it to "health,load". +// BACKEND_TEST_PROMPT Override the prompt used by predict/stream specs. +// BACKEND_TEST_CTX_SIZE Override the context size passed to LoadModel (default 512). +// BACKEND_TEST_THREADS Override Threads passed to LoadModel (default 4). +// +// The suite is intentionally model-format-agnostic: it only ever passes the +// file path to LoadModel, so GGUF, ONNX, safetensors, .bin etc. all work so +// long as the backend under test accepts that format. 
+const ( + capHealth = "health" + capLoad = "load" + capPredict = "predict" + capStream = "stream" + capEmbeddings = "embeddings" + + defaultPrompt = "The capital of France is" + streamPrompt = "Once upon a time" +) + +func defaultCaps() map[string]bool { + return map[string]bool{ + capHealth: true, + capLoad: true, + capPredict: true, + capStream: true, + } +} + +// parseCaps reads BACKEND_TEST_CAPS and returns the enabled capability set. +// An empty/unset value falls back to defaultCaps(). +func parseCaps() map[string]bool { + raw := strings.TrimSpace(os.Getenv("BACKEND_TEST_CAPS")) + if raw == "" { + return defaultCaps() + } + caps := map[string]bool{} + for _, part := range strings.Split(raw, ",") { + part = strings.TrimSpace(strings.ToLower(part)) + if part != "" { + caps[part] = true + } + } + return caps +} + +var _ = Describe("Backend container", Ordered, func() { + var ( + caps map[string]bool + workDir string + binaryDir string + modelFile string + addr string + serverCmd *exec.Cmd + conn *grpc.ClientConn + client pb.BackendClient + prompt string + ) + + BeforeAll(func() { + image := os.Getenv("BACKEND_IMAGE") + Expect(image).NotTo(BeEmpty(), "BACKEND_IMAGE env var must be set (e.g. local-ai-backend:llama-cpp)") + + modelURL := os.Getenv("BACKEND_TEST_MODEL_URL") + modelFile = os.Getenv("BACKEND_TEST_MODEL_FILE") + Expect(modelURL != "" || modelFile != "").To(BeTrue(), + "one of BACKEND_TEST_MODEL_URL or BACKEND_TEST_MODEL_FILE must be set") + + caps = parseCaps() + GinkgoWriter.Printf("Testing image=%q with capabilities=%v\n", image, keys(caps)) + + prompt = os.Getenv("BACKEND_TEST_PROMPT") + if prompt == "" { + prompt = defaultPrompt + } + + var err error + workDir, err = os.MkdirTemp("", "backend-e2e-*") + Expect(err).NotTo(HaveOccurred()) + + // Extract the image filesystem so we can run run.sh directly. 
+ binaryDir = filepath.Join(workDir, "rootfs") + Expect(os.MkdirAll(binaryDir, 0o755)).To(Succeed()) + extractImage(image, binaryDir) + Expect(filepath.Join(binaryDir, "run.sh")).To(BeAnExistingFile()) + + // Download the model once if not provided. + if modelFile == "" { + modelFile = filepath.Join(workDir, "model.bin") + downloadFile(modelURL, modelFile) + } + + // Pick a free port and launch the backend. + port, err := freeport.GetFreePort() + Expect(err).NotTo(HaveOccurred()) + addr = fmt.Sprintf("127.0.0.1:%d", port) + + Expect(os.Chmod(filepath.Join(binaryDir, "run.sh"), 0o755)).To(Succeed()) + // Mark any other top-level files executable (extraction may strip perms). + entries, _ := os.ReadDir(binaryDir) + for _, e := range entries { + if !e.IsDir() && !strings.HasSuffix(e.Name(), ".sh") { + _ = os.Chmod(filepath.Join(binaryDir, e.Name()), 0o755) + } + } + + serverCmd = exec.Command(filepath.Join(binaryDir, "run.sh"), "--addr="+addr) + serverCmd.Stdout = GinkgoWriter + serverCmd.Stderr = GinkgoWriter + Expect(serverCmd.Start()).To(Succeed()) + + // Wait for the gRPC port to accept connections. 
+ Eventually(func() error { + c, err := net.DialTimeout("tcp", addr, 500*time.Millisecond) + if err != nil { + return err + } + _ = c.Close() + return nil + }, 30*time.Second, 200*time.Millisecond).Should(Succeed(), "backend did not start") + + conn, err = grpc.Dial(addr, + grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(50*1024*1024)), + ) + Expect(err).NotTo(HaveOccurred()) + client = pb.NewBackendClient(conn) + }) + + AfterAll(func() { + if conn != nil { + _ = conn.Close() + } + if serverCmd != nil && serverCmd.Process != nil { + _ = serverCmd.Process.Kill() + _, _ = serverCmd.Process.Wait() + } + if workDir != "" { + _ = os.RemoveAll(workDir) + } + }) + + It("responds to Health", func() { + if !caps[capHealth] { + Skip("health capability not enabled") + } + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + res, err := client.Health(ctx, &pb.HealthMessage{}) + Expect(err).NotTo(HaveOccurred()) + Expect(res.GetMessage()).NotTo(BeEmpty()) + }) + + It("loads the model", func() { + if !caps[capLoad] { + Skip("load capability not enabled") + } + ctxSize := envInt32("BACKEND_TEST_CTX_SIZE", 512) + threads := envInt32("BACKEND_TEST_THREADS", 4) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + res, err := client.LoadModel(ctx, &pb.ModelOptions{ + Model: modelFile, + ModelFile: modelFile, + ContextSize: ctxSize, + Threads: threads, + NGPULayers: 0, + MMap: true, + NBatch: 128, + }) + Expect(err).NotTo(HaveOccurred()) + Expect(res.GetSuccess()).To(BeTrue(), "LoadModel failed: %s", res.GetMessage()) + }) + + It("generates output via Predict", func() { + if !caps[capPredict] { + Skip("predict capability not enabled") + } + ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second) + defer cancel() + res, err := client.Predict(ctx, &pb.PredictOptions{ + Prompt: prompt, + Tokens: 20, + Temperature: 0.1, + TopK: 
40, + TopP: 0.9, + }) + Expect(err).NotTo(HaveOccurred()) + Expect(res.GetMessage()).NotTo(BeEmpty(), "Predict produced empty output") + GinkgoWriter.Printf("Predict: %q (tokens=%d, prompt_tokens=%d)\n", + res.GetMessage(), res.GetTokens(), res.GetPromptTokens()) + }) + + It("streams output via PredictStream", func() { + if !caps[capStream] { + Skip("stream capability not enabled") + } + ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second) + defer cancel() + stream, err := client.PredictStream(ctx, &pb.PredictOptions{ + Prompt: streamPrompt, + Tokens: 20, + Temperature: 0.1, + TopK: 40, + TopP: 0.9, + }) + Expect(err).NotTo(HaveOccurred()) + + var chunks int + var combined string + for { + msg, err := stream.Recv() + if err == io.EOF { + break + } + Expect(err).NotTo(HaveOccurred()) + if len(msg.GetMessage()) > 0 { + chunks++ + combined += string(msg.GetMessage()) + } + } + Expect(chunks).To(BeNumerically(">", 0), "no stream chunks received") + GinkgoWriter.Printf("Stream: %d chunks, combined=%q\n", chunks, combined) + }) + + It("computes embeddings via Embedding", func() { + if !caps[capEmbeddings] { + Skip("embeddings capability not enabled") + } + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + res, err := client.Embedding(ctx, &pb.PredictOptions{ + Embeddings: prompt, + }) + Expect(err).NotTo(HaveOccurred()) + Expect(res.GetEmbeddings()).NotTo(BeEmpty(), "Embedding returned empty vector") + GinkgoWriter.Printf("Embedding: %d dims\n", len(res.GetEmbeddings())) + }) +}) + +// extractImage runs `docker create` + `docker export` to materialise the image +// rootfs into dest. Using export (not save) avoids dealing with layer tarballs. +func extractImage(image, dest string) { + GinkgoHelper() + // The backend images have no default ENTRYPOINT/CMD, so docker create fails + // unless we override one; run.sh is harmless and guaranteed to exist. 
+ create := exec.Command("docker", "create", "--entrypoint=/run.sh", image) + out, err := create.CombinedOutput() + Expect(err).NotTo(HaveOccurred(), "docker create failed: %s", string(out)) + cid := strings.TrimSpace(string(out)) + DeferCleanup(func() { + _ = exec.Command("docker", "rm", "-f", cid).Run() + }) + + // Pipe `docker export ` into `tar -xf - -C dest`. + exp := exec.Command("docker", "export", cid) + expOut, err := exp.StdoutPipe() + Expect(err).NotTo(HaveOccurred()) + exp.Stderr = GinkgoWriter + Expect(exp.Start()).To(Succeed()) + + tar := exec.Command("tar", "-xf", "-", "-C", dest) + tar.Stdin = expOut + tar.Stderr = GinkgoWriter + Expect(tar.Run()).To(Succeed()) + Expect(exp.Wait()).To(Succeed()) +} + +// downloadFile fetches url into dest using curl -L. Used for CI convenience; +// local runs can use BACKEND_TEST_MODEL_FILE to skip downloading. +func downloadFile(url, dest string) { + GinkgoHelper() + cmd := exec.Command("curl", "-sSfL", "-o", dest, url) + cmd.Stdout = GinkgoWriter + cmd.Stderr = GinkgoWriter + Expect(cmd.Run()).To(Succeed(), "failed to download %s", url) + fi, err := os.Stat(dest) + Expect(err).NotTo(HaveOccurred()) + Expect(fi.Size()).To(BeNumerically(">", 1024), "downloaded file is suspiciously small") +} + +func envInt32(name string, def int32) int32 { + raw := os.Getenv(name) + if raw == "" { + return def + } + var v int32 + _, err := fmt.Sscanf(raw, "%d", &v) + if err != nil { + return def + } + return v +} + +func keys(m map[string]bool) []string { + out := make([]string, 0, len(m)) + for k, v := range m { + if v { + out = append(out, k) + } + } + return out +} diff --git a/tests/e2e-backends/suite_test.go b/tests/e2e-backends/suite_test.go new file mode 100644 index 000000000..4ce1864d4 --- /dev/null +++ b/tests/e2e-backends/suite_test.go @@ -0,0 +1,24 @@ +// Package e2ebackends exercises a built backend container image end-to-end over +// its gRPC surface. 
+// +// The suite is intentionally backend-agnostic: it extracts a Docker image, +// launches the bundled run.sh entrypoint, then drives a configurable set of +// gRPC calls against the result. Specs are gated by capability flags so that a +// non-LLM backend (e.g. image generation, TTS, embeddings-only) can opt in to +// only the RPCs it implements. +// +// Configuration is entirely through environment variables — see backend_test.go +// for the full list. +package e2ebackends_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestBackendE2E(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Backend gRPC End-to-End Suite") +}