From 9ca03cf9ccd6ae86795a216c4f377da2290204ca Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 12 Apr 2026 13:51:28 +0200 Subject: [PATCH] feat(backends): add ik-llama-cpp (#9326) * feat(backends): add ik-llama-cpp Signed-off-by: Ettore Di Giacinto * chore: add grpc e2e suite, hook to CI, update README Signed-off-by: Ettore Di Giacinto * Apply suggestion from @mudler Signed-off-by: Ettore Di Giacinto * Apply suggestion from @mudler Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Ettore Di Giacinto Signed-off-by: Ettore Di Giacinto --- .github/workflows/backend.yml | 13 + .github/workflows/bump_deps.yaml | 4 + .github/workflows/test-extra.yml | 36 + Makefile | 46 +- backend/Dockerfile.ik-llama-cpp | 281 ++ backend/cpp/ik-llama-cpp/CMakeLists.txt | 78 + backend/cpp/ik-llama-cpp/Makefile | 167 ++ backend/cpp/ik-llama-cpp/grpc-server.cpp | 2652 +++++++++++++++++ backend/cpp/ik-llama-cpp/package.sh | 58 + .../0001-fix-missing-cstdint-include.patch | 10 + backend/cpp/ik-llama-cpp/prepare.sh | 49 + backend/cpp/ik-llama-cpp/run.sh | 40 + backend/cpp/ik-llama-cpp/utils.hpp | 483 +++ backend/index.yaml | 29 + docs/content/features/text-generation.md | 41 + docs/content/reference/compatibility-table.md | 1 + pkg/model/initializers.go | 5 +- scripts/changed-backends.js | 3 + tests/e2e-backends/backend_test.go | 342 +++ tests/e2e-backends/suite_test.go | 24 + 20 files changed, 4360 insertions(+), 2 deletions(-) create mode 100644 backend/Dockerfile.ik-llama-cpp create mode 100644 backend/cpp/ik-llama-cpp/CMakeLists.txt create mode 100644 backend/cpp/ik-llama-cpp/Makefile create mode 100644 backend/cpp/ik-llama-cpp/grpc-server.cpp create mode 100644 backend/cpp/ik-llama-cpp/package.sh create mode 100644 backend/cpp/ik-llama-cpp/patches/0001-fix-missing-cstdint-include.patch create mode 100644 backend/cpp/ik-llama-cpp/prepare.sh create mode 100644 backend/cpp/ik-llama-cpp/run.sh create mode 100644 backend/cpp/ik-llama-cpp/utils.hpp create mode 100644 
tests/e2e-backends/backend_test.go create mode 100644 tests/e2e-backends/suite_test.go diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index e88495d0b..d89ee06bf 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -1945,6 +1945,19 @@ jobs: dockerfile: "./backend/Dockerfile.llama-cpp" context: "./" ubuntu-version: '2404' + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-cpu-ik-llama-cpp' + runs-on: 'bigger-runner' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "ik-llama-cpp" + dockerfile: "./backend/Dockerfile.ik-llama-cpp" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "12" cuda-minor-version: "0" diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml index 211d3e4ab..0e3dd8d96 100644 --- a/.github/workflows/bump_deps.yaml +++ b/.github/workflows/bump_deps.yaml @@ -14,6 +14,10 @@ jobs: variable: "LLAMA_VERSION" branch: "master" file: "backend/cpp/llama-cpp/Makefile" + - repository: "ikawrakow/ik_llama.cpp" + variable: "IK_LLAMA_VERSION" + branch: "main" + file: "backend/cpp/ik-llama-cpp/Makefile" - repository: "ggml-org/whisper.cpp" variable: "WHISPER_CPP_VERSION" branch: "master" diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml index 0992dfdd9..6b590d156 100644 --- a/.github/workflows/test-extra.yml +++ b/.github/workflows/test-extra.yml @@ -29,6 +29,8 @@ jobs: nemo: ${{ steps.detect.outputs.nemo }} voxcpm: ${{ steps.detect.outputs.voxcpm }} llama-cpp-quantization: ${{ steps.detect.outputs.llama-cpp-quantization }} + llama-cpp: ${{ steps.detect.outputs.llama-cpp }} + ik-llama-cpp: ${{ steps.detect.outputs.ik-llama-cpp }} acestep-cpp: ${{ steps.detect.outputs.acestep-cpp }} qwen3-tts-cpp: ${{ steps.detect.outputs.qwen3-tts-cpp }} voxtral: ${{ steps.detect.outputs.voxtral }} @@ -465,6 +467,40 @@ jobs: - name: Test 
llama-cpp-quantization run: | make --jobs=5 --output-sync=target -C backend/python/llama-cpp-quantization test + tests-llama-cpp-grpc: + needs: detect-changes + if: needs.detect-changes.outputs.llama-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true' + runs-on: ubuntu-latest + timeout-minutes: 90 + steps: + - name: Clone + uses: actions/checkout@v6 + with: + submodules: true + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version: '1.25.4' + - name: Build llama-cpp backend image and run gRPC e2e tests + run: | + make test-extra-backend-llama-cpp + tests-ik-llama-cpp-grpc: + needs: detect-changes + if: needs.detect-changes.outputs.ik-llama-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true' + runs-on: ubuntu-latest + timeout-minutes: 90 + steps: + - name: Clone + uses: actions/checkout@v6 + with: + submodules: true + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version: '1.25.4' + - name: Build ik-llama-cpp backend image and run gRPC e2e tests + run: | + make test-extra-backend-ik-llama-cpp tests-acestep-cpp: needs: detect-changes if: needs.detect-changes.outputs.acestep-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true' diff --git a/Makefile b/Makefile index 53457b531..6dce83efd 100644 --- a/Makefile +++ b/Makefile @@ -456,6 +456,47 @@ test-extra: prepare-test-extra $(MAKE) -C backend/python/trl test $(MAKE) -C backend/rust/kokoros test +## +## End-to-end gRPC tests that exercise a built backend container image. +## +## The test suite in tests/e2e-backends is backend-agnostic. You drive it via env +## vars (see tests/e2e-backends/backend_test.go for the full list) and the +## capability-driven harness picks which gRPC RPCs to exercise: +## +## BACKEND_IMAGE Required. Docker image to test, e.g. local-ai-backend:llama-cpp. +## BACKEND_TEST_MODEL_URL URL of a model file to download and load. +## BACKEND_TEST_MODEL_FILE Path to an already-downloaded model (skips download). 
+## BACKEND_TEST_CAPS Comma-separated capabilities, default "health,load,predict,stream". +## BACKEND_TEST_PROMPT Override the prompt used in predict/stream specs. +## +## Direct usage (image already built, no docker-build-* dependency): +## +## make test-extra-backend BACKEND_IMAGE=local-ai-backend:llama-cpp \ +## BACKEND_TEST_MODEL_URL=https://.../model.gguf +## +## Convenience wrappers below build a specific backend image first, then run the +## suite against it. +## +BACKEND_TEST_MODEL_URL?=https://huggingface.co/Qwen/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q8_0.gguf + +## Generic target — runs the suite against whatever BACKEND_IMAGE points at. +## Depends on protogen-go so pkg/grpc/proto is generated before `go test`. +test-extra-backend: protogen-go + @test -n "$$BACKEND_IMAGE" || { echo "BACKEND_IMAGE must be set" >&2; exit 1; } + BACKEND_IMAGE="$$BACKEND_IMAGE" \ + BACKEND_TEST_MODEL_URL="$${BACKEND_TEST_MODEL_URL:-$(BACKEND_TEST_MODEL_URL)}" \ + BACKEND_TEST_MODEL_FILE="$$BACKEND_TEST_MODEL_FILE" \ + BACKEND_TEST_CAPS="$$BACKEND_TEST_CAPS" \ + BACKEND_TEST_PROMPT="$$BACKEND_TEST_PROMPT" \ + go test -v -timeout 15m ./tests/e2e-backends/... + +## Convenience wrappers: build the image, then exercise it. 
+test-extra-backend-llama-cpp: docker-build-llama-cpp + BACKEND_IMAGE=local-ai-backend:llama-cpp $(MAKE) test-extra-backend + +test-extra-backend-ik-llama-cpp: docker-build-ik-llama-cpp + BACKEND_IMAGE=local-ai-backend:ik-llama-cpp $(MAKE) test-extra-backend + DOCKER_IMAGE?=local-ai IMAGE_TYPE?=core BASE_IMAGE?=ubuntu:24.04 @@ -549,6 +590,8 @@ backend-images: # Backend metadata: BACKEND_NAME | DOCKERFILE_TYPE | BUILD_CONTEXT | PROGRESS_FLAG | NEEDS_BACKEND_ARG # llama-cpp is special - uses llama-cpp Dockerfile and doesn't need BACKEND arg BACKEND_LLAMA_CPP = llama-cpp|llama-cpp|.|false|false +# ik-llama-cpp is a fork of llama.cpp with superior CPU performance +BACKEND_IK_LLAMA_CPP = ik-llama-cpp|ik-llama-cpp|.|false|false # Golang backends BACKEND_PIPER = piper|golang|.|false|true @@ -619,6 +662,7 @@ endef # Generate all docker-build targets $(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP))) +$(eval $(call generate-docker-build-target,$(BACKEND_IK_LLAMA_CPP))) $(eval $(call generate-docker-build-target,$(BACKEND_PIPER))) $(eval $(call generate-docker-build-target,$(BACKEND_LOCAL_STORE))) $(eval $(call generate-docker-build-target,$(BACKEND_HUGGINGFACE))) @@ -663,7 +707,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_SAM3_CPP))) docker-save-%: backend-images docker save local-ai-backend:$* -o backend-images/$*.tar -docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl 
docker-build-llama-cpp-quantization docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp +docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp ######################################################## ### Mock Backend for E2E Tests diff --git a/backend/Dockerfile.ik-llama-cpp b/backend/Dockerfile.ik-llama-cpp new file mode 100644 index 000000000..62ae52841 --- /dev/null +++ b/backend/Dockerfile.ik-llama-cpp @@ -0,0 +1,281 @@ +ARG BASE_IMAGE=ubuntu:24.04 +ARG GRPC_BASE_IMAGE=${BASE_IMAGE} + + +# The grpc target does one thing, it builds and installs GRPC. This is in it's own layer so that it can be effectively cached by CI. +# You probably don't need to change anything here, and if you do, make sure that CI is adjusted so that the cache continues to work. 
+FROM ${GRPC_BASE_IMAGE} AS grpc + +# This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI +ARG GRPC_MAKEFLAGS="-j4 -Otarget" +ARG GRPC_VERSION=v1.65.0 +ARG CMAKE_FROM_SOURCE=false +# CUDA Toolkit 13.x compatibility: CMake 3.31.9+ fixes toolchain detection/arch table issues +ARG CMAKE_VERSION=3.31.10 + +ENV MAKEFLAGS=${GRPC_MAKEFLAGS} + +WORKDIR /build + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + build-essential curl libssl-dev \ + git wget && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Install CMake (the version in 22.04 is too old) +RUN </dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1) +ARCH?=$(shell uname -m) + +# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static +CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF + +CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +ifeq ($(NATIVE),false) + CMAKE_ARGS+=-DGGML_NATIVE=OFF -DLLAMA_OPENSSL=OFF +endif +# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically +ifeq ($(BUILD_TYPE),cublas) + CMAKE_ARGS+=-DGGML_CUDA=ON +# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS +# to CMAKE_ARGS automatically +else ifeq ($(BUILD_TYPE),openblas) + CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS +# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path +else ifeq ($(BUILD_TYPE),clblas) + CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path +# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ +else ifeq ($(BUILD_TYPE),hipblas) + ROCM_HOME ?= /opt/rocm + ROCM_PATH ?= /opt/rocm + export CXX=$(ROCM_HOME)/llvm/bin/clang++ + export CC=$(ROCM_HOME)/llvm/bin/clang + AMDGPU_TARGETS?=gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201 + CMAKE_ARGS+=-DGGML_HIP=ON 
-DAMDGPU_TARGETS=$(AMDGPU_TARGETS) +else ifeq ($(BUILD_TYPE),vulkan) + CMAKE_ARGS+=-DGGML_VULKAN=1 +else ifeq ($(OS),Darwin) + ifeq ($(BUILD_TYPE),) + BUILD_TYPE=metal + endif + ifneq ($(BUILD_TYPE),metal) + CMAKE_ARGS+=-DGGML_METAL=OFF + else + CMAKE_ARGS+=-DGGML_METAL=ON + CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON + CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON + CMAKE_ARGS+=-DGGML_OPENMP=OFF + endif + TARGET+=--target ggml-metal +endif + +ifeq ($(BUILD_TYPE),sycl_f16) + CMAKE_ARGS+=-DGGML_SYCL=ON \ + -DCMAKE_C_COMPILER=icx \ + -DCMAKE_CXX_COMPILER=icpx \ + -DCMAKE_CXX_FLAGS="-fsycl" \ + -DGGML_SYCL_F16=ON +endif + +ifeq ($(BUILD_TYPE),sycl_f32) + CMAKE_ARGS+=-DGGML_SYCL=ON \ + -DCMAKE_C_COMPILER=icx \ + -DCMAKE_CXX_COMPILER=icpx \ + -DCMAKE_CXX_FLAGS="-fsycl" +endif + +INSTALLED_PACKAGES=$(CURDIR)/../grpc/installed_packages +INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake +ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \ + -DProtobuf_DIR=${INSTALLED_LIB_CMAKE}/protobuf \ + -Dutf8_range_DIR=${INSTALLED_LIB_CMAKE}/utf8_range \ + -DgRPC_DIR=${INSTALLED_LIB_CMAKE}/grpc \ + -DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=${INSTALLED_PACKAGES}/include +build-ik-llama-cpp-grpc-server: +# Conditionally build grpc for the backend to use if needed +ifdef BUILD_GRPC_FOR_BACKEND_LLAMA + $(MAKE) -C ../../grpc build + _PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/proto \ + _GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin \ + PATH="${INSTALLED_PACKAGES}/bin:${PATH}" \ + CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" \ + IK_LLAMA_VERSION=$(IK_LLAMA_VERSION) \ + $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(VARIANT) grpc-server +else + echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined." 
+ IK_LLAMA_VERSION=$(IK_LLAMA_VERSION) $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(VARIANT) grpc-server +endif + +ik-llama-cpp-avx2: llama.cpp + cp -rf $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx2-build + $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx2-build purge + $(info ${GREEN}I ik-llama-cpp build info:avx2${RESET}) + CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="ik-llama-cpp-avx2-build" build-ik-llama-cpp-grpc-server + cp -rfv $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx2-build/grpc-server ik-llama-cpp-avx2 + +ik-llama-cpp-avx512: llama.cpp + cp -rf $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx512-build + $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx512-build purge + $(info ${GREEN}I ik-llama-cpp build info:avx512${RESET}) + CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="ik-llama-cpp-avx512-build" build-ik-llama-cpp-grpc-server + cp -rfv $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx512-build/grpc-server ik-llama-cpp-avx512 + +ik-llama-cpp-avx: llama.cpp + cp -rf $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx-build + $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx-build purge + $(info ${GREEN}I ik-llama-cpp build info:avx${RESET}) + CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="ik-llama-cpp-avx-build" build-ik-llama-cpp-grpc-server + cp -rfv $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx-build/grpc-server ik-llama-cpp-avx + +ik-llama-cpp-fallback: llama.cpp + cp -rf $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-fallback-build + $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-fallback-build purge + $(info ${GREEN}I ik-llama-cpp build info:fallback${RESET}) + 
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="ik-llama-cpp-fallback-build" build-ik-llama-cpp-grpc-server + cp -rfv $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-fallback-build/grpc-server ik-llama-cpp-fallback + +ik-llama-cpp-grpc: llama.cpp + cp -rf $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-grpc-build + $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-grpc-build purge + $(info ${GREEN}I ik-llama-cpp build info:grpc${RESET}) + CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="ik-llama-cpp-grpc-build" build-ik-llama-cpp-grpc-server + cp -rfv $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-grpc-build/grpc-server ik-llama-cpp-grpc + +ik-llama-cpp-rpc-server: ik-llama-cpp-grpc + cp -rf $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-grpc-build/llama.cpp/build/bin/rpc-server ik-llama-cpp-rpc-server + +llama.cpp: + mkdir -p llama.cpp + cd llama.cpp && \ + git init && \ + git remote add origin $(LLAMA_REPO) && \ + git fetch origin && \ + git checkout -b build $(IK_LLAMA_VERSION) && \ + git submodule update --init --recursive --depth 1 --single-branch + +llama.cpp/examples/grpc-server: llama.cpp + mkdir -p llama.cpp/examples/grpc-server + bash prepare.sh + +rebuild: + bash prepare.sh + rm -rf grpc-server + $(MAKE) grpc-server + +package: + bash package.sh + +purge: + rm -rf llama.cpp/build + rm -rf llama.cpp/examples/grpc-server + rm -rf grpc-server + +clean: purge + rm -rf llama.cpp + +grpc-server: llama.cpp llama.cpp/examples/grpc-server + @echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)" +ifneq (,$(findstring sycl,$(BUILD_TYPE))) + +bash -c "source $(ONEAPI_VARS); \ + cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . 
--config Release -j $(JOBS) $(TARGET)" +else + +cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release -j $(JOBS) $(TARGET) +endif + cp llama.cpp/build/bin/grpc-server . diff --git a/backend/cpp/ik-llama-cpp/grpc-server.cpp b/backend/cpp/ik-llama-cpp/grpc-server.cpp new file mode 100644 index 000000000..3e88022dc --- /dev/null +++ b/backend/cpp/ik-llama-cpp/grpc-server.cpp @@ -0,0 +1,2652 @@ +// ik_llama.cpp gRPC C++ backend server +// +// Ettore Di Giacinto and llama.cpp authors +// +// This is a gRPC server for ik_llama.cpp compatible with the LocalAI proto +// Note: this is a re-adaptation of the original llama.cpp example/server.cpp for HTTP (https://github.com/ggerganov/llama.cpp/tree/master/examples/server), +// but modified to work with gRPC +// + +#include +#include +#include +#include +#include "clip.h" +#include "llava.h" +#include "log.h" +#include "common.h" +#include "json.hpp" +#include "llama.h" +#include "backend.pb.h" +#include "backend.grpc.pb.h" +#include "utils.hpp" +#include "sampling.h" +// include std::regex +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using grpc::Server; +using grpc::ServerBuilder; +using grpc::ServerContext; +using grpc::Status; + + +using backend::HealthMessage; + + +///// LLAMA.CPP server code below + +using json = nlohmann::json; + +struct server_params +{ + std::string hostname = "127.0.0.1"; + std::vector api_keys; + std::string public_path = "examples/server/public"; + std::string chat_template = ""; + int32_t port = 8080; + int32_t read_timeout = 600; + int32_t write_timeout = 600; + bool slots_endpoint = true; + bool metrics_endpoint = false; +}; + +bool server_verbose = false; +bool server_log_json = true; + +static size_t common_part(const std::vector &a, const std::vector &b) +{ + size_t i; + for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) + { + } + return i; +} + +enum stop_type +{ 
+ STOP_FULL, + STOP_PARTIAL, +}; + +static bool ends_with(const std::string &str, const std::string &suffix) +{ + return str.size() >= suffix.size() && + 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); +} + +static size_t find_partial_stop_string(const std::string &stop, + const std::string &text) +{ + if (!text.empty() && !stop.empty()) + { + const char text_last_char = text.back(); + for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) + { + if (stop[char_index] == text_last_char) + { + const std::string current_partial = stop.substr(0, char_index + 1); + if (ends_with(text, current_partial)) + { + return text.size() - char_index - 1; + } + } + } + } + return std::string::npos; +} + +// TODO: reuse llama_detokenize +template +static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) +{ + std::string ret; + for (; begin != end; ++begin) + { + ret += common_token_to_piece(ctx, *begin); + } + return ret; +} + +// format incomplete utf-8 multibyte character for output +static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) +{ + std::string out = token == -1 ? "" : common_token_to_piece(ctx, token); + // if the size is 1 and first bit is 1, meaning it's a partial character + // (size > 1 meaning it's already a known token) + if (out.size() == 1 && (out[0] & 0x80) == 0x80) + { + std::stringstream ss; + ss << std::hex << (out[0] & 0xff); + std::string res(ss.str()); + out = "byte: \\x" + res; + } + return out; +} + +// Adds an RPC server +// NOTE: RPC device API is not available in ik_llama.cpp -- this function is a no-op stub. 
+static void add_rpc_devices(std::string servers) { + LOG_WARNING("RPC devices are not supported in ik_llama.cpp, ignoring LLAMACPP_GRPC_SERVERS", {}); +} + +// convert a vector of completion_token_output to json +static json probs_vector_to_json(const llama_context *ctx, const std::vector &probs) +{ + json out = json::array(); + for (const auto &prob : probs) + { + json probs_for_token = json::array(); + for (const auto &p : prob.probs) + { + std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); + probs_for_token.push_back(json + { + {"tok_str", tok_str}, + {"prob", p.prob}, + }); + } + std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); + out.push_back(json{ + {"content", tok_str}, + {"probs", probs_for_token}, + }); + } + return out; +} + +struct llama_client_slot +{ + int id; + int task_id = -1; + + struct slot_params params; + + slot_state state = IDLE; + slot_command command = NONE; + + // used to determine the slot that has been used the longest + int64_t t_last_used = -1; + + // generation props + int32_t n_ctx = 0; // context size per slot + int32_t n_past = 0; + int32_t n_decoded = 0; + int32_t n_remaining = -1; + int32_t i_batch = -1; + int32_t n_predict = -1; + + int32_t num_prompt_tokens = 0; + int32_t num_prompt_tokens_processed = 0; + + json prompt; + std::string generated_text; + llama_token sampled; + std::vector cache_tokens; + std::vector generated_token_probs; + + bool infill = false; + bool embedding = false; + bool has_next_token = true; + bool truncated = false; + bool stopped_eos = false; + bool stopped_word = false; + bool stopped_limit = false; + + bool oaicompat = false; + std::string oaicompat_model; + + std::string stopping_word; + + // sampling + struct common_params_sampling sparams; + common_sampler *ctx_sampling = nullptr; + + int32_t ga_i = 0; // group-attention state + int32_t ga_n = 1; // group-attention factor + int32_t ga_w = 512; // group-attention width + + int32_t n_past_se = 0; // 
self-extend + + // multimodal + std::vector images; + + // stats + size_t sent_count = 0; + size_t sent_token_probs_index = 0; + + int64_t t_start_process_prompt; + int64_t t_start_genereration; + + double t_prompt_processing; // ms + double t_token_generation; // ms + + // multitasks + int multitask_id = -1; + + void reset() { + num_prompt_tokens = 0; + generated_text = ""; + truncated = false; + stopped_eos = false; + stopped_word = false; + stopped_limit = false; + stopping_word = ""; + n_past = 0; + sent_count = 0; + sent_token_probs_index = 0; + infill = false; + ga_i = 0; + n_past_se = 0; + + generated_token_probs.clear(); + + for (slot_image & img : images) + { + free(img.image_embedding); + if (img.img_data) { + clip_image_u8_free(img.img_data); + } + img.prefix_prompt = ""; + } + + images.clear(); + } + + bool has_budget(gpt_params &global_params) { + if (params.n_predict == -1 && global_params.n_predict == -1) + { + return true; // limitless + } + + n_remaining = -1; + + if (params.n_predict != -1) + { + n_remaining = params.n_predict - n_decoded; + } + else if (global_params.n_predict != -1) + { + n_remaining = global_params.n_predict - n_decoded; + } + + return n_remaining > 0; // no budget + } + + bool available() const { + return state == IDLE && command == NONE; + } + + bool is_processing() const { + return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING; + } + + void add_token_string(const completion_token_output &token) { + if (command == RELEASE) + { + return; + } + cache_tokens.push_back(token.tok); + generated_token_probs.push_back(token); + } + + void release() { + if (state == PROCESSING) + { + t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3; + command = RELEASE; + } + } + + json get_formated_timings() { + return json + { + {"prompt_n", num_prompt_tokens_processed}, + {"prompt_ms", t_prompt_processing}, + {"prompt_per_token_ms", t_prompt_processing / num_prompt_tokens_processed}, + {"prompt_per_second", 1e3 
/ t_prompt_processing * num_prompt_tokens_processed}, + + {"predicted_n", n_decoded}, + {"predicted_ms", t_token_generation}, + {"predicted_per_token_ms", t_token_generation / n_decoded}, + {"predicted_per_second", 1e3 / t_token_generation * n_decoded}, + }; + } + + void print_timings() const { + char buffer[512]; + double t_token = t_prompt_processing / num_prompt_tokens_processed; + double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed; + sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)", + t_prompt_processing, num_prompt_tokens_processed, + t_token, n_tokens_second); + LOG_INFO(buffer, { + {"slot_id", id}, + {"task_id", task_id}, + {"t_prompt_processing", t_prompt_processing}, + {"num_prompt_tokens_processed", num_prompt_tokens_processed}, + {"t_token", t_token}, + {"n_tokens_second", n_tokens_second}, + }); + + t_token = t_token_generation / n_decoded; + n_tokens_second = 1e3 / t_token_generation * n_decoded; + sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)", + t_token_generation, n_decoded, + t_token, n_tokens_second); + LOG_INFO(buffer, { + {"slot_id", id}, + {"task_id", task_id}, + {"t_token_generation", t_token_generation}, + {"n_decoded", n_decoded}, + {"t_token", t_token}, + {"n_tokens_second", n_tokens_second}, + }); + + sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation); + LOG_INFO(buffer, { + {"slot_id", id}, + {"task_id", task_id}, + {"t_prompt_processing", t_prompt_processing}, + {"t_token_generation", t_token_generation}, + {"t_total", t_prompt_processing + t_token_generation}, + }); + } +}; + +struct llama_metrics { + uint64_t n_prompt_tokens_processed_total = 0; + uint64_t n_tokens_predicted_total = 0; + + uint64_t n_prompt_tokens_processed = 0; + uint64_t t_prompt_processing = 0; + + uint64_t n_tokens_predicted = 0; + uint64_t t_tokens_generation = 0; + + + void 
on_prompt_eval(const llama_client_slot &slot) { + n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed; + + n_prompt_tokens_processed += slot.num_prompt_tokens_processed; + t_prompt_processing += slot.t_prompt_processing; + } + + void on_prediction(const llama_client_slot &slot) { + n_tokens_predicted_total += slot.n_decoded; + + n_tokens_predicted += slot.n_decoded; + t_tokens_generation += slot.t_token_generation; + } + + void reset_bucket() { + n_prompt_tokens_processed = 0; + t_prompt_processing = 0; + n_tokens_predicted = 0; + t_tokens_generation = 0; + } +}; + +struct llava_embd_batch { + std::vector pos; + std::vector n_seq_id; + std::vector seq_id_0; + std::vector seq_ids; + std::vector logits; + llama_batch batch; + llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { + pos .resize(n_tokens); + n_seq_id.resize(n_tokens); + seq_ids .resize(n_tokens + 1); + logits .resize(n_tokens); + seq_id_0.resize(1); + seq_id_0[0] = seq_id; + seq_ids [n_tokens] = nullptr; + batch = { + /*n_tokens =*/ n_tokens, + /*tokens =*/ nullptr, + /*embd =*/ embd, + /*pos =*/ pos.data(), + /*n_seq_id =*/ n_seq_id.data(), + /*seq_id =*/ seq_ids.data(), + /*logits =*/ logits.data(), + }; + for (int i = 0; i < n_tokens; i++) { + batch.pos [i] = pos_0 + i; + batch.n_seq_id[i] = 1; + batch.seq_id [i] = seq_id_0.data(); + batch.logits [i] = false; + } + } +}; + +struct llama_server_context +{ + llama_model *model = nullptr; + llama_context *ctx = nullptr; + const llama_vocab * vocab = nullptr; + + clip_ctx *clp_ctx = nullptr; + + gpt_params params; + + llama_batch batch; + + bool multimodal = false; + bool clean_kv_cache = true; + bool all_slots_are_idle = false; + bool add_bos_token = true; + bool has_eos_token = true; + bool has_gpu = false; + + bool grammar_lazy = false; + std::vector grammar_triggers; + + int32_t n_ctx; // total context for all clients / slots + + // system prompt + bool system_need_update = false; + + std::string 
system_prompt; + std::vector system_tokens; + + std::string name_user; // this should be the antiprompt + std::string name_assistant; + + // slots / clients + std::vector slots; + json default_generation_settings_for_props; + + llama_server_queue queue_tasks; + llama_server_response queue_results; + + llama_metrics metrics; + + ~llama_server_context() + { + if (ctx) + { + llama_free(ctx); + ctx = nullptr; + } + if (model) + { + llama_free_model(model); + model = nullptr; + } + } + + bool load_model(const gpt_params ¶ms_) + { + params = params_; + if (!params.mmproj.path.empty()) { + multimodal = true; + LOG_INFO("Multi Modal Mode Enabled", {}); + clp_ctx = clip_model_load(params.mmproj.path.c_str(), /*verbosity=*/ 1); + if(clp_ctx == nullptr) { + LOG_ERR("unable to load clip model: %s", params.mmproj.path.c_str()); + return false; + } + + if (params.n_ctx < 2048) { // request larger context for the image embedding + params.n_ctx = 2048; + } + } + + llama_init_result init_result = llama_init_from_gpt_params(params); + model = init_result.model; + ctx = init_result.context; + if (model == nullptr) + { + LOG_ERR("unable to load model: %s", params.model.c_str()); + return false; + } + + if (multimodal) { + const int n_embd_clip = clip_n_mmproj_embd(clp_ctx); + const int n_embd_llm = llama_model_n_embd(model); + if (n_embd_clip != n_embd_llm) { + LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). 
Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm); + llama_free(ctx); + llama_free_model(model); + return false; + } + } + + vocab = llama_model_get_vocab(model); + n_ctx = llama_n_ctx(ctx); + + add_bos_token = llama_vocab_get_add_bos(vocab); + has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL; + + return true; + } + + llama_client_slot* get_active_slot() { + for (llama_client_slot& slot : slots) { + // Check if the slot is currently processing + if (slot.is_processing()) { + return &slot; // Return the active slot + } + } + return nullptr; // No active slot found + } + + void initialize() { + // create slots + all_slots_are_idle = true; + + const int32_t n_ctx_slot = n_ctx / params.n_parallel; + + LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}}); + for (int i = 0; i < params.n_parallel; i++) + { + llama_client_slot slot; + + slot.id = i; + slot.n_ctx = n_ctx_slot; + slot.n_predict = params.n_predict; + + LOG_INFO("new slot", { + {"slot_id", slot.id}, + {"n_ctx_slot", slot.n_ctx} + }); + + const int ga_n = params.grp_attn_n; + const int ga_w = params.grp_attn_w; + + if (ga_n != 1) { + GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT + GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT + //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT + //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT + + LOG_INFO("slot self-extend", { + {"slot_id", slot.id}, + {"ga_n", ga_n}, + {"ga_w", ga_w} + }); + } + + slot.ga_i = 0; + slot.ga_n = ga_n; + slot.ga_w = ga_w; + + slot.reset(); + + slots.push_back(slot); + } + + default_generation_settings_for_props = get_formated_generation(slots.front()); + default_generation_settings_for_props["seed"] = -1; + + batch = llama_batch_init(n_ctx, 0, params.n_parallel); + } + + std::vector tokenize(const json & json_prompt, bool add_bos) const + { + // 
TODO: currently, we tokenize using special tokens by default + // this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216) + // but it's better compared to completely ignoring ChatML and other chat templates + const bool TMP_FORCE_SPECIAL = true; + + // If `add_bos` is true, we only add BOS, when json_prompt is a string, + // or the first element of the json_prompt array is a string. + std::vector prompt_tokens; + + if (json_prompt.is_array()) + { + bool first = true; + for (const auto& p : json_prompt) + { + if (p.is_string()) + { + auto s = p.template get(); + std::vector p; + if (first) + { + p = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL); + first = false; + } + else + { + p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL); + } + prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); + } + else + { + if (first) + { + first = false; + } + prompt_tokens.push_back(p.template get()); + } + } + } + else + { + auto s = json_prompt.template get(); + prompt_tokens = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL); + } + + return prompt_tokens; + } + + llama_client_slot* get_slot(int id) { + int64_t t_last = ggml_time_us(); + llama_client_slot *last_used = nullptr; + + for (llama_client_slot & slot : slots) + { + if (slot.id == id && slot.available()) + { + return &slot; + } + + if (slot.available() && slot.t_last_used < t_last) + { + last_used = &slot; + t_last = slot.t_last_used; + } + } + + return last_used; + } + + bool launch_slot_with_data(llama_client_slot* &slot, json data) { + slot_params default_params; + common_params_sampling default_sparams; + + slot->params.stream = json_value(data, "stream", false); + slot->params.cache_prompt = json_value(data, "cache_prompt", false); + slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict); + slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k); + slot->sparams.top_p = json_value(data, "top_p", 
default_sparams.top_p); + slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p); + slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p); + slot->sparams.temp = json_value(data, "temperature", default_sparams.temp); + slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range); + slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent); + slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n); + slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat); + slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq); + slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present); + slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); + slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); + slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); + slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); + slot->sparams.seed = json_value(data, "seed", default_sparams.seed); + slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); + slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); + slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); + slot->sparams.grammar_triggers = grammar_triggers; + slot->sparams.grammar_lazy = grammar_lazy; + + if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) { + // Might be better to reject the request with a 400 ? 
+ LOG_WARNING("Max tokens to predict exceeds server configuration", { + {"params.n_predict", slot->params.n_predict}, + {"slot.n_predict", slot->n_predict}, + }); + slot->params.n_predict = slot->n_predict; + } + + // infill + if (data.count("input_prefix") != 0) + { + slot->params.input_prefix = data["input_prefix"]; + } + else + { + slot->params.input_prefix = ""; + } + + + if (data.count("input_suffix") != 0) + { + slot->params.input_suffix = data["input_suffix"]; + } + else + { + slot->params.input_suffix = ""; + } + + if (data.count("prompt") != 0) + { + slot->prompt = data["prompt"]; + } + else + { + slot->prompt = ""; + } + + if (json_value(data, "ignore_eos", false) && has_eos_token) { + slot->sparams.logit_bias[llama_vocab_eos(vocab)] = -INFINITY; + } + /* + slot->sparams.penalty_prompt_tokens.clear(); + slot->sparams.use_penalty_prompt_tokens = false; + const auto &penalty_prompt = data.find("penalty_prompt"); + if (penalty_prompt != data.end()) + { + if (penalty_prompt->is_string()) + { + const auto penalty_prompt_string = penalty_prompt->get(); + auto penalty_tokens = llama_tokenize(model, penalty_prompt_string, false); + slot->sparams.penalty_prompt_tokens.swap(penalty_tokens); + if (slot->params.n_predict > 0) + { + slot->sparams.penalty_prompt_tokens.reserve(slot->sparams.penalty_prompt_tokens.size() + slot->params.n_predict); + } + slot->sparams.use_penalty_prompt_tokens = true; + } + else if (penalty_prompt->is_array()) + { + const auto n_tokens = penalty_prompt->size(); + slot->sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot->params.n_predict)); + const int n_vocab = llama_n_vocab(model); + for (const auto &penalty_token : *penalty_prompt) + { + if (penalty_token.is_number_integer()) + { + const auto tok = penalty_token.get(); + if (tok >= 0 && tok < n_vocab) + { + slot->sparams.penalty_prompt_tokens.push_back(tok); + } + } + } + slot->sparams.use_penalty_prompt_tokens = true; + } + } + */ + slot->sparams.logit_bias.clear(); + + 
const auto &logit_bias = data.find("logit_bias"); + if (logit_bias != data.end() && logit_bias->is_array()) + { + const llama_vocab * vocab = llama_model_get_vocab(model); + const int n_vocab = llama_vocab_n_tokens(vocab); + for (const auto &el : *logit_bias) + { + if (el.is_array() && el.size() == 2) + { + float bias; + if (el[1].is_number()) + { + bias = el[1].get(); + } + else if (el[1].is_boolean() && !el[1].get()) + { + bias = -INFINITY; + } + else + { + continue; + } + + if (el[0].is_number_integer()) + { + llama_token tok = el[0].get(); + if (tok >= 0 && tok < n_vocab) + { + slot->sparams.logit_bias[tok] = bias; + } + } + else if (el[0].is_string()) + { + auto toks = common_tokenize(ctx, el[0].get(), false, false); + for (auto tok : toks) + { + slot->sparams.logit_bias[tok] = bias; + } + } + } + } + } + + slot->params.antiprompt.clear(); + + const auto &stop = data.find("stop"); + if (stop != data.end() && stop->is_array()) + { + for (const auto &word : *stop) + { + if (!word.empty()) + { + slot->params.antiprompt.push_back(word); + } + } + } + + const auto & samplers = data.find("samplers"); + if (samplers != data.end() && samplers->is_array()) { + std::vector sampler_names; + for (const auto & name : *samplers) { + if (name.is_string()) { + sampler_names.emplace_back(name); + } + } + slot->sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false); + } + else + { + slot->sparams.samplers_sequence = default_sparams.samplers_sequence; + } + + + if (multimodal) + { + const auto &images_data = data.find("image_data"); + if (images_data != data.end() && images_data->is_array()) + { + for (const auto &img : *images_data) + { + const std::vector image_buffer = base64_decode(img["data"].get()); + + slot_image img_sl; + img_sl.id = img.count("id") != 0 ? 
img["id"].get() : slot->images.size(); + img_sl.img_data = clip_image_u8_init(); + if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data)) + { + LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d", + __func__, + slot->id, + img_sl.id + ); + return false; + } + LOG_VERBOSE("image loaded", { + {"slot_id", slot->id}, + {"img_sl_id", img_sl.id} + }); + img_sl.request_encode_image = true; + slot->images.push_back(img_sl); + } + // process prompt + // example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]} + if (slot->images.size() > 0 && !slot->prompt.is_array()) + { + std::string prompt = slot->prompt.get(); + size_t pos = 0, begin_prefix = 0; + std::string pattern = "[img-"; + while ((pos = prompt.find(pattern, pos)) != std::string::npos) { + size_t end_prefix = pos; + pos += pattern.length(); + size_t end_pos = prompt.find(']', pos); + if (end_pos != std::string::npos) + { + std::string image_id = prompt.substr(pos, end_pos - pos); + try + { + int img_id = std::stoi(image_id); + bool found = false; + for (slot_image &img : slot->images) + { + if (img.id == img_id) { + found = true; + img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix); + begin_prefix = end_pos + 1; + break; + } + } + if (!found) { + LOG("ERROR: Image with id: %i, not found.\n", img_id); + slot->images.clear(); + return false; + } + } catch (const std::invalid_argument& e) { + LOG("Invalid image number id in prompt\n"); + slot->images.clear(); + return false; + } + } + } + slot->prompt = ""; + slot->params.input_suffix = prompt.substr(begin_prefix); + slot->params.cache_prompt = false; // multimodal doesn't support cache prompt + } + } + } + + if (slot->ctx_sampling != nullptr) + { + common_sampler_free(slot->ctx_sampling); + } + slot->ctx_sampling = common_sampler_init(model, slot->sparams); + //llama_set_rng_seed(ctx, 
slot->params.seed); + slot->command = LOAD_PROMPT; + + all_slots_are_idle = false; + + LOG_INFO("slot is processing task", { + {"slot_id", slot->id}, + {"task_id", slot->task_id}, + }); + + // LOG("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str()); + + return true; + } + + void kv_cache_clear() { + // clear the entire KV cache + llama_kv_cache_clear(ctx); + clean_kv_cache = false; + } + + void update_system_prompt() { + kv_cache_clear(); + system_tokens.clear(); + + if (!system_prompt.empty()) { + system_tokens = common_tokenize(ctx, system_prompt, add_bos_token); + + common_batch_clear(batch); + + for (int i = 0; i < (int)system_tokens.size(); ++i) + { + common_batch_add(batch, system_tokens[i], i, { 0 }, false); + } + + for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch) + { + const int32_t n_tokens = std::min(params.n_batch, (int32_t) (batch.n_tokens - i)); + llama_batch batch_view = { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, + }; + if (llama_decode(ctx, batch_view) != 0) + { + LOG("%s: llama_decode() failed\n", __func__); + return; + } + } + + // assign the system KV cache to all parallel sequences + for (int32_t i = 1; i < params.n_parallel; ++i) + { + llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size()); + } + } + + LOG("system prompt updated\n"); + system_need_update = false; + } + + void notify_system_prompt_changed() { + // release all slots + for (llama_client_slot &slot : slots) + { + slot.release(); + } + + system_need_update = true; + } + + void process_system_prompt_data(const json &sys_props) { + system_prompt = sys_props.value("prompt", ""); + name_user = sys_props.value("anti_prompt", ""); + name_assistant = sys_props.value("assistant_name", ""); + + + notify_system_prompt_changed(); + } + + static size_t find_stopping_strings(const std::string &text, const size_t last_token_size, + const stop_type type, llama_client_slot &slot) + { 
+ size_t stop_pos = std::string::npos; + + for (const std::string &word : slot.params.antiprompt) + { + size_t pos; + if (type == STOP_FULL) + { + const size_t tmp = word.size() + last_token_size; + const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; + pos = text.find(word, from_pos); + } + else + { + pos = find_partial_stop_string(word, text); + } + if (pos != std::string::npos && + (stop_pos == std::string::npos || pos < stop_pos)) + { + if (type == STOP_FULL) + { + slot.stopped_word = true; + slot.stopping_word = word; + slot.has_next_token = false; + } + stop_pos = pos; + } + } + + return stop_pos; + } + + bool process_token(completion_token_output &result, llama_client_slot &slot) { + // remember which tokens were sampled - used for repetition penalties during sampling + const std::string token_str = common_token_to_piece(ctx, result.tok); + slot.sampled = result.tok; + + // search stop word and delete it + slot.generated_text += token_str; + slot.has_next_token = true; + +/* + if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1) + { + // we can change penalty_prompt_tokens because it is always created from scratch each request + slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok); + } + */ + + // check if there is incomplete UTF-8 character at the end + bool incomplete = false; + for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i) + { + unsigned char c = slot.generated_text[slot.generated_text.size() - i]; + if ((c & 0xC0) == 0x80) + { + // continuation byte: 10xxxxxx + continue; + } + if ((c & 0xE0) == 0xC0) + { + // 2-byte character: 110xxxxx ... + incomplete = i < 2; + } + else if ((c & 0xF0) == 0xE0) + { + // 3-byte character: 1110xxxx ... + incomplete = i < 3; + } + else if ((c & 0xF8) == 0xF0) + { + // 4-byte character: 11110xxx ... 
+ incomplete = i < 4; + } + // else 1-byte character or invalid byte + break; + } + + if (!incomplete) + { + size_t pos = std::min(slot.sent_count, slot.generated_text.size()); + const std::string str_test = slot.generated_text.substr(pos); + bool is_stop_full = false; + size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot); + if (stop_pos != std::string::npos) + { + is_stop_full = true; + slot.generated_text.erase( + slot.generated_text.begin() + pos + stop_pos, + slot.generated_text.end()); + pos = std::min(slot.sent_count, slot.generated_text.size()); + } + else + { + is_stop_full = false; + stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot); + } + + // check if there is any token to predict + if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) + { + // no send the stop word in the response + result.text_to_send = slot.generated_text.substr(pos, std::string::npos); + slot.sent_count += result.text_to_send.size(); + // add the token to slot queue and cache + } + slot.add_token_string(result); + if (slot.params.stream) + { + send_partial_response(slot, result); + } + } + + if (incomplete) + { + slot.has_next_token = true; + } + + // check the limits + if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params)) + { + slot.stopped_limit = true; + slot.has_next_token = false; + } + + if (slot.n_past >= slot.n_ctx) { + slot.truncated = true; + slot.stopped_limit = true; + slot.has_next_token = false; + + LOG_VERBOSE("stopped due to running out of context capacity", {}); + } + + if (result.tok == llama_vocab_eos(vocab) || llama_vocab_is_eog(vocab, result.tok)) + { + slot.stopped_eos = true; + slot.has_next_token = false; + LOG_VERBOSE("eos token found", {}); + } + + LOG_VERBOSE("next token", { + {"token", result.tok}, + {"token_text", tokens_to_output_formatted_string(ctx, result.tok)}, + {"has_next_token", slot.has_next_token}, + {"n_remain", 
slot.n_remaining}, + {"num_tokens_predicted", slot.n_decoded}, + {"stopped_eos", slot.stopped_eos}, + {"stopped_word", slot.stopped_word}, + {"stopped_limit", slot.stopped_limit}, + {"stopping_word", slot.stopping_word}, + }); + + return slot.has_next_token; // continue + } + + bool process_images(llama_client_slot &slot) const + { + for (slot_image &img : slot.images) + { + if (!img.request_encode_image) + { + continue; + } + + if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) { + LOG("Error processing the given image"); + return false; + } + + img.request_encode_image = false; + } + + return slot.images.size() > 0; + } + + void send_error(task_server& task, const std::string &error) + { + LOG("task %i - error: %s\n", task.id, error.c_str()); + task_result res; + res.id = task.id; + res.multitask_id = task.multitask_id; + res.stop = false; + res.error = true; + res.result_json = { { "content", error } }; + queue_results.send(res); + } + + json get_formated_generation(llama_client_slot &slot) + { + std::vector samplers; + samplers.reserve(slot.sparams.samplers_sequence.size()); + for (const auto & sampler : slot.sparams.samplers_sequence) + { + samplers.emplace_back(llama_sampling_type_to_str(sampler)); + } + + return json { + {"n_ctx", slot.n_ctx}, + {"n_predict", slot.n_predict}, + {"model", params.model_alias}, + {"seed", slot.params.seed}, + {"temperature", slot.sparams.temp}, + {"dynatemp_range", slot.sparams.dynatemp_range}, + {"dynatemp_exponent", slot.sparams.dynatemp_exponent}, + {"top_k", slot.sparams.top_k}, + {"top_p", slot.sparams.top_p}, + {"min_p", slot.sparams.min_p}, + {"typical_p", slot.sparams.typical_p}, + {"repeat_last_n", slot.sparams.penalty_last_n}, + {"repeat_penalty", slot.sparams.penalty_repeat}, + {"presence_penalty", slot.sparams.penalty_present}, + {"frequency_penalty", slot.sparams.penalty_freq}, + {"mirostat", slot.sparams.mirostat}, + {"mirostat_tau", 
slot.sparams.mirostat_tau}, + {"mirostat_eta", slot.sparams.mirostat_eta}, + {"stop", slot.params.antiprompt}, + {"n_predict", slot.params.n_predict}, + {"n_keep", params.n_keep}, + {"stream", slot.params.stream}, + // {"logit_bias", slot.sparams.logit_bias}, + {"n_probs", slot.sparams.n_probs}, + {"min_keep", slot.sparams.min_keep}, + {"grammar", slot.sparams.grammar}, + {"samplers", samplers} + }; + } + + void send_partial_response(llama_client_slot &slot, completion_token_output tkn) + { + task_result res; + res.id = slot.task_id; + res.multitask_id = slot.multitask_id; + res.error = false; + res.stop = false; + + res.result_json = json + { + {"content", tkn.text_to_send}, + {"stop", false}, + {"slot_id", slot.id}, + {"multimodal", multimodal} + }; + + if (slot.sparams.n_probs > 0) + { + std::vector probs_output = {}; + const std::vector to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); + size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size()); + size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size()); + if (probs_pos < probs_stop_pos) + { + probs_output = std::vector(slot.generated_token_probs.begin() + probs_pos, slot.generated_token_probs.begin() + probs_stop_pos); + } + slot.sent_token_probs_index = probs_stop_pos; + res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output); + } + + if (slot.oaicompat) + { + res.result_json["oaicompat_token_ctr"] = slot.n_decoded; + res.result_json["model"] = slot.oaicompat_model; + } + + queue_results.send(res); + } + + void send_final_response(llama_client_slot &slot) + { + task_result res; + res.id = slot.task_id; + res.multitask_id = slot.multitask_id; + res.error = false; + res.stop = true; + + res.result_json = json + { + {"content", !slot.params.stream ? 
slot.generated_text : ""}, + {"slot_id", slot.id}, + {"stop", true}, + {"model", params.model_alias}, + {"tokens_predicted", slot.n_decoded}, + {"tokens_evaluated", slot.num_prompt_tokens}, + {"generation_settings", get_formated_generation(slot)}, + {"prompt", slot.prompt}, + {"truncated", slot.truncated}, + {"stopped_eos", slot.stopped_eos}, + {"stopped_word", slot.stopped_word}, + {"stopped_limit", slot.stopped_limit}, + {"stopping_word", slot.stopping_word}, + {"tokens_cached", slot.n_past}, + {"timings", slot.get_formated_timings()} + }; + + if (slot.sparams.n_probs > 0) + { + std::vector probs = {}; + if (!slot.params.stream && slot.stopped_word) + { + const std::vector stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); + probs = std::vector(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size()); + } + else + { + probs = std::vector( + slot.generated_token_probs.begin(), + slot.generated_token_probs.end()); + } + res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs); + } + + if (slot.oaicompat) + { + res.result_json["oaicompat_token_ctr"] = slot.n_decoded; + res.result_json["model"] = slot.oaicompat_model; + } + + queue_results.send(res); + } + + void send_embedding(llama_client_slot &slot, const llama_batch & batch) + { + task_result res; + res.id = slot.task_id; + res.multitask_id = slot.multitask_id; + res.error = false; + res.stop = true; + + const int n_embd = llama_model_n_embd(model); + if (!params.embedding) + { + LOG_WARNING("embedding disabled", { + {"params.embedding", params.embedding}, + }); + res.result_json = json + { + {"embedding", std::vector(n_embd, 0.0f)}, + }; + } + else + { + const float *data = llama_get_embeddings(ctx); + std::vector embd_res(n_embd, 0.0f); + std::vector> embedding; + for (int i = 0; i < batch.n_tokens; ++i) { + if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { + continue; + } + + const float * embd = llama_get_embeddings_seq(ctx, 
batch.seq_id[i][0]); + if (embd == NULL) { + embd = llama_get_embeddings_ith(ctx, i); + } + + if (embd == NULL) { + LOG("failed to get embeddings"); + + continue; + } + + // normalize only when there is pooling + // TODO: configurable + if (llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_NONE) { + common_embd_normalize(embd, embd_res.data(), n_embd, 2); + embedding.push_back(embd_res); + } else { + embedding.push_back({ embd, embd + n_embd }); + } + } + + // OAI compat + res.result_json = json + { + {"embedding", embedding[0] }, + }; + } + queue_results.send(res); + } + + void request_completion(int task_id, json data, bool infill, bool embedding, int multitask_id) + { + task_server task; + task.id = task_id; + task.target_id = 0; + task.data = std::move(data); + task.infill_mode = infill; + task.embedding_mode = embedding; + task.type = TASK_TYPE_COMPLETION; + task.multitask_id = multitask_id; + + // when a completion task's prompt array is not a singleton, we split it into multiple requests + // otherwise, it's a single-prompt task, we actually queue it + // if there's numbers in the prompt array it will be treated as an array of tokens + if (task.data.count("prompt") != 0 && task.data.at("prompt").size() > 1) { + bool numbers = false; + for (const auto& e : task.data.at("prompt")) { + if (e.is_number()) { + numbers = true; + break; + } + } + + // NOTE: split_multiprompt_task() does not handle a mix of strings and numbers, + // it will completely stall the server. I don't know where the bug for this is. + // + // if there are numbers, it needs to be treated like a single prompt, + // queue_tasks handles a mix of strings and numbers just fine. 
+ if (numbers) { + queue_tasks.post(task); + } else { + split_multiprompt_task(task_id, task); + } + } else { + queue_tasks.post(task); + } + } + + // for multiple images processing + bool ingest_images(llama_client_slot &slot, int n_batch) + { + int image_idx = 0; + + while (image_idx < (int) slot.images.size()) + { + slot_image &img = slot.images[image_idx]; + + // process prefix prompt + for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) + { + const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); + llama_batch batch_view = { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, + }; + if (llama_decode(ctx, batch_view)) + { + LOG("%s : failed to eval\n", __func__); + return false; + } + } + + // process image with llm + for (int i = 0; i < img.image_tokens; i += n_batch) + { + int n_eval = img.image_tokens - i; + if (n_eval > n_batch) + { + n_eval = n_batch; + } + + const int n_embd = llama_model_n_embd(model); + float * embd = img.image_embedding + i * n_embd; + llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, slot.n_past, 0); + if (llama_decode(ctx, llava_batch.batch)) + { + LOG("%s : failed to eval image\n", __func__); + return false; + } + slot.n_past += n_eval; + } + image_idx++; + + common_batch_clear(batch); + + // append prefix of next image + const auto json_prompt = (image_idx >= (int) slot.images.size()) ? 
+ slot.params.input_suffix : // no more images, then process suffix prompt + (json)(slot.images[image_idx].prefix_prompt); + + std::vector append_tokens = tokenize(json_prompt, false); // has next image + for (int i = 0; i < (int) append_tokens.size(); ++i) + { + common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true); + slot.n_past += 1; + } + } + + return true; + } + + void request_cancel(int task_id) + { + task_server task; + task.type = TASK_TYPE_CANCEL; + task.target_id = task_id; + queue_tasks.post(task); + } + + void split_multiprompt_task(int multitask_id, task_server& multiprompt_task) + { + int prompt_count = multiprompt_task.data.at("prompt").size(); + if (prompt_count <= 1) { + send_error(multiprompt_task, "error while handling multiple prompts"); + return; + } + + // generate all the ID for subtask + std::vector subtask_ids(prompt_count); + for (int i = 0; i < prompt_count; i++) + { + subtask_ids[i] = queue_tasks.get_new_id(); + } + + // queue up the multitask so we can track its subtask progression + queue_tasks.add_multitask(multitask_id, subtask_ids); + + // add subtasks + for (int i = 0; i < prompt_count; i++) + { + json subtask_data = multiprompt_task.data; + subtask_data["prompt"] = subtask_data["prompt"][i]; + + // subtasks inherit everything else (infill mode, embedding mode, etc.) 
+ request_completion(subtask_ids[i], subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id); + } + } + + void process_single_task(task_server& task) + { + switch (task.type) + { + case TASK_TYPE_COMPLETION: { + llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1)); + if (slot == nullptr) + { + // if no slot is available, we defer this task for processing later + LOG_VERBOSE("no slot is available", {{"task_id", task.id}}); + queue_tasks.defer(task); + break; + } + + if (task.data.contains("system_prompt")) + { + if (!all_slots_are_idle) { + send_error(task, "system prompt can only be updated when all slots are idle"); + break; + } + process_system_prompt_data(task.data["system_prompt"]); + + // reset cache_tokens for all slots + for (llama_client_slot &slot : slots) + { + slot.cache_tokens.clear(); + slot.n_past = 0; + slot.n_past_se = 0; + } + } + + slot->reset(); + + slot->infill = task.infill_mode; + slot->embedding = task.embedding_mode; + slot->task_id = task.id; + slot->multitask_id = task.multitask_id; + + if (!launch_slot_with_data(slot, task.data)) + { + // send error result + send_error(task, "internal_error"); + break; + } + } break; + case TASK_TYPE_CANCEL: { // release slot linked with the task id + for (auto & slot : slots) + { + if (slot.task_id == task.target_id) + { + slot.release(); + break; + } + } + } break; + case TASK_TYPE_NEXT_RESPONSE: { + // do nothing + } break; + } + } + + void on_finish_multitask(task_multi& multitask) + { + // all subtasks done == multitask is done + task_result result; + result.id = multitask.id; + result.stop = true; + result.error = false; + + // collect json results into one json result + std::vector result_jsons; + for (auto& subres : multitask.results) + { + result_jsons.push_back(subres.result_json); + result.error = result.error && subres.error; + } + result.result_json = json{ { "results", result_jsons } }; + queue_results.send(result); + } + + bool 
update_slots() { + if (system_need_update) + { + LOG_INFO("updating system prompt", {}); + update_system_prompt(); + } + + common_batch_clear(batch); + + if (all_slots_are_idle) + { + if (system_prompt.empty() && clean_kv_cache) + { + LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {}); + kv_cache_clear(); + } + return true; + } + + LOG_VERBOSE("posting NEXT_RESPONSE", {}); + task_server task; + task.type = TASK_TYPE_NEXT_RESPONSE; + task.target_id = -1; + queue_tasks.post(task); + + for (llama_client_slot &slot : slots) + { + if (slot.ga_n == 1) + { + if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx) + { + // this check is redundant (for good) + // we should never get here, because generation should already stopped in process_token() + + // START LOCALAI changes + // Temporary disable context-shifting as it can lead to infinite loops (issue: https://github.com/ggerganov/llama.cpp/issues/3969) + // See: https://github.com/mudler/LocalAI/issues/1333 + // Context is exhausted, release the slot + slot.release(); + send_final_response(slot); + slot.has_next_token = false; + LOG_ERROR("context is exhausted, release the slot", {}); + + continue; + // END LOCALAI changes + } + } + } + + // decode any currently ongoing sequences + LOG_VERBOSE("decoding ongoing sequences", {}); + for (auto & slot : slots) + { + // release the slot + if (slot.command == RELEASE) + { + slot.state = IDLE; + slot.command = NONE; + slot.t_last_used = ggml_time_us(); + + LOG_INFO("slot released", { + {"slot_id", slot.id}, + {"task_id", slot.task_id}, + {"n_ctx", n_ctx}, + {"n_past", slot.n_past}, + {"n_system_tokens", system_tokens.size()}, + {"n_cache_tokens", slot.cache_tokens.size()}, + {"truncated", slot.truncated} + }); + queue_tasks.notify_slot_changed(); + + continue; + } + + if (slot.state == IDLE) + { + continue; + } + + slot.i_batch = batch.n_tokens; + + const int32_t slot_npast = slot.n_past_se > 0 ? 
slot.n_past_se : slot.n_past; + + // TODO: we always have to take into account the "system_tokens" + // this is not great and needs to be improved somehow + common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true); + slot.n_past += 1; + } + + // process in chunks of params.n_batch + int32_t n_batch = params.n_batch; + + // assign workload to the slots + if (params.cont_batching || batch.n_tokens == 0) + { + for (auto & slot : slots) + { + const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get().empty()) || !slot.images.empty(); + + // empty prompt passed -> release the slot and send empty response + // note: infill mode allows empty prompt + if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt && !slot.infill) + { + slot.release(); + slot.print_timings(); + send_final_response(slot); + continue; + } + + // need process the prompt + if (slot.state == IDLE && slot.command == LOAD_PROMPT) + { + slot.state = PROCESSING; + slot.command = NONE; + std::vector prompt_tokens; + slot.t_start_process_prompt = ggml_time_us(); + slot.t_start_genereration = 0; + + if (slot.infill) + { + bool suff_rm_leading_spc = true; + if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) + { + params.input_suffix.erase(0, 1); + suff_rm_leading_spc = false; + } + auto prefix_tokens = tokenize(slot.params.input_prefix, false); + auto suffix_tokens = tokenize(slot.params.input_suffix, false); + + const int space_token = 29871; // TODO: this should not be hardcoded + if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) { + suffix_tokens.erase(suffix_tokens.begin()); + } + + prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model)); + prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_bos(vocab)); // always add BOS + prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model)); + prefix_tokens.insert(prefix_tokens.end(), 
suffix_tokens.begin(), suffix_tokens.end()); + prefix_tokens.push_back(llama_token_middle(model)); + prompt_tokens = prefix_tokens; + } + else + { + prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt + } + + slot.num_prompt_tokens = prompt_tokens.size(); + + if (slot.params.n_keep < 0) + { + slot.params.n_keep = slot.num_prompt_tokens; + } + slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); + + // if input prompt is too big, truncate it + if (slot.num_prompt_tokens >= slot.n_ctx) + { + const int n_left = slot.n_ctx - slot.params.n_keep; + const int n_block_size = n_left / 2; + const int erased_blocks = (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size; + + std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep); + new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end()); + + LOG_VERBOSE("input truncated", { + {"n_ctx", slot.n_ctx}, + {"n_keep", slot.params.n_keep}, + {"n_left", n_left}, + {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, + }); + slot.truncated = true; + prompt_tokens = new_tokens; + + slot.num_prompt_tokens = prompt_tokens.size(); + GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx); + } + + if (!slot.params.cache_prompt) + { + common_sampler_reset(slot.ctx_sampling); + + slot.n_past = 0; + slot.n_past_se = 0; + slot.ga_i = 0; + slot.num_prompt_tokens_processed = slot.num_prompt_tokens; + } + else + { + // push the prompt into the sampling context (do not apply grammar) + for (auto &token : prompt_tokens) + { + common_sampler_accept(slot.ctx_sampling, ctx, token, false); + } + + slot.n_past = common_part(slot.cache_tokens, prompt_tokens); + + // the last token of the cache is not in the KV cache until the next call to llama_decode + // (it was sampled, pushed into the "cache_tokens", but not yet put in the 
context) + if (slot.n_past > 0 && slot.n_past == (int32_t) slot.cache_tokens.size()) + { + slot.n_past -= 1; + } + + slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past; + + if (slot.ga_n != 1) + { + int ga_i = 0; + int32_t ga_n = slot.ga_n; + int32_t ga_w = slot.ga_w; + int32_t slot_npast = 0; + for (int k = 0; k < slot.n_past; ++k) + { + while (slot_npast >= ga_i + ga_w) { + const int bd = (ga_w/ga_n)*(ga_n - 1); + slot_npast -= bd; + ga_i += ga_w/ga_n; + } + slot_npast++; + } + slot.n_past_se = slot_npast; + slot.ga_i = ga_i; + } + + LOG_INFO("slot progression", { + { "slot_id", slot.id }, + { "task_id", slot.task_id }, + { "n_past", slot.n_past }, + { "num_prompt_tokens_processed", slot.num_prompt_tokens_processed } + }); + } + + slot.cache_tokens = prompt_tokens; + + if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0) + { + // we have to evaluate at least 1 token to generate logits. + LOG_INFO("we have to evaluate at least 1 token to generate logits", { + { "slot_id", slot.id }, + { "task_id", slot.task_id } + }); + slot.n_past--; + if (slot.ga_i > 0) + { + slot.n_past_se--; + } + } + + int p0 = (int) system_tokens.size() + slot.n_past; + LOG_INFO("kv cache rm [p0, end)", { + { "slot_id", slot.id }, + { "task_id", slot.task_id }, + { "p0", p0 } + }); + llama_kv_cache_seq_rm(ctx, slot.id, p0, -1); + + LOG_VERBOSE("prompt ingested", { + {"n_past", slot.n_past}, + {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)}, + {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())}, + }); + + const bool has_images = process_images(slot); + + // process the prefix of first image + std::vector prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens; + + int32_t slot_npast = slot.n_past_se > 0 ? 
slot.n_past_se : slot.n_past; + + int32_t ga_i = slot.ga_i; + int32_t ga_n = slot.ga_n; + int32_t ga_w = slot.ga_w; + + for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past) + { + if (slot.ga_n != 1) + { + while (slot_npast >= ga_i + ga_w) { + const int bd = (ga_w/ga_n)*(ga_n - 1); + slot_npast -= bd; + ga_i += ga_w/ga_n; + } + } + common_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false); + slot_npast++; + } + + if (has_images && !ingest_images(slot, n_batch)) + { + LOG_ERR("%s: failed processing images Slot id : %d, Task id: %d", + __func__, + slot.id, + slot.task_id + ); + // FIXME @phymbert: to be properly tested + // early returning without changing the slot state will block the slot for ever + // no one at the moment is checking the return value + return false; + } + + // extract the logits only for the last token + if (batch.n_tokens > 0) + { + batch.logits[batch.n_tokens - 1] = true; + } + + slot.n_decoded = 0; + slot.i_batch = batch.n_tokens - 1; + } + } + } + + if (batch.n_tokens == 0) + { + all_slots_are_idle = true; + return true; + } + + for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) + { + const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); + + for (auto & slot : slots) + { + if (slot.ga_n != 1) + { + // context extension via Self-Extend + while (slot.n_past_se >= slot.ga_i + slot.ga_w) + { + const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w; + const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1); + const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w; + + LOG("\n"); + LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); + LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); + LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", 
slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); + + llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd); + llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n); + llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd); + + slot.n_past_se -= bd; + + slot.ga_i += slot.ga_w / slot.ga_n; + + LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i); + } + slot.n_past_se += n_tokens; + } + } + + llama_batch batch_view = + { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, + }; + + const int ret = llama_decode(ctx, batch_view); + + if (ret != 0) + { + if (n_batch == 1 || ret < 0) + { + // if you get here, it means the KV cache is full - try increasing it via the context size + LOG("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); + return false; + } + + LOG("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2); + + // retry with half the batch size to try to find a free slot in the KV cache + n_batch /= 2; + i -= n_batch; + continue; + } + + for (auto & slot : slots) + { + if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) + { + continue; + } + + // prompt evaluated for embedding + if (slot.embedding) + { + send_embedding(slot, batch_view); + slot.release(); + slot.i_batch = -1; + continue; + } + + completion_token_output result; + const llama_token id = common_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i); + + common_sampler_accept(slot.ctx_sampling, ctx, id, true); + + slot.n_decoded += 1; + if (slot.n_decoded == 1) + { + slot.t_start_genereration = ggml_time_us(); + slot.t_prompt_processing = (slot.t_start_genereration - 
slot.t_start_process_prompt) / 1e3; + metrics.on_prompt_eval(slot); + } + + result.tok = id; + const auto * cur_p = common_sampler_get_candidates(slot.ctx_sampling); + + for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) { + result.probs.push_back({ + cur_p->data[i].id, + i >= cur_p->size ? 0.0f : cur_p->data[i].p, + }); + } + + if (!process_token(result, slot)) + { + slot.release(); + slot.print_timings(); + send_final_response(slot); + metrics.on_prediction(slot); + } + + slot.i_batch = -1; + } + } + + LOG_VERBOSE("slots updated", {}); + return true; + } + + void run_on_all_tasks_finished() { + update_slots(); + } +}; + +/* llama.cpp completion api semantics */ +static json format_partial_response( + llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector &probs +) { + json res = json + { + {"content", content }, + {"stop", false}, + {"slot_id", slot->id }, + {"multimodal", llama.multimodal } + }; + + if (slot->sparams.n_probs > 0) + { + res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); + } + + return res; +} + +struct token_translator +{ + llama_context * ctx; + std::string operator()(llama_token tok) const { return common_token_to_piece(ctx, tok); } + std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); } +}; + +static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, llama_client_slot *slot) +{ + auto & gtps = slot->generated_token_probs; + auto translator = token_translator{llama.ctx}; + auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); }; + const size_t len = std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen); + if (slot->generated_text.capacity() < slot->generated_text.size() + len) + { + slot->generated_text.reserve(slot->generated_text.size() + len); + } + for (const completion_token_output & cto : gtps) + { + slot->generated_text += 
translator(cto); + } +} + +std::function shutdown_handler; + +inline void signal_handler(int signal) { + exit(1); +} + + +///////////////////////////////// +//////////////////////////////// +//////// LOCALAI code starts below here +///////////////////////////////// +//////////////////////////////// + +bool loaded_model; // TODO: add a mutex for this, but happens only once loading the model + +// The class has a llama instance that is shared across all RPCs +llama_server_context llama; + +static void start_llama_server() { + // Wait for model to be loaded first + while (!loaded_model) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + + llama.queue_tasks.on_new_task(std::bind( + &llama_server_context::process_single_task, &llama, std::placeholders::_1)); + llama.queue_tasks.on_finish_multitask(std::bind( + &llama_server_context::on_finish_multitask, &llama, std::placeholders::_1)); + llama.queue_tasks.on_all_tasks_finished(std::bind( + &llama_server_context::run_on_all_tasks_finished, &llama)); + llama.queue_results.on_multitask_update(std::bind( + &llama_server_queue::update_multitask, + &llama.queue_tasks, + std::placeholders::_1, + std::placeholders::_2, + std::placeholders::_3 + )); + llama.queue_tasks.start_loop(); +} + +json parse_options(bool streaming, const backend::PredictOptions* predict, llama_server_context &llama) +{ + + // This is for example a slot data from the json data + // slot->params.stream = json_value(data, "stream", false); + // slot->params.cache_prompt = json_value(data, "cache_prompt", false); + // slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict); + // slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k); + // slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p); + // slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p); + // slot->sparams.temp = json_value(data, "temperature", default_sparams.temp); + // 
slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n); + // slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat); + // slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq); + // slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present); + // slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); + // slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); + // slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); + // slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); + // slot->params.seed = json_value(data, "seed", default_params.seed); + // slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); + // slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); + + // Create now a json data from the prediction options instead + // + json data; + data["stream"] = streaming; + data["cache_prompt"] = predict->promptcacheall(); + data["n_predict"] = predict->tokens() == 0 ? 
-1 : predict->tokens(); + data["top_k"] = predict->topk(); + data["top_p"] = predict->topp(); + data["typical_p"] = predict->typicalp(); + data["temperature"] = predict->temperature(); + data["repeat_last_n"] = predict->repeat(); + data["repeat_penalty"] = predict->penalty(); + data["frequency_penalty"] = predict->frequencypenalty(); + data["presence_penalty"] = predict->presencepenalty(); + data["mirostat"] = predict->mirostat(); + data["mirostat_tau"] = predict->mirostattau(); + data["mirostat_eta"] = predict->mirostateta(); + data["n_keep"] = predict->nkeep(); + data["seed"] = predict->seed(); + data["grammar"] = predict->grammar(); + data["prompt"] = predict->prompt(); + data["ignore_eos"] = predict->ignoreeos(); + data["embeddings"] = predict->embeddings(); + + // Add the correlationid to json data + data["correlation_id"] = predict->correlationid(); + + // for each image in the request, add the image data + // + for (int i = 0; i < predict->images_size(); i++) { + data["image_data"].push_back(json + { + {"id", i}, + {"data", predict->images(i)}, + }); + } + + data["stop"] = predict->stopprompts(); + // data["n_probs"] = predict->nprobs(); + //TODO: images, + + return data; +} + +// static void parse_options_completion(bool streaming,const backend::PredictOptions* predict, llama_server_context &llama) +// { +// // https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L673 +// gpt_params default_params; + +// llama.stream = streaming; +// llama.params.n_predict = predict->tokens() == 0 ? 
-1 : predict->tokens(); +// llama.params.sparams.top_k = predict->topk(); +// llama.params.sparams.top_p = predict->topp(); +// llama.params.sparams.typical_p = predict->typicalp(); +// llama.params.sparams.penalty_last_n = predict->repeat(); +// llama.params.sparams.temp = predict->temperature(); +// llama.params.sparams.penalty_repeat = predict->penalty(); +// llama.params.sparams.penalty_present = predict->presencepenalty(); +// llama.params.sparams.penalty_freq = predict->frequencypenalty(); +// llama.params.sparams.mirostat = predict->mirostat(); +// llama.params.sparams.mirostat_tau = predict->mirostattau(); +// llama.params.sparams.mirostat_eta = predict->mirostateta(); +// llama.params.n_keep = predict->nkeep(); +// llama.params.seed = predict->seed(); +// llama.params.sparams.grammar = predict->grammar(); +// // llama.params.n_probs = predict-> +// llama.params.prompt = predict->prompt(); + +// llama.params.sparams.logit_bias.clear(); + +// if (predict->ignoreeos()) +// { +// llama.params.sparams.logit_bias[llama_token_eos(llama.model)] = -INFINITY; +// } + +// // const auto &logit_bias = body.find("logit_bias"); +// // if (logit_bias != body.end() && logit_bias->is_array()) +// // { +// // const int n_vocab = llama_n_vocab(llama.model); +// // for (const auto &el : *logit_bias) +// // { +// // if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) +// // { +// // llama_token tok = el[0].get(); +// // if (tok >= 0 && tok < n_vocab) +// // { +// // if (el[1].is_number()) +// // { +// // llama.params.logit_bias[tok] = el[1].get(); +// // } +// // else if (el[1].is_boolean() && !el[1].get()) +// // { +// // llama.params.logit_bias[tok] = -INFINITY; +// // } +// // } +// // } +// // } +// // } + +// llama.params.antiprompt.clear(); +// for (const std::string& stopPrompt : predict->stopprompts()) { +// if (!stopPrompt.empty()) +// { +// llama.params.antiprompt.push_back(stopPrompt); +// } +// } +// } + +static void params_parse(const 
backend::ModelOptions* request, + gpt_params & params, llama_server_context &llama) { + + // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809 + + params.model = request->modelfile(); + if (!request->mmproj().empty()) { + // get the directory of modelfile + std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\")); + params.mmproj.path = model_dir + "/"+ request->mmproj(); + } + // params.model_alias ?? + params.model_alias = request->modelfile(); + if (!request->cachetypekey().empty()) { + params.cache_type_k = request->cachetypekey(); + } + if (!request->cachetypevalue().empty()) { + params.cache_type_v = request->cachetypevalue(); + } + params.n_ctx = request->contextsize(); + //params.memory_f16 = request->f16memory(); + params.n_threads = request->threads(); + params.n_gpu_layers = request->ngpulayers(); + params.n_batch = request->nbatch(); + // Set params.n_parallel by environment variable (LLAMA_PARALLEL), defaults to 1 + //params.n_parallel = 1; + const char *env_parallel = std::getenv("LLAMACPP_PARALLEL"); + if (env_parallel != NULL) { + params.n_parallel = std::stoi(env_parallel); + params.cont_batching = true; + } else { + params.n_parallel = 1; + } + + const char *llama_grpc_servers = std::getenv("LLAMACPP_GRPC_SERVERS"); + if (llama_grpc_servers != NULL) { + add_rpc_devices(std::string(llama_grpc_servers)); + } + + // decode options. Options are in form optname:optvale, or if booleans only optname. 
+ for (int i = 0; i < request->options_size(); i++) { + std::string opt = request->options(i); + char *optname = strtok(&opt[0], ":"); + char *optval = strtok(NULL, ":"); + if (optval == NULL) { + optval = "true"; + } + + if (!strcmp(optname, "gpu")) { + llama.has_gpu = true; + } + } + + // TODO: Add yarn + + if (!request->tensorsplit().empty()) { + std::string arg_next = request->tensorsplit(); + + // split string by , and / + const std::regex regex{ R"([,/]+)" }; + std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; + std::vector split_arg{ it, {} }; + + GGML_ASSERT(split_arg.size() <= llama_max_devices()); + + for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) { + if (i_device < split_arg.size()) { + params.tensor_split[i_device] = std::stof(split_arg[i_device]); + } + else { + params.tensor_split[i_device] = 0.0f; + } + } + } + + if (!request->maingpu().empty()) { + params.main_gpu = std::stoi(request->maingpu()); + } + if (!request->loraadapter().empty() && !request->lorabase().empty()) { + float scale_factor = 1.0f; + if (request->lorascale() != 0.0f) { + scale_factor = request->lorascale(); + } + // get the directory of modelfile + std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\")); + params.lora_adapters.push_back({ model_dir + "/"+request->loraadapter(), scale_factor }); + } + params.use_mlock = request->mlock(); + params.use_mmap = request->mmap(); + params.flash_attn = (request->flashattention() == "true" || request->flashattention() == "1"); + params.no_kv_offload = request->nokvoffload(); + params.ctx_shift = false; // We control context-shifting in any case (and we disable it as it could just lead to infinite loops) + + params.embedding = request->embeddings(); + + if (request->ropescaling() == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (request->ropescaling() == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } + else { 
params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + if ( request->yarnextfactor() != 0.0f ) { + params.yarn_ext_factor = request->yarnextfactor(); + } + if ( request->yarnattnfactor() != 0.0f ) { + params.yarn_attn_factor = request->yarnattnfactor(); + } + if ( request->yarnbetafast() != 0.0f ) { + params.yarn_beta_fast = request->yarnbetafast(); + } + if ( request->yarnbetaslow() != 0.0f ) { + params.yarn_beta_slow = request->yarnbetaslow(); + } + if ( request->ropefreqbase() != 0.0f ) { + params.rope_freq_base = request->ropefreqbase(); + } + if ( request->ropefreqscale() != 0.0f ) { + params.rope_freq_scale = request->ropefreqscale(); + } + + if (request->grammartriggers_size() > 0) { + LOG_INFO("configuring grammar triggers", {}); + llama.grammar_lazy = true; + for (int i = 0; i < request->grammartriggers_size(); i++) { + common_grammar_trigger trigger; + trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_WORD; + trigger.value = request->grammartriggers(i).word(); + // trigger.at_start = request->grammartriggers(i).at_start(); + llama.grammar_triggers.push_back(trigger); + LOG_INFO("grammar trigger", { + { "word", trigger.value }, + }); + } + } +} + + +// GRPC Server start +class BackendServiceImpl final : public backend::Backend::Service { +public: + grpc::Status Health(ServerContext* context, const backend::HealthMessage* request, backend::Reply* reply) { + // Implement Health RPC + reply->set_message("OK"); + return Status::OK; + } + + grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) { + // Implement LoadModel RPC + gpt_params params; + params_parse(request, params, llama); + + llama_backend_init(); + llama_numa_init(params.numa); + + // load the model + if (!llama.load_model(params)) + { + result->set_message("Failed loading model"); + result->set_success(false); + return Status::CANCELLED; + } + llama.initialize(); + result->set_message("Loading succeeded"); + result->set_success(true); + 
loaded_model = true; + return Status::OK; + } + grpc::Status PredictStream(grpc::ServerContext* context, const backend::PredictOptions* request, grpc::ServerWriter* writer) override { + json data = parse_options(true, request, llama); + const int task_id = llama.queue_tasks.get_new_id(); + llama.queue_results.add_waiting_task_id(task_id); + llama.request_completion(task_id, data, false, false, -1); + while (true) + { + task_result result = llama.queue_results.recv(task_id); + if (!result.error) { + const std::string str = + "data: " + + result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; + LOG_VERBOSE("data stream", { + { "to_send", str } + }); + + backend::Reply reply; + // print it + std::string completion_text = result.result_json.value("content", ""); + + reply.set_message(completion_text); + int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0); + reply.set_tokens(tokens_predicted); + int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0); + reply.set_prompt_tokens(tokens_evaluated); + + if (result.result_json.contains("timings")) { + double timing_prompt_processing = result.result_json.at("timings").value("prompt_ms", 0.0); + reply.set_timing_prompt_processing(timing_prompt_processing); + double timing_token_generation = result.result_json.at("timings").value("predicted_ms", 0.0); + reply.set_timing_token_generation(timing_token_generation); + } + + // Log Request Correlation Id + LOG_VERBOSE("correlation:", { + { "id", data["correlation_id"] } + }); + + // Send the reply + writer->Write(reply); + + if (result.stop) { + break; + } + } else { + break; + } + } + + return grpc::Status::OK; + } + + + grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) { + json data = parse_options(false, request, llama); + const int task_id = llama.queue_tasks.get_new_id(); + llama.queue_results.add_waiting_task_id(task_id); + 
llama.request_completion(task_id, data, false, false, -1); + std::string completion_text; + task_result result = llama.queue_results.recv(task_id); + if (!result.error && result.stop) { + + // Log Request Correlation Id + LOG_VERBOSE("correlation:", { + { "id", data["correlation_id"] } + }); + + completion_text = result.result_json.value("content", ""); + int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0); + int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0); + reply->set_prompt_tokens(tokens_evaluated); + reply->set_tokens(tokens_predicted); + reply->set_message(completion_text); + + if (result.result_json.contains("timings")) { + double timing_prompt_processing = result.result_json.at("timings").value("prompt_ms", 0.0); + reply->set_timing_prompt_processing(timing_prompt_processing); + double timing_token_generation = result.result_json.at("timings").value("predicted_ms", 0.0); + reply->set_timing_token_generation(timing_token_generation); + } + } + else + { + return grpc::Status::OK; + } + + return grpc::Status::OK; + } + + /// https://github.com/ggerganov/llama.cpp/blob/aa2341298924ac89778252015efcb792f2df1e20/examples/server/server.cpp#L2969 + grpc::Status Embedding(ServerContext* context, const backend::PredictOptions* request, backend::EmbeddingResult* embeddingResult) { + json data = parse_options(false, request, llama); + const int task_id = llama.queue_tasks.get_new_id(); + llama.queue_results.add_waiting_task_id(task_id); + llama.request_completion(task_id, { {"prompt", data["embeddings"]}, { "n_predict", 0}, {"image_data", ""} }, false, true, -1); + // get the result + task_result result = llama.queue_results.recv(task_id); + //std::cout << "Embedding result JSON" << result.result_json.dump() << std::endl; + llama.queue_results.remove_waiting_task_id(task_id); + if (!result.error && result.stop) { + std::vector embeddings = result.result_json.value("embedding", std::vector()); + // loop the vector and set 
the embeddings results + for (int i = 0; i < embeddings.size(); i++) { + embeddingResult->add_embeddings(embeddings[i]); + } + } + else + { + return grpc::Status::OK; + } + + return grpc::Status::OK; + } + + grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response){ + json data = parse_options(false, request, llama); + + std::vector tokens = llama.tokenize(data["prompt"],false); + + for (int i=0 ; i< tokens.size(); i++){ + response->add_tokens(tokens[i]); + } + + return grpc::Status::OK; + } + + grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) { + llama_client_slot* active_slot = llama.get_active_slot(); + + if (active_slot != nullptr) { + // Calculate the tokens per second using existing logic + double tokens_per_second = 1e3 / active_slot->t_token_generation * active_slot->n_decoded; + + // Populate the response with metrics + response->set_slot_id(active_slot->id); + response->set_prompt_json_for_slot(active_slot->prompt.dump()); + response->set_tokens_per_second(tokens_per_second); + response->set_tokens_generated(active_slot->n_decoded); + response->set_prompt_tokens_processed(active_slot->num_prompt_tokens_processed); + } else { + // Handle case when no active slot exists + response->set_slot_id(0); + response->set_prompt_json_for_slot(""); + response->set_tokens_per_second(0); + response->set_tokens_generated(0); + response->set_prompt_tokens_processed(0); + } + + return grpc::Status::OK; + } +}; + +void RunServer(const std::string& server_address) { + BackendServiceImpl service; + + ServerBuilder builder; + builder.AddListeningPort(server_address, grpc::InsecureServerCredentials()); + builder.RegisterService(&service); + + std::unique_ptr server(builder.BuildAndStart()); + std::cout << "Server listening on " << server_address << std::endl; + server->Wait(); +} + +int main(int argc, char** argv) { + std::string 
server_address("localhost:50051"); + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = signal_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); + sigaction(SIGTERM, &sigint_action, NULL); +#elif defined (_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); +#endif + + // Define long and short options + struct option long_options[] = { + {"addr", required_argument, nullptr, 'a'}, + {nullptr, 0, nullptr, 0} + }; + + // Parse command-line arguments + int option; + int option_index = 0; + while ((option = getopt_long(argc, argv, "a:", long_options, &option_index)) != -1) { + switch (option) { + case 'a': + server_address = optarg; + break; + default: + std::cerr << "Usage: " << argv[0] << " [--addr=
] or [-a
]" << std::endl; + return 1; + } + } + + // run the HTTP server in a thread - see comment below + std::thread t([&]() + { + RunServer(server_address); + return 0; + }); + + + //); + start_llama_server(); + std::cout << "stopping" << std::endl; + + t.join(); + + llama_backend_free(); + return 0; +} diff --git a/backend/cpp/ik-llama-cpp/package.sh b/backend/cpp/ik-llama-cpp/package.sh new file mode 100644 index 000000000..56d430563 --- /dev/null +++ b/backend/cpp/ik-llama-cpp/package.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Script to copy the appropriate libraries based on architecture +# This script is used in the final stage of the Dockerfile + +set -e + +CURDIR=$(dirname "$(realpath $0)") +REPO_ROOT="${CURDIR}/../../.." + +# Create lib directory +mkdir -p $CURDIR/package/lib + +cp -avrf $CURDIR/ik-llama-cpp-* $CURDIR/package/ +cp -rfv $CURDIR/run.sh $CURDIR/package/ + +# Detect architecture and copy appropriate libraries +if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then + # x86_64 architecture + echo "Detected x86_64 architecture, copying x86_64 libraries..." + cp -arfLv /lib64/ld-linux-x86-64.so.2 $CURDIR/package/lib/ld.so + cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6 + cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1 + cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6 + cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6 + cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1 + cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2 + cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1 + cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0 +elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then + # ARM64 architecture + echo "Detected ARM64 architecture, copying ARM64 libraries..." 
+ cp -arfLv /lib/ld-linux-aarch64.so.1 $CURDIR/package/lib/ld.so + cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6 + cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1 + cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6 + cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6 + cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1 + cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2 + cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1 + cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0 +else + echo "Error: Could not detect architecture" + exit 1 +fi + +# Package GPU libraries based on BUILD_TYPE +# The GPU library packaging script will detect BUILD_TYPE and copy appropriate GPU libraries +GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh" +if [ -f "$GPU_LIB_SCRIPT" ]; then + echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..." 
+ source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib" + package_gpu_libs +fi + +echo "Packaging completed successfully" +ls -liah $CURDIR/package/ +ls -liah $CURDIR/package/lib/ diff --git a/backend/cpp/ik-llama-cpp/patches/0001-fix-missing-cstdint-include.patch b/backend/cpp/ik-llama-cpp/patches/0001-fix-missing-cstdint-include.patch new file mode 100644 index 000000000..ef5710fc4 --- /dev/null +++ b/backend/cpp/ik-llama-cpp/patches/0001-fix-missing-cstdint-include.patch @@ -0,0 +1,10 @@ +--- a/ggml/src/iqk/iqk_common.h ++++ b/ggml/src/iqk/iqk_common.h +@@ -9,6 +9,7 @@ + #pragma once + + #include "iqk_config.h" ++#include + + #if defined IQK_IMPLEMENT + diff --git a/backend/cpp/ik-llama-cpp/prepare.sh b/backend/cpp/ik-llama-cpp/prepare.sh new file mode 100644 index 000000000..fb0ba7624 --- /dev/null +++ b/backend/cpp/ik-llama-cpp/prepare.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +## Patches + +## Apply patches from the `patches` directory +if [ -d "patches" ]; then + for patch in $(ls patches); do + echo "Applying patch $patch" + patch -d llama.cpp/ -p1 < patches/$patch + done +fi + +set -e + +cp -r CMakeLists.txt llama.cpp/examples/grpc-server/ +cp -r grpc-server.cpp llama.cpp/examples/grpc-server/ +cp -r utils.hpp llama.cpp/examples/grpc-server/ +cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/examples/grpc-server/ + +## Copy clip/llava files for multimodal support (built as myclip library) +cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h +cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp +cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp +# Prepend llama.h include to llava.h +echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h +cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h +# Copy clip-impl.h if it exists +if [ -f llama.cpp/examples/llava/clip-impl.h ]; then + cp -rfv llama.cpp/examples/llava/clip-impl.h 
llama.cpp/examples/grpc-server/clip-impl.h +fi +# Copy stb_image.h +if [ -f llama.cpp/vendor/stb/stb_image.h ]; then + cp -rfv llama.cpp/vendor/stb/stb_image.h llama.cpp/examples/grpc-server/stb_image.h +elif [ -f llama.cpp/common/stb_image.h ]; then + cp -rfv llama.cpp/common/stb_image.h llama.cpp/examples/grpc-server/stb_image.h +fi + +## Fix API compatibility in llava.cpp (llama_n_embd -> llama_model_n_embd) +if [ -f llama.cpp/examples/grpc-server/llava.cpp ]; then + sed -i 's/llama_n_embd(/llama_model_n_embd(/g' llama.cpp/examples/grpc-server/llava.cpp +fi + +set +e +if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then + echo "grpc-server already added" +else + echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt +fi +set -e diff --git a/backend/cpp/ik-llama-cpp/run.sh b/backend/cpp/ik-llama-cpp/run.sh new file mode 100644 index 000000000..1c4ee2a69 --- /dev/null +++ b/backend/cpp/ik-llama-cpp/run.sh @@ -0,0 +1,40 @@ +#!/bin/bash +set -ex + +# Get the absolute current dir where the script is located +CURDIR=$(dirname "$(realpath $0)") + +cd / + +echo "CPU info:" +grep -e "model\sname" /proc/cpuinfo | head -1 +grep -e "flags" /proc/cpuinfo | head -1 + +# ik_llama.cpp requires AVX2 — default to avx2 binary +BINARY=ik-llama-cpp-avx2 + +if [ -e $CURDIR/ik-llama-cpp-fallback ] && ! 
grep -q -e "\savx2\s" /proc/cpuinfo ; then
+    echo "CPU: AVX2 NOT found, using fallback"
+    BINARY=ik-llama-cpp-fallback
+fi
+
+# Extend ld library path with the dir where this script is located/lib
+if [ "$(uname)" == "Darwin" ]; then
+    export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
+    #export DYLD_FALLBACK_LIBRARY_PATH=$CURDIR/lib:$DYLD_FALLBACK_LIBRARY_PATH
+else
+    export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
+fi
+
+# If there is a lib/ld.so, use it
+if [ -f $CURDIR/lib/ld.so ]; then
+    echo "Using lib/ld.so"
+    echo "Using binary: $BINARY"
+    exec $CURDIR/lib/ld.so $CURDIR/$BINARY "$@"
+fi
+
+echo "Using binary: $BINARY"
+exec $CURDIR/$BINARY "$@"
+
+# We should never reach this point, however just in case we do, run fallback
+exec $CURDIR/ik-llama-cpp-fallback "$@"
diff --git a/backend/cpp/ik-llama-cpp/utils.hpp b/backend/cpp/ik-llama-cpp/utils.hpp
new file mode 100644
index 000000000..e5cf2a009
--- /dev/null
+++ b/backend/cpp/ik-llama-cpp/utils.hpp
@@ -0,0 +1,483 @@
+// https://github.com/ggerganov/llama.cpp/blob/master/examples/server/utils.hpp
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <set>
+#include <mutex>
+#include <condition_variable>
+#include <unordered_map>
+
+#include "json.hpp"
+
+#include "clip.h"
+
+using json = nlohmann::json;
+
+extern bool server_verbose;
+
+#ifndef SERVER_VERBOSE
+#define SERVER_VERBOSE 1
+#endif
+
+#if SERVER_VERBOSE != 1
+#define LOG_VERBOSE(MSG, ...)
+#else
+#define LOG_VERBOSE(MSG, ...)                                            \
+    do                                                                   \
+    {                                                                    \
+        if (server_verbose)                                              \
+        {                                                                \
+            server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
+        }                                                                \
+    } while (0)
+#endif
+
+#define LOG_ERROR(  MSG, ...) server_log("ERROR",   __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_INFO(   MSG, ...)
server_log("INFO",    __func__, __LINE__, MSG, __VA_ARGS__)
+
+//
+// parallel
+//
+
+enum server_state {
+    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
+    SERVER_STATE_READY,          // Server is ready and model is loaded
+    SERVER_STATE_ERROR           // An error occurred, load_model failed
+};
+
+enum task_type {
+    TASK_TYPE_COMPLETION,
+    TASK_TYPE_CANCEL,
+    TASK_TYPE_NEXT_RESPONSE
+};
+
+struct task_server {
+    int id = -1; // to be filled by llama_server_queue
+    int target_id;
+    task_type type;
+    json data;
+    bool infill_mode = false;
+    bool embedding_mode = false;
+    int multitask_id = -1;
+};
+
+struct task_result {
+    int id;
+    int multitask_id = -1;
+    bool stop;
+    bool error;
+    json result_json;
+};
+
+struct task_multi {
+    int id;
+    std::set<int> subtasks_remaining{};
+    std::vector<task_result> results{};
+};
+
+// TODO: can become bool if we can't find use of more states
+enum slot_state
+{
+    IDLE,
+    PROCESSING,
+};
+
+enum slot_command
+{
+    NONE,
+    LOAD_PROMPT,
+    RELEASE,
+};
+
+struct slot_params
+{
+    bool stream = true;
+    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
+
+    uint32_t seed = -1; // RNG seed
+    int32_t n_keep = 0; // number of tokens to keep from initial prompt
+    int32_t n_predict = -1; // new tokens to predict
+
+    std::vector<std::string> antiprompt;
+
+    json input_prefix;
+    json input_suffix;
+};
+
+struct slot_image
+{
+    int32_t id;
+
+    bool request_encode_image = false;
+    float * image_embedding = nullptr;
+    int32_t image_tokens = 0;
+
+    clip_image_u8 * img_data;
+
+    std::string prefix_prompt; // before of this image
+};
+
+// completion token output with probabilities
+struct completion_token_output
+{
+    struct token_prob
+    {
+        llama_token tok;
+        float prob;
+    };
+
+    std::vector<token_prob> probs;
+    llama_token tok;
+    std::string text_to_send;
+};
+
+static inline void server_log(const char *level, const char *function, int line,
+                              const char *message, const nlohmann::ordered_json &extra)
+{
+    nlohmann::ordered_json log
+    {
{"timestamp", time(nullptr)},
+        {"level",     level},
+        {"function",  function},
+        {"line",      line},
+        {"message",   message},
+    };
+
+    if (!extra.empty())
+    {
+        log.merge_patch(extra);
+    }
+
+    const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
+    printf("%.*s\n", (int)str.size(), str.data());
+    fflush(stdout);
+}
+
+//
+// server utils
+//
+
+template <typename T>
+static T json_value(const json &body, const std::string &key, const T &default_value)
+{
+    // Fallback null to default value
+    return body.contains(key) && !body.at(key).is_null()
+               ? body.value(key, default_value)
+               : default_value;
+}
+
+inline std::string format_chatml(std::vector<json> messages)
+{
+    std::ostringstream chatml_msgs;
+
+    for (auto it = messages.begin(); it != messages.end(); ++it) {
+        chatml_msgs << "<|im_start|>"
+                    << json_value(*it, "role", std::string("user")) << '\n';
+        chatml_msgs << json_value(*it, "content", std::string(""))
+                    << "<|im_end|>\n";
+    }
+
+    chatml_msgs << "<|im_start|>assistant" << '\n';
+
+    return chatml_msgs.str();
+}
+
+//
+// work queue utils
+//
+
+struct llama_server_queue {
+    int id = 0;
+    std::mutex mutex_tasks;
+    // queues
+    std::vector<task_server> queue_tasks;
+    std::vector<task_server> queue_tasks_deferred;
+    std::vector<task_multi> queue_multitasks;
+    std::condition_variable condition_tasks;
+    // callback functions
+    std::function<void(task_server&)> callback_new_task;
+    std::function<void(task_multi&)> callback_finish_multitask;
+    std::function<void(void)> callback_all_task_finished;
+
+    // Add a new task to the end of the queue
+    int post(task_server task) {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        if (task.id == -1) {
+            task.id = id++;
+        }
+        queue_tasks.push_back(std::move(task));
+        condition_tasks.notify_one();
+        return task.id;
+    }
+
+    // Add a new task, but defer until one slot is available
+    void defer(task_server task) {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        queue_tasks_deferred.push_back(std::move(task));
+    }
+
+    // Get the next id for creating a new task
+    int get_new_id() {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        return id++;
+    }
+
+    // Register function to process a new task
+    void on_new_task(std::function<void(task_server&)> callback) {
+        callback_new_task = callback;
+    }
+
+    // Register function to process a multitask
+    void on_finish_multitask(std::function<void(task_multi&)> callback) {
+        callback_finish_multitask = callback;
+    }
+
+    // Register the function to be called when the batch of tasks is finished
+    void on_all_tasks_finished(std::function<void(void)> callback) {
+        callback_all_task_finished = callback;
+    }
+
+    // Call when the state of one slot is changed
+    void notify_slot_changed() {
+        // move deferred tasks back to main loop
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        for (auto & task : queue_tasks_deferred) {
+            queue_tasks.push_back(std::move(task));
+        }
+        queue_tasks_deferred.clear();
+    }
+
+    // Start the main loop. This call is blocking
+    [[noreturn]]
+    void start_loop() {
+        while (true) {
+            // new task arrived
+            LOG_VERBOSE("have new task", {});
+            {
+                while (true)
+                {
+                    std::unique_lock<std::mutex> lock(mutex_tasks);
+                    if (queue_tasks.empty()) {
+                        lock.unlock();
+                        break;
+                    }
+                    task_server task = queue_tasks.front();
+                    queue_tasks.erase(queue_tasks.begin());
+                    lock.unlock();
+                    LOG_VERBOSE("callback_new_task", {});
+                    callback_new_task(task);
+                }
+                LOG_VERBOSE("callback_all_task_finished", {});
+                // process and update all the multitasks
+                auto queue_iterator = queue_multitasks.begin();
+                while (queue_iterator != queue_multitasks.end())
+                {
+                    if (queue_iterator->subtasks_remaining.empty())
+                    {
+                        // all subtasks done == multitask is done
+                        task_multi current_multitask = *queue_iterator;
+                        callback_finish_multitask(current_multitask);
+                        // remove this multitask
+                        queue_iterator = queue_multitasks.erase(queue_iterator);
+                    }
+                    else
+                    {
+                        ++queue_iterator;
+                    }
+                }
+                // all tasks in the current loop is finished
+                callback_all_task_finished();
+            }
+            LOG_VERBOSE("wait for new task", {});
+            // wait for new task
+            {
+                std::unique_lock<std::mutex> lock(mutex_tasks);
+                if (queue_tasks.empty()) {
+                    condition_tasks.wait(lock, [&]{
+                        return
!queue_tasks.empty();
+                    });
+                }
+            }
+        }
+    }
+
+    //
+    // functions to manage multitasks
+    //
+
+    // add a multitask by specifying the id of all subtask (subtask is a task_server)
+    void add_multitask(int multitask_id, std::vector<int>& sub_ids)
+    {
+        std::lock_guard<std::mutex> lock(mutex_tasks);
+        task_multi multi;
+        multi.id = multitask_id;
+        std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
+        queue_multitasks.push_back(multi);
+    }
+
+    // update the remaining subtasks, while appending results to multitask
+    void update_multitask(int multitask_id, int subtask_id, task_result& result)
+    {
+        std::lock_guard<std::mutex> lock(mutex_tasks);
+        for (auto& multitask : queue_multitasks)
+        {
+            if (multitask.id == multitask_id)
+            {
+                multitask.subtasks_remaining.erase(subtask_id);
+                multitask.results.push_back(result);
+            }
+        }
+    }
+};
+
+struct llama_server_response {
+    typedef std::function<void(int, int, task_result&)> callback_multitask_t;
+    callback_multitask_t callback_update_multitask;
+    // for keeping track of all tasks waiting for the result
+    std::set<int> waiting_task_ids;
+    // the main result queue
+    std::vector<task_result> queue_results;
+    std::mutex mutex_results;
+    std::condition_variable condition_results;
+
+    void add_waiting_task_id(int task_id) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+        waiting_task_ids.insert(task_id);
+    }
+
+    void remove_waiting_task_id(int task_id) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+        waiting_task_ids.erase(task_id);
+    }
+
+    // This function blocks the thread until there is a response for this task_id
+    task_result recv(int task_id) {
+        while (true)
+        {
+            std::unique_lock<std::mutex> lock(mutex_results);
+            condition_results.wait(lock, [&]{
+                return !queue_results.empty();
+            });
+            LOG_VERBOSE("condition_results unblock", {});
+
+            for (int i = 0; i < (int) queue_results.size(); i++)
+            {
+                if (queue_results[i].id == task_id)
+                {
+                    assert(queue_results[i].multitask_id == -1);
+                    task_result res = queue_results[i];
+                    queue_results.erase(queue_results.begin()
+ i);
+                    return res;
+                }
+            }
+        }
+
+        // should never reach here
+    }
+
+    // Register the function to update multitask
+    void on_multitask_update(callback_multitask_t callback) {
+        callback_update_multitask = callback;
+    }
+
+    // Send a new result to a waiting task_id
+    void send(task_result result) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+        LOG_VERBOSE("send new result", {});
+        for (auto& task_id : waiting_task_ids) {
+            // LOG_TEE("waiting task id %i \n", task_id);
+            // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
+            if (result.multitask_id == task_id)
+            {
+                LOG_VERBOSE("callback_update_multitask", {});
+                callback_update_multitask(task_id, result.id, result);
+                continue;
+            }
+
+            if (result.id == task_id)
+            {
+                LOG_VERBOSE("queue_results.push_back", {});
+                queue_results.push_back(result);
+                condition_results.notify_one();
+                return;
+            }
+        }
+    }
+};
+
+//
+// base64 utils (TODO: move to common in the future)
+//
+
+static const std::string base64_chars =
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+    "abcdefghijklmnopqrstuvwxyz"
+    "0123456789+/";
+
+static inline bool is_base64(uint8_t c)
+{
+    return (isalnum(c) || (c == '+') || (c == '/'));
+}
+
+static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
+{
+    int i = 0;
+    int j = 0;
+    int in_ = 0;
+
+    int in_len = encoded_string.size();
+
+    uint8_t char_array_4[4];
+    uint8_t char_array_3[3];
+
+    std::vector<uint8_t> ret;
+
+    while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
+    {
+        char_array_4[i++] = encoded_string[in_]; in_++;
+        if (i == 4)
+        {
+            for (i = 0; i <4; i++)
+            {
+                char_array_4[i] = base64_chars.find(char_array_4[i]);
+            }
+
+            char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
+            char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+            char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
+
+            for (i = 0; (i < 3); i++)
+            {
+                ret.push_back(char_array_3[i]);
+ } + i = 0; + } + } + + if (i) + { + for (j = i; j <4; j++) + { + char_array_4[j] = 0; + } + + for (j = 0; j <4; j++) + { + char_array_4[j] = base64_chars.find(char_array_4[j]); + } + + char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + + for (j = 0; (j < i - 1); j++) + { + ret.push_back(char_array_3[j]); + } + } + + return ret; +} \ No newline at end of file diff --git a/backend/index.yaml b/backend/index.yaml index 1546c1af3..a1f5688a8 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -29,6 +29,20 @@ nvidia-cuda-12: "cuda12-llama-cpp" nvidia-l4t-cuda-12: "nvidia-l4t-arm64-llama-cpp" nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp" +- &ikllamacpp + name: "ik-llama-cpp" + alias: "ik-llama-cpp" + license: mit + description: | + Fork of llama.cpp optimized for CPU performance by ikawrakow + urls: + - https://github.com/ikawrakow/ik_llama.cpp + tags: + - text-to-text + - LLM + - CPU + capabilities: + default: "cpu-ik-llama-cpp" - &whispercpp name: "whisper" alias: "whisper" @@ -897,6 +911,10 @@ nvidia-cuda-12: "cuda12-llama-cpp-development" nvidia-l4t-cuda-12: "nvidia-l4t-arm64-llama-cpp-development" nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp-development" +- !!merge <<: *ikllamacpp + name: "ik-llama-cpp-development" + capabilities: + default: "cpu-ik-llama-cpp-development" - !!merge <<: *neutts name: "cpu-neutts" uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-neutts" @@ -1327,6 +1345,17 @@ uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-llama-cpp" mirrors: - localai/localai-backends:master-gpu-nvidia-cuda-13-llama-cpp +## ik-llama-cpp +- !!merge <<: *ikllamacpp + name: "cpu-ik-llama-cpp" + uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-ik-llama-cpp" + mirrors: + - localai/localai-backends:latest-cpu-ik-llama-cpp +- !!merge 
<<: *ikllamacpp + name: "cpu-ik-llama-cpp-development" + uri: "quay.io/go-skynet/local-ai-backends:master-cpu-ik-llama-cpp" + mirrors: + - localai/localai-backends:master-cpu-ik-llama-cpp ## whisper - !!merge <<: *whispercpp name: "nvidia-l4t-arm64-whisper" diff --git a/docs/content/features/text-generation.md b/docs/content/features/text-generation.md index aa8608ec3..bfcbf650a 100644 --- a/docs/content/features/text-generation.md +++ b/docs/content/features/text-generation.md @@ -539,6 +539,47 @@ options: - [llama](https://github.com/ggerganov/llama.cpp) +### ik_llama.cpp + +[ik_llama.cpp](https://github.com/ikawrakow/ik_llama.cpp) is a hard fork of `llama.cpp` by Iwan Kawrakow that focuses on superior CPU and hybrid GPU/CPU performance. It ships additional quantization types (IQK quants), custom quantization mixes, Multi-head Latent Attention (MLA) for DeepSeek models, and fine-grained tensor offload controls — particularly useful for running very large models on commodity CPU hardware. + +{{% notice note %}} + +The `ik-llama-cpp` backend requires a CPU with **AVX2** support. The IQK kernels are not compatible with older CPUs. + +{{% /notice %}} + +#### Features + +The `ik-llama-cpp` backend supports the following features: +- [📖 Text generation (GPT)]({{%relref "features/text-generation" %}}) +- [🧠 Embeddings]({{%relref "features/embeddings" %}}) +- IQK quantization types for better CPU inference performance +- Multimodal models (via clip/llava) + +#### Setup + +The backend is distributed as a separate container image and can be installed from the LocalAI backend gallery, or specified directly in a model configuration. GGUF models loaded with this backend benefit from ik_llama.cpp's optimized CPU kernels — especially useful for MoE models and large quantized models that would otherwise be GPU-bound. 
+ +#### YAML configuration + +To use the `ik-llama-cpp` backend, specify it as the backend in the YAML file: + +```yaml +name: my-model +backend: ik-llama-cpp +parameters: + # Relative to the models path + model: file.gguf +``` + +The aliases `ik-llama` and `ik_llama` are also accepted. + +#### Reference + +- [ik_llama.cpp](https://github.com/ikawrakow/ik_llama.cpp) + + ### vLLM [vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference. diff --git a/docs/content/reference/compatibility-table.md b/docs/content/reference/compatibility-table.md index 80cf4e781..5a2ce0cf2 100644 --- a/docs/content/reference/compatibility-table.md +++ b/docs/content/reference/compatibility-table.md @@ -19,6 +19,7 @@ LocalAI will attempt to automatically load models which are not explicitly confi | Backend | Description | Capability | Embeddings | Streaming | Acceleration | |---------|-------------|------------|------------|-----------|-------------| | [llama.cpp](https://github.com/ggerganov/llama.cpp) | LLM inference in C/C++. 
Supports LLaMA, Mamba, RWKV, Falcon, Starcoder, GPT-2, [and many others](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description) | GPT, Functions | yes | yes | CPU, CUDA 12/13, ROCm, Intel SYCL, Vulkan, Metal, Jetson L4T | +| [ik_llama.cpp](https://github.com/ikawrakow/ik_llama.cpp) | Hard fork of llama.cpp optimized for CPU/hybrid CPU+GPU with IQK quants, custom quant mixes, and MLA for DeepSeek | GPT | yes | yes | CPU (AVX2+) | | [vLLM](https://github.com/vllm-project/vllm) | Fast LLM serving with PagedAttention | GPT | no | no | CUDA 12, ROCm, Intel | | [vLLM Omni](https://github.com/vllm-project/vllm) | Unified multimodal generation (text, image, video, audio) | Multimodal GPT | no | no | CUDA 12, ROCm | | [transformers](https://github.com/huggingface/transformers) | HuggingFace Transformers framework | GPT, Embeddings, Multimodal | yes | yes* | CPU, CUDA 12/13, ROCm, Intel, Metal | diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index 2c75a2245..a3de78dd4 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -14,12 +14,15 @@ import ( ) const ( - LLamaCPP = "llama-cpp" + LLamaCPP = "llama-cpp" + IKLLamaCPP = "ik-llama-cpp" ) var Aliases = map[string]string{ "go-llama": LLamaCPP, "llama": LLamaCPP, + "ik_llama": IKLLamaCPP, + "ik-llama": IKLLamaCPP, "embedded-store": LocalStoreBackend, "huggingface-embeddings": TransformersBackend, "transformers-musicgen": TransformersBackend, diff --git a/scripts/changed-backends.js b/scripts/changed-backends.js index da2486300..4ef8b1874 100644 --- a/scripts/changed-backends.js +++ b/scripts/changed-backends.js @@ -24,6 +24,9 @@ function inferBackendPath(item) { if (item.dockerfile.endsWith("rust")) { return `backend/rust/${item.backend}/`; } + if (item.dockerfile.endsWith("ik-llama-cpp")) { + return `backend/cpp/ik-llama-cpp/`; + } if (item.dockerfile.endsWith("llama-cpp")) { return `backend/cpp/llama-cpp/`; } diff --git a/tests/e2e-backends/backend_test.go 
b/tests/e2e-backends/backend_test.go new file mode 100644 index 000000000..a800a7ab5 --- /dev/null +++ b/tests/e2e-backends/backend_test.go @@ -0,0 +1,342 @@ +package e2ebackends_test + +import ( + "context" + "fmt" + "io" + "net" + "os" + "os/exec" + "path/filepath" + "strings" + "time" + + pb "github.com/mudler/LocalAI/pkg/grpc/proto" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/phayes/freeport" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" +) + +// Environment variables consumed by the suite. +// +// Required (one of): +// +// BACKEND_IMAGE Docker image tag to test (e.g. local-ai-backend:llama-cpp). +// +// Required model source (one of): +// +// BACKEND_TEST_MODEL_URL HTTP(S) URL of a model file to download before the test. +// BACKEND_TEST_MODEL_FILE Path to an already-available model file (skips download). +// +// Optional: +// +// BACKEND_TEST_CAPS Comma-separated list of capabilities to exercise. +// Supported values: health, load, predict, stream, embeddings. +// Defaults to "health,load,predict,stream". +// A backend that only does embeddings would set this to +// "health,load,embeddings"; an image/TTS backend that cannot +// be driven by a text prompt can set it to "health,load". +// BACKEND_TEST_PROMPT Override the prompt used by predict/stream specs. +// BACKEND_TEST_CTX_SIZE Override the context size passed to LoadModel (default 512). +// BACKEND_TEST_THREADS Override Threads passed to LoadModel (default 4). +// +// The suite is intentionally model-format-agnostic: it only ever passes the +// file path to LoadModel, so GGUF, ONNX, safetensors, .bin etc. all work so +// long as the backend under test accepts that format. 
+const ( + capHealth = "health" + capLoad = "load" + capPredict = "predict" + capStream = "stream" + capEmbeddings = "embeddings" + + defaultPrompt = "The capital of France is" + streamPrompt = "Once upon a time" +) + +func defaultCaps() map[string]bool { + return map[string]bool{ + capHealth: true, + capLoad: true, + capPredict: true, + capStream: true, + } +} + +// parseCaps reads BACKEND_TEST_CAPS and returns the enabled capability set. +// An empty/unset value falls back to defaultCaps(). +func parseCaps() map[string]bool { + raw := strings.TrimSpace(os.Getenv("BACKEND_TEST_CAPS")) + if raw == "" { + return defaultCaps() + } + caps := map[string]bool{} + for _, part := range strings.Split(raw, ",") { + part = strings.TrimSpace(strings.ToLower(part)) + if part != "" { + caps[part] = true + } + } + return caps +} + +var _ = Describe("Backend container", Ordered, func() { + var ( + caps map[string]bool + workDir string + binaryDir string + modelFile string + addr string + serverCmd *exec.Cmd + conn *grpc.ClientConn + client pb.BackendClient + prompt string + ) + + BeforeAll(func() { + image := os.Getenv("BACKEND_IMAGE") + Expect(image).NotTo(BeEmpty(), "BACKEND_IMAGE env var must be set (e.g. local-ai-backend:llama-cpp)") + + modelURL := os.Getenv("BACKEND_TEST_MODEL_URL") + modelFile = os.Getenv("BACKEND_TEST_MODEL_FILE") + Expect(modelURL != "" || modelFile != "").To(BeTrue(), + "one of BACKEND_TEST_MODEL_URL or BACKEND_TEST_MODEL_FILE must be set") + + caps = parseCaps() + GinkgoWriter.Printf("Testing image=%q with capabilities=%v\n", image, keys(caps)) + + prompt = os.Getenv("BACKEND_TEST_PROMPT") + if prompt == "" { + prompt = defaultPrompt + } + + var err error + workDir, err = os.MkdirTemp("", "backend-e2e-*") + Expect(err).NotTo(HaveOccurred()) + + // Extract the image filesystem so we can run run.sh directly. 
+ binaryDir = filepath.Join(workDir, "rootfs") + Expect(os.MkdirAll(binaryDir, 0o755)).To(Succeed()) + extractImage(image, binaryDir) + Expect(filepath.Join(binaryDir, "run.sh")).To(BeAnExistingFile()) + + // Download the model once if not provided. + if modelFile == "" { + modelFile = filepath.Join(workDir, "model.bin") + downloadFile(modelURL, modelFile) + } + + // Pick a free port and launch the backend. + port, err := freeport.GetFreePort() + Expect(err).NotTo(HaveOccurred()) + addr = fmt.Sprintf("127.0.0.1:%d", port) + + Expect(os.Chmod(filepath.Join(binaryDir, "run.sh"), 0o755)).To(Succeed()) + // Mark any other top-level files executable (extraction may strip perms). + entries, _ := os.ReadDir(binaryDir) + for _, e := range entries { + if !e.IsDir() && !strings.HasSuffix(e.Name(), ".sh") { + _ = os.Chmod(filepath.Join(binaryDir, e.Name()), 0o755) + } + } + + serverCmd = exec.Command(filepath.Join(binaryDir, "run.sh"), "--addr="+addr) + serverCmd.Stdout = GinkgoWriter + serverCmd.Stderr = GinkgoWriter + Expect(serverCmd.Start()).To(Succeed()) + + // Wait for the gRPC port to accept connections. 
+ Eventually(func() error { + c, err := net.DialTimeout("tcp", addr, 500*time.Millisecond) + if err != nil { + return err + } + _ = c.Close() + return nil + }, 30*time.Second, 200*time.Millisecond).Should(Succeed(), "backend did not start") + + conn, err = grpc.Dial(addr, + grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(50*1024*1024)), + ) + Expect(err).NotTo(HaveOccurred()) + client = pb.NewBackendClient(conn) + }) + + AfterAll(func() { + if conn != nil { + _ = conn.Close() + } + if serverCmd != nil && serverCmd.Process != nil { + _ = serverCmd.Process.Kill() + _, _ = serverCmd.Process.Wait() + } + if workDir != "" { + _ = os.RemoveAll(workDir) + } + }) + + It("responds to Health", func() { + if !caps[capHealth] { + Skip("health capability not enabled") + } + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + res, err := client.Health(ctx, &pb.HealthMessage{}) + Expect(err).NotTo(HaveOccurred()) + Expect(res.GetMessage()).NotTo(BeEmpty()) + }) + + It("loads the model", func() { + if !caps[capLoad] { + Skip("load capability not enabled") + } + ctxSize := envInt32("BACKEND_TEST_CTX_SIZE", 512) + threads := envInt32("BACKEND_TEST_THREADS", 4) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + res, err := client.LoadModel(ctx, &pb.ModelOptions{ + Model: modelFile, + ModelFile: modelFile, + ContextSize: ctxSize, + Threads: threads, + NGPULayers: 0, + MMap: true, + NBatch: 128, + }) + Expect(err).NotTo(HaveOccurred()) + Expect(res.GetSuccess()).To(BeTrue(), "LoadModel failed: %s", res.GetMessage()) + }) + + It("generates output via Predict", func() { + if !caps[capPredict] { + Skip("predict capability not enabled") + } + ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second) + defer cancel() + res, err := client.Predict(ctx, &pb.PredictOptions{ + Prompt: prompt, + Tokens: 20, + Temperature: 0.1, + TopK: 
40, + TopP: 0.9, + }) + Expect(err).NotTo(HaveOccurred()) + Expect(res.GetMessage()).NotTo(BeEmpty(), "Predict produced empty output") + GinkgoWriter.Printf("Predict: %q (tokens=%d, prompt_tokens=%d)\n", + res.GetMessage(), res.GetTokens(), res.GetPromptTokens()) + }) + + It("streams output via PredictStream", func() { + if !caps[capStream] { + Skip("stream capability not enabled") + } + ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second) + defer cancel() + stream, err := client.PredictStream(ctx, &pb.PredictOptions{ + Prompt: streamPrompt, + Tokens: 20, + Temperature: 0.1, + TopK: 40, + TopP: 0.9, + }) + Expect(err).NotTo(HaveOccurred()) + + var chunks int + var combined string + for { + msg, err := stream.Recv() + if err == io.EOF { + break + } + Expect(err).NotTo(HaveOccurred()) + if len(msg.GetMessage()) > 0 { + chunks++ + combined += string(msg.GetMessage()) + } + } + Expect(chunks).To(BeNumerically(">", 0), "no stream chunks received") + GinkgoWriter.Printf("Stream: %d chunks, combined=%q\n", chunks, combined) + }) + + It("computes embeddings via Embedding", func() { + if !caps[capEmbeddings] { + Skip("embeddings capability not enabled") + } + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + res, err := client.Embedding(ctx, &pb.PredictOptions{ + Embeddings: prompt, + }) + Expect(err).NotTo(HaveOccurred()) + Expect(res.GetEmbeddings()).NotTo(BeEmpty(), "Embedding returned empty vector") + GinkgoWriter.Printf("Embedding: %d dims\n", len(res.GetEmbeddings())) + }) +}) + +// extractImage runs `docker create` + `docker export` to materialise the image +// rootfs into dest. Using export (not save) avoids dealing with layer tarballs. +func extractImage(image, dest string) { + GinkgoHelper() + // The backend images have no default ENTRYPOINT/CMD, so docker create fails + // unless we override one; run.sh is harmless and guaranteed to exist. 
+ create := exec.Command("docker", "create", "--entrypoint=/run.sh", image) + out, err := create.CombinedOutput() + Expect(err).NotTo(HaveOccurred(), "docker create failed: %s", string(out)) + cid := strings.TrimSpace(string(out)) + DeferCleanup(func() { + _ = exec.Command("docker", "rm", "-f", cid).Run() + }) + + // Pipe `docker export ` into `tar -xf - -C dest`. + exp := exec.Command("docker", "export", cid) + expOut, err := exp.StdoutPipe() + Expect(err).NotTo(HaveOccurred()) + exp.Stderr = GinkgoWriter + Expect(exp.Start()).To(Succeed()) + + tar := exec.Command("tar", "-xf", "-", "-C", dest) + tar.Stdin = expOut + tar.Stderr = GinkgoWriter + Expect(tar.Run()).To(Succeed()) + Expect(exp.Wait()).To(Succeed()) +} + +// downloadFile fetches url into dest using curl -L. Used for CI convenience; +// local runs can use BACKEND_TEST_MODEL_FILE to skip downloading. +func downloadFile(url, dest string) { + GinkgoHelper() + cmd := exec.Command("curl", "-sSfL", "-o", dest, url) + cmd.Stdout = GinkgoWriter + cmd.Stderr = GinkgoWriter + Expect(cmd.Run()).To(Succeed(), "failed to download %s", url) + fi, err := os.Stat(dest) + Expect(err).NotTo(HaveOccurred()) + Expect(fi.Size()).To(BeNumerically(">", 1024), "downloaded file is suspiciously small") +} + +func envInt32(name string, def int32) int32 { + raw := os.Getenv(name) + if raw == "" { + return def + } + var v int32 + _, err := fmt.Sscanf(raw, "%d", &v) + if err != nil { + return def + } + return v +} + +func keys(m map[string]bool) []string { + out := make([]string, 0, len(m)) + for k, v := range m { + if v { + out = append(out, k) + } + } + return out +} diff --git a/tests/e2e-backends/suite_test.go b/tests/e2e-backends/suite_test.go new file mode 100644 index 000000000..4ce1864d4 --- /dev/null +++ b/tests/e2e-backends/suite_test.go @@ -0,0 +1,24 @@ +// Package e2ebackends exercises a built backend container image end-to-end over +// its gRPC surface. 
+// +// The suite is intentionally backend-agnostic: it extracts a Docker image, +// launches the bundled run.sh entrypoint, then drives a configurable set of +// gRPC calls against the result. Specs are gated by capability flags so that a +// non-LLM backend (e.g. image generation, TTS, embeddings-only) can opt in to +// only the RPCs it implements. +// +// Configuration is entirely through environment variables — see backend_test.go +// for the full list. +package e2ebackends_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestBackendE2E(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Backend gRPC End-to-End Suite") +}