mirror of
https://github.com/mudler/LocalAI.git
synced 2026-04-16 12:59:33 -04:00
feat(backends): add ik-llama-cpp (#9326)
* feat(backends): add ik-llama-cpp Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * chore: add grpc e2e suite, hook to CI, update README Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Apply suggestion from @mudler Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com> * Apply suggestion from @mudler Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
151ad271f2
commit
9ca03cf9cc
13
.github/workflows/backend.yml
vendored
13
.github/workflows/backend.yml
vendored
@@ -1945,6 +1945,19 @@ jobs:
|
||||
dockerfile: "./backend/Dockerfile.llama-cpp"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-cpu-ik-llama-cpp'
|
||||
runs-on: 'bigger-runner'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "ik-llama-cpp"
|
||||
dockerfile: "./backend/Dockerfile.ik-llama-cpp"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "0"
|
||||
|
||||
4
.github/workflows/bump_deps.yaml
vendored
4
.github/workflows/bump_deps.yaml
vendored
@@ -14,6 +14,10 @@ jobs:
|
||||
variable: "LLAMA_VERSION"
|
||||
branch: "master"
|
||||
file: "backend/cpp/llama-cpp/Makefile"
|
||||
- repository: "ikawrakow/ik_llama.cpp"
|
||||
variable: "IK_LLAMA_VERSION"
|
||||
branch: "main"
|
||||
file: "backend/cpp/ik-llama-cpp/Makefile"
|
||||
- repository: "ggml-org/whisper.cpp"
|
||||
variable: "WHISPER_CPP_VERSION"
|
||||
branch: "master"
|
||||
|
||||
36
.github/workflows/test-extra.yml
vendored
36
.github/workflows/test-extra.yml
vendored
@@ -29,6 +29,8 @@ jobs:
|
||||
nemo: ${{ steps.detect.outputs.nemo }}
|
||||
voxcpm: ${{ steps.detect.outputs.voxcpm }}
|
||||
llama-cpp-quantization: ${{ steps.detect.outputs.llama-cpp-quantization }}
|
||||
llama-cpp: ${{ steps.detect.outputs.llama-cpp }}
|
||||
ik-llama-cpp: ${{ steps.detect.outputs.ik-llama-cpp }}
|
||||
acestep-cpp: ${{ steps.detect.outputs.acestep-cpp }}
|
||||
qwen3-tts-cpp: ${{ steps.detect.outputs.qwen3-tts-cpp }}
|
||||
voxtral: ${{ steps.detect.outputs.voxtral }}
|
||||
@@ -465,6 +467,40 @@ jobs:
|
||||
- name: Test llama-cpp-quantization
|
||||
run: |
|
||||
make --jobs=5 --output-sync=target -C backend/python/llama-cpp-quantization test
|
||||
tests-llama-cpp-grpc:
|
||||
needs: detect-changes
|
||||
if: needs.detect-changes.outputs.llama-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true'
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.25.4'
|
||||
- name: Build llama-cpp backend image and run gRPC e2e tests
|
||||
run: |
|
||||
make test-extra-backend-llama-cpp
|
||||
tests-ik-llama-cpp-grpc:
|
||||
needs: detect-changes
|
||||
if: needs.detect-changes.outputs.ik-llama-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true'
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.25.4'
|
||||
- name: Build ik-llama-cpp backend image and run gRPC e2e tests
|
||||
run: |
|
||||
make test-extra-backend-ik-llama-cpp
|
||||
tests-acestep-cpp:
|
||||
needs: detect-changes
|
||||
if: needs.detect-changes.outputs.acestep-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true'
|
||||
|
||||
46
Makefile
46
Makefile
@@ -456,6 +456,47 @@ test-extra: prepare-test-extra
|
||||
$(MAKE) -C backend/python/trl test
|
||||
$(MAKE) -C backend/rust/kokoros test
|
||||
|
||||
##
|
||||
## End-to-end gRPC tests that exercise a built backend container image.
|
||||
##
|
||||
## The test suite in tests/e2e-backends is backend-agnostic. You drive it via env
|
||||
## vars (see tests/e2e-backends/backend_test.go for the full list) and the
|
||||
## capability-driven harness picks which gRPC RPCs to exercise:
|
||||
##
|
||||
## BACKEND_IMAGE Required. Docker image to test, e.g. local-ai-backend:llama-cpp.
|
||||
## BACKEND_TEST_MODEL_URL URL of a model file to download and load.
|
||||
## BACKEND_TEST_MODEL_FILE Path to an already-downloaded model (skips download).
|
||||
## BACKEND_TEST_CAPS Comma-separated capabilities, default "health,load,predict,stream".
|
||||
## BACKEND_TEST_PROMPT Override the prompt used in predict/stream specs.
|
||||
##
|
||||
## Direct usage (image already built, no docker-build-* dependency):
|
||||
##
|
||||
## make test-extra-backend BACKEND_IMAGE=local-ai-backend:llama-cpp \
|
||||
## BACKEND_TEST_MODEL_URL=https://.../model.gguf
|
||||
##
|
||||
## Convenience wrappers below build a specific backend image first, then run the
|
||||
## suite against it.
|
||||
##
|
||||
BACKEND_TEST_MODEL_URL?=https://huggingface.co/Qwen/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q8_0.gguf
|
||||
|
||||
## Generic target — runs the suite against whatever BACKEND_IMAGE points at.
|
||||
## Depends on protogen-go so pkg/grpc/proto is generated before `go test`.
|
||||
test-extra-backend: protogen-go
|
||||
@test -n "$$BACKEND_IMAGE" || { echo "BACKEND_IMAGE must be set" >&2; exit 1; }
|
||||
BACKEND_IMAGE="$$BACKEND_IMAGE" \
|
||||
BACKEND_TEST_MODEL_URL="$${BACKEND_TEST_MODEL_URL:-$(BACKEND_TEST_MODEL_URL)}" \
|
||||
BACKEND_TEST_MODEL_FILE="$$BACKEND_TEST_MODEL_FILE" \
|
||||
BACKEND_TEST_CAPS="$$BACKEND_TEST_CAPS" \
|
||||
BACKEND_TEST_PROMPT="$$BACKEND_TEST_PROMPT" \
|
||||
go test -v -timeout 15m ./tests/e2e-backends/...
|
||||
|
||||
## Convenience wrappers: build the image, then exercise it.
|
||||
test-extra-backend-llama-cpp: docker-build-llama-cpp
|
||||
BACKEND_IMAGE=local-ai-backend:llama-cpp $(MAKE) test-extra-backend
|
||||
|
||||
test-extra-backend-ik-llama-cpp: docker-build-ik-llama-cpp
|
||||
BACKEND_IMAGE=local-ai-backend:ik-llama-cpp $(MAKE) test-extra-backend
|
||||
|
||||
DOCKER_IMAGE?=local-ai
|
||||
IMAGE_TYPE?=core
|
||||
BASE_IMAGE?=ubuntu:24.04
|
||||
@@ -549,6 +590,8 @@ backend-images:
|
||||
# Backend metadata: BACKEND_NAME | DOCKERFILE_TYPE | BUILD_CONTEXT | PROGRESS_FLAG | NEEDS_BACKEND_ARG
|
||||
# llama-cpp is special - uses llama-cpp Dockerfile and doesn't need BACKEND arg
|
||||
BACKEND_LLAMA_CPP = llama-cpp|llama-cpp|.|false|false
|
||||
# ik-llama-cpp is a fork of llama.cpp with superior CPU performance
|
||||
BACKEND_IK_LLAMA_CPP = ik-llama-cpp|ik-llama-cpp|.|false|false
|
||||
|
||||
# Golang backends
|
||||
BACKEND_PIPER = piper|golang|.|false|true
|
||||
@@ -619,6 +662,7 @@ endef
|
||||
|
||||
# Generate all docker-build targets
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_IK_LLAMA_CPP)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_PIPER)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_LOCAL_STORE)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_HUGGINGFACE)))
|
||||
@@ -663,7 +707,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_SAM3_CPP)))
|
||||
docker-save-%: backend-images
|
||||
docker save local-ai-backend:$* -o backend-images/$*.tar
|
||||
|
||||
docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp
|
||||
docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp
|
||||
|
||||
########################################################
|
||||
### Mock Backend for E2E Tests
|
||||
|
||||
281
backend/Dockerfile.ik-llama-cpp
Normal file
281
backend/Dockerfile.ik-llama-cpp
Normal file
@@ -0,0 +1,281 @@
|
||||
ARG BASE_IMAGE=ubuntu:24.04
|
||||
ARG GRPC_BASE_IMAGE=${BASE_IMAGE}
|
||||
|
||||
|
||||
# The grpc target does one thing, it builds and installs GRPC. This is in it's own layer so that it can be effectively cached by CI.
|
||||
# You probably don't need to change anything here, and if you do, make sure that CI is adjusted so that the cache continues to work.
|
||||
FROM ${GRPC_BASE_IMAGE} AS grpc
|
||||
|
||||
# This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
|
||||
ARG GRPC_MAKEFLAGS="-j4 -Otarget"
|
||||
ARG GRPC_VERSION=v1.65.0
|
||||
ARG CMAKE_FROM_SOURCE=false
|
||||
# CUDA Toolkit 13.x compatibility: CMake 3.31.9+ fixes toolchain detection/arch table issues
|
||||
ARG CMAKE_VERSION=3.31.10
|
||||
|
||||
ENV MAKEFLAGS=${GRPC_MAKEFLAGS}
|
||||
|
||||
WORKDIR /build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
ca-certificates \
|
||||
build-essential curl libssl-dev \
|
||||
git wget && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install CMake (the version in 22.04 is too old)
|
||||
RUN <<EOT bash
|
||||
if [ "${CMAKE_FROM_SOURCE}" = "true" ]; then
|
||||
curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
|
||||
else
|
||||
apt-get update && \
|
||||
apt-get install -y \
|
||||
cmake && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
fi
|
||||
EOT
|
||||
|
||||
# We install GRPC to a different prefix here so that we can copy in only the build artifacts later
|
||||
# saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
|
||||
# and running make install in the target container
|
||||
RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
|
||||
mkdir -p /build/grpc/cmake/build && \
|
||||
cd /build/grpc/cmake/build && \
|
||||
sed -i "216i\ TESTONLY" "../../third_party/abseil-cpp/absl/container/CMakeLists.txt" && \
|
||||
cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX:PATH=/opt/grpc ../.. && \
|
||||
make && \
|
||||
make install && \
|
||||
rm -rf /build
|
||||
|
||||
FROM ${BASE_IMAGE} AS builder
|
||||
ARG CMAKE_FROM_SOURCE=false
|
||||
ARG CMAKE_VERSION=3.31.10
|
||||
# We can target specific CUDA ARCHITECTURES like --build-arg CUDA_DOCKER_ARCH='75;86;89;120'
|
||||
ARG CUDA_DOCKER_ARCH
|
||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||
ARG CMAKE_ARGS
|
||||
ENV CMAKE_ARGS=${CMAKE_ARGS}
|
||||
ARG BACKEND=rerankers
|
||||
ARG BUILD_TYPE
|
||||
ENV BUILD_TYPE=${BUILD_TYPE}
|
||||
ARG CUDA_MAJOR_VERSION
|
||||
ARG CUDA_MINOR_VERSION
|
||||
ARG SKIP_DRIVERS=false
|
||||
ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
|
||||
ENV CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ARG TARGETARCH
|
||||
ARG TARGETVARIANT
|
||||
ARG GO_VERSION=1.25.4
|
||||
ARG UBUNTU_VERSION=2404
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
ccache git \
|
||||
ca-certificates \
|
||||
make \
|
||||
pkg-config libcurl4-openssl-dev \
|
||||
curl unzip \
|
||||
libssl-dev wget && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Cuda
|
||||
ENV PATH=/usr/local/cuda/bin:${PATH}
|
||||
|
||||
# HipBLAS requirements
|
||||
ENV PATH=/opt/rocm/bin:${PATH}
|
||||
|
||||
|
||||
# Vulkan requirements
|
||||
RUN <<EOT bash
|
||||
if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
|
||||
apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
software-properties-common pciutils wget gpg-agent && \
|
||||
apt-get install -y libglm-dev cmake libxcb-dri3-0 libxcb-present0 libpciaccess0 \
|
||||
libpng-dev libxcb-keysyms1-dev libxcb-dri3-dev libx11-dev g++ gcc \
|
||||
libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
|
||||
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
|
||||
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
|
||||
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
|
||||
if [ "amd64" = "$TARGETARCH" ]; then
|
||||
wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
|
||||
tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
|
||||
rm vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
|
||||
mkdir -p /opt/vulkan-sdk && \
|
||||
mv 1.4.335.0 /opt/vulkan-sdk/ && \
|
||||
cd /opt/vulkan-sdk/1.4.335.0 && \
|
||||
./vulkansdk --no-deps --maxjobs \
|
||||
vulkan-loader \
|
||||
vulkan-validationlayers \
|
||||
vulkan-extensionlayer \
|
||||
vulkan-tools \
|
||||
shaderc && \
|
||||
cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/bin/* /usr/bin/ && \
|
||||
cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/lib/* /usr/lib/x86_64-linux-gnu/ && \
|
||||
cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/include/* /usr/include/ && \
|
||||
cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/share/* /usr/share/ && \
|
||||
rm -rf /opt/vulkan-sdk
|
||||
fi
|
||||
if [ "arm64" = "$TARGETARCH" ]; then
|
||||
mkdir vulkan && cd vulkan && \
|
||||
curl -L -o vulkan-sdk.tar.xz https://github.com/mudler/vulkan-sdk-arm/releases/download/1.4.335.0/vulkansdk-ubuntu-24.04-arm-1.4.335.0.tar.xz && \
|
||||
tar -xvf vulkan-sdk.tar.xz && \
|
||||
rm vulkan-sdk.tar.xz && \
|
||||
cd 1.4.335.0 && \
|
||||
cp -rfv aarch64/bin/* /usr/bin/ && \
|
||||
cp -rfv aarch64/lib/* /usr/lib/aarch64-linux-gnu/ && \
|
||||
cp -rfv aarch64/include/* /usr/include/ && \
|
||||
cp -rfv aarch64/share/* /usr/share/ && \
|
||||
cd ../.. && \
|
||||
rm -rf vulkan
|
||||
fi
|
||||
ldconfig && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
fi
|
||||
EOT
|
||||
|
||||
# CuBLAS requirements
|
||||
RUN <<EOT bash
|
||||
if ( [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "l4t" ] ) && [ "${SKIP_DRIVERS}" = "false" ]; then
|
||||
apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
software-properties-common pciutils
|
||||
if [ "amd64" = "$TARGETARCH" ]; then
|
||||
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb
|
||||
fi
|
||||
if [ "arm64" = "$TARGETARCH" ]; then
|
||||
if [ "${CUDA_MAJOR_VERSION}" = "13" ]; then
|
||||
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
|
||||
else
|
||||
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/arm64/cuda-keyring_1.1-1_all.deb
|
||||
fi
|
||||
fi
|
||||
dpkg -i cuda-keyring_1.1-1_all.deb && \
|
||||
rm -f cuda-keyring_1.1-1_all.deb && \
|
||||
apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
|
||||
libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
|
||||
libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
|
||||
libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
|
||||
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
|
||||
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
|
||||
if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
|
||||
apt-get install -y --no-install-recommends \
|
||||
libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
|
||||
fi
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
fi
|
||||
EOT
|
||||
|
||||
|
||||
# https://github.com/NVIDIA/Isaac-GR00T/issues/343
|
||||
RUN <<EOT bash
|
||||
if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "arm64" ]; then
|
||||
wget https://developer.download.nvidia.com/compute/cudss/0.6.0/local_installers/cudss-local-tegra-repo-ubuntu${UBUNTU_VERSION}-0.6.0_0.6.0-1_arm64.deb && \
|
||||
dpkg -i cudss-local-tegra-repo-ubuntu${UBUNTU_VERSION}-0.6.0_0.6.0-1_arm64.deb && \
|
||||
cp /var/cudss-local-tegra-repo-ubuntu${UBUNTU_VERSION}-0.6.0/cudss-*-keyring.gpg /usr/share/keyrings/ && \
|
||||
apt-get update && apt-get -y install cudss cudss-cuda-${CUDA_MAJOR_VERSION} && \
|
||||
wget https://developer.download.nvidia.com/compute/nvpl/25.5/local_installers/nvpl-local-repo-ubuntu${UBUNTU_VERSION}-25.5_1.0-1_arm64.deb && \
|
||||
dpkg -i nvpl-local-repo-ubuntu${UBUNTU_VERSION}-25.5_1.0-1_arm64.deb && \
|
||||
cp /var/nvpl-local-repo-ubuntu${UBUNTU_VERSION}-25.5/nvpl-*-keyring.gpg /usr/share/keyrings/ && \
|
||||
apt-get update && apt-get install -y nvpl
|
||||
fi
|
||||
EOT
|
||||
|
||||
# If we are building with clblas support, we need the libraries for the builds
|
||||
RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
|
||||
apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
libclblast-dev && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/* \
|
||||
; fi
|
||||
|
||||
RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
|
||||
apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
hipblas-dev \
|
||||
rocblas-dev && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
# I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
|
||||
# to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
|
||||
ldconfig \
|
||||
; fi
|
||||
|
||||
RUN echo "TARGETARCH: $TARGETARCH"
|
||||
|
||||
# We need protoc installed, and the version in 22.04 is too old. We will create one as part installing the GRPC build below
|
||||
# but that will also being in a newer version of absl which stablediffusion cannot compile with. This version of protoc is only
|
||||
# here so that we can generate the grpc code for the stablediffusion build
|
||||
RUN <<EOT bash
|
||||
if [ "amd64" = "$TARGETARCH" ]; then
|
||||
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-x86_64.zip -o protoc.zip && \
|
||||
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
|
||||
rm protoc.zip
|
||||
fi
|
||||
if [ "arm64" = "$TARGETARCH" ]; then
|
||||
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-aarch_64.zip -o protoc.zip && \
|
||||
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
|
||||
rm protoc.zip
|
||||
fi
|
||||
EOT
|
||||
|
||||
# Install CMake (the version in 22.04 is too old)
|
||||
RUN <<EOT bash
|
||||
if [ "${CMAKE_FROM_SOURCE}" = "true" ]; then
|
||||
curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
|
||||
else
|
||||
apt-get update && \
|
||||
apt-get install -y \
|
||||
cmake && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
fi
|
||||
EOT
|
||||
|
||||
COPY --from=grpc /opt/grpc /usr/local
|
||||
|
||||
|
||||
COPY . /LocalAI
|
||||
|
||||
RUN <<'EOT' bash
|
||||
set -euxo pipefail
|
||||
|
||||
if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
|
||||
CUDA_ARCH_ESC="${CUDA_DOCKER_ARCH//;/\\;}"
|
||||
export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
|
||||
echo "CMAKE_ARGS(env) = ${CMAKE_ARGS}"
|
||||
rm -rf /LocalAI/backend/cpp/ik-llama-cpp-*-build
|
||||
fi
|
||||
|
||||
cd /LocalAI/backend/cpp/ik-llama-cpp
|
||||
|
||||
if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
|
||||
# ARM64 / ROCm: build without x86 SIMD
|
||||
make ik-llama-cpp-fallback
|
||||
else
|
||||
# ik_llama.cpp's IQK kernels require at least AVX2
|
||||
make ik-llama-cpp-avx2
|
||||
fi
|
||||
EOT
|
||||
|
||||
|
||||
# Copy libraries using a script to handle architecture differences
|
||||
RUN make -BC /LocalAI/backend/cpp/ik-llama-cpp package
|
||||
|
||||
|
||||
FROM scratch
|
||||
|
||||
|
||||
# Copy all available binaries (the build process only creates the appropriate ones for the target architecture)
|
||||
COPY --from=builder /LocalAI/backend/cpp/ik-llama-cpp/package/. ./
|
||||
78
backend/cpp/ik-llama-cpp/CMakeLists.txt
Normal file
78
backend/cpp/ik-llama-cpp/CMakeLists.txt
Normal file
@@ -0,0 +1,78 @@
|
||||
## Clip/LLaVA library for multimodal support — built locally from copied sources
|
||||
set(TARGET myclip)
|
||||
add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
|
||||
install(TARGETS ${TARGET} LIBRARY)
|
||||
target_include_directories(myclip PUBLIC .)
|
||||
target_include_directories(myclip PUBLIC ../..)
|
||||
target_include_directories(myclip PUBLIC ../../common)
|
||||
target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if (NOT MSVC)
|
||||
target_compile_options(${TARGET} PRIVATE -Wno-cast-qual)
|
||||
endif()
|
||||
|
||||
set(TARGET grpc-server)
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
cmake_minimum_required(VERSION 3.15)
|
||||
set(TARGET grpc-server)
|
||||
set(_PROTOBUF_LIBPROTOBUF libprotobuf)
|
||||
set(_REFLECTION grpc++_reflection)
|
||||
|
||||
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
|
||||
if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "arm64")
|
||||
set(HOMEBREW_DEFAULT_PREFIX "/opt/homebrew")
|
||||
else()
|
||||
set(HOMEBREW_DEFAULT_PREFIX "/usr/local")
|
||||
endif()
|
||||
link_directories("${HOMEBREW_DEFAULT_PREFIX}/lib")
|
||||
include_directories("${HOMEBREW_DEFAULT_PREFIX}/include")
|
||||
endif()
|
||||
|
||||
find_package(absl CONFIG REQUIRED)
|
||||
find_package(Protobuf CONFIG REQUIRED)
|
||||
find_package(gRPC CONFIG REQUIRED)
|
||||
|
||||
find_program(_PROTOBUF_PROTOC protoc)
|
||||
set(_GRPC_GRPCPP grpc++)
|
||||
find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
|
||||
|
||||
include_directories(${CMAKE_CURRENT_BINARY_DIR})
|
||||
include_directories(${Protobuf_INCLUDE_DIRS})
|
||||
|
||||
message(STATUS "Using protobuf version ${Protobuf_VERSION} | Protobuf_INCLUDE_DIRS: ${Protobuf_INCLUDE_DIRS} | CMAKE_CURRENT_BINARY_DIR: ${CMAKE_CURRENT_BINARY_DIR}")
|
||||
|
||||
# Proto file
|
||||
get_filename_component(hw_proto "../../../../../../backend/backend.proto" ABSOLUTE)
|
||||
get_filename_component(hw_proto_path "${hw_proto}" PATH)
|
||||
|
||||
set(hw_proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.cc")
|
||||
set(hw_proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.h")
|
||||
set(hw_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.cc")
|
||||
set(hw_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.h")
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT "${hw_proto_srcs}" "${hw_proto_hdrs}" "${hw_grpc_srcs}" "${hw_grpc_hdrs}"
|
||||
COMMAND ${_PROTOBUF_PROTOC}
|
||||
ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}"
|
||||
--cpp_out "${CMAKE_CURRENT_BINARY_DIR}"
|
||||
-I "${hw_proto_path}"
|
||||
--plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}"
|
||||
"${hw_proto}"
|
||||
DEPENDS "${hw_proto}")
|
||||
|
||||
add_library(hw_grpc_proto
|
||||
${hw_grpc_srcs}
|
||||
${hw_grpc_hdrs}
|
||||
${hw_proto_srcs}
|
||||
${hw_proto_hdrs} )
|
||||
|
||||
add_executable(${TARGET} grpc-server.cpp json.hpp)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
|
||||
absl::flags_parse
|
||||
gRPC::${_REFLECTION}
|
||||
gRPC::${_GRPC_GRPCPP}
|
||||
protobuf::${_PROTOBUF_LIBPROTOBUF})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
167
backend/cpp/ik-llama-cpp/Makefile
Normal file
167
backend/cpp/ik-llama-cpp/Makefile
Normal file
@@ -0,0 +1,167 @@
|
||||
|
||||
IK_LLAMA_VERSION?=08ae48c667e3dcd3025821a8585190b4a46c2f7c
|
||||
LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
|
||||
|
||||
CMAKE_ARGS?=
|
||||
BUILD_TYPE?=
|
||||
NATIVE?=false
|
||||
ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
|
||||
TARGET?=--target grpc-server
|
||||
JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1)
|
||||
ARCH?=$(shell uname -m)
|
||||
|
||||
# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
|
||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
|
||||
|
||||
CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
|
||||
ifeq ($(NATIVE),false)
|
||||
CMAKE_ARGS+=-DGGML_NATIVE=OFF -DLLAMA_OPENSSL=OFF
|
||||
endif
|
||||
# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
|
||||
ifeq ($(BUILD_TYPE),cublas)
|
||||
CMAKE_ARGS+=-DGGML_CUDA=ON
|
||||
# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
|
||||
# to CMAKE_ARGS automatically
|
||||
else ifeq ($(BUILD_TYPE),openblas)
|
||||
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
|
||||
# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
|
||||
else ifeq ($(BUILD_TYPE),clblas)
|
||||
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
|
||||
# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
|
||||
else ifeq ($(BUILD_TYPE),hipblas)
|
||||
ROCM_HOME ?= /opt/rocm
|
||||
ROCM_PATH ?= /opt/rocm
|
||||
export CXX=$(ROCM_HOME)/llvm/bin/clang++
|
||||
export CC=$(ROCM_HOME)/llvm/bin/clang
|
||||
AMDGPU_TARGETS?=gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201
|
||||
CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
|
||||
else ifeq ($(BUILD_TYPE),vulkan)
|
||||
CMAKE_ARGS+=-DGGML_VULKAN=1
|
||||
else ifeq ($(OS),Darwin)
|
||||
ifeq ($(BUILD_TYPE),)
|
||||
BUILD_TYPE=metal
|
||||
endif
|
||||
ifneq ($(BUILD_TYPE),metal)
|
||||
CMAKE_ARGS+=-DGGML_METAL=OFF
|
||||
else
|
||||
CMAKE_ARGS+=-DGGML_METAL=ON
|
||||
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
|
||||
CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
|
||||
CMAKE_ARGS+=-DGGML_OPENMP=OFF
|
||||
endif
|
||||
TARGET+=--target ggml-metal
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_TYPE),sycl_f16)
|
||||
CMAKE_ARGS+=-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DCMAKE_CXX_FLAGS="-fsycl" \
|
||||
-DGGML_SYCL_F16=ON
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_TYPE),sycl_f32)
|
||||
CMAKE_ARGS+=-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DCMAKE_CXX_FLAGS="-fsycl"
|
||||
endif
|
||||
|
||||
INSTALLED_PACKAGES=$(CURDIR)/../grpc/installed_packages
|
||||
INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
|
||||
ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
|
||||
-DProtobuf_DIR=${INSTALLED_LIB_CMAKE}/protobuf \
|
||||
-Dutf8_range_DIR=${INSTALLED_LIB_CMAKE}/utf8_range \
|
||||
-DgRPC_DIR=${INSTALLED_LIB_CMAKE}/grpc \
|
||||
-DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=${INSTALLED_PACKAGES}/include
|
||||
build-ik-llama-cpp-grpc-server:
|
||||
# Conditionally build grpc for the backend to use if needed
|
||||
ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
|
||||
$(MAKE) -C ../../grpc build
|
||||
_PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/proto \
|
||||
_GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin \
|
||||
PATH="${INSTALLED_PACKAGES}/bin:${PATH}" \
|
||||
CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" \
|
||||
IK_LLAMA_VERSION=$(IK_LLAMA_VERSION) \
|
||||
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(VARIANT) grpc-server
|
||||
else
|
||||
echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
|
||||
IK_LLAMA_VERSION=$(IK_LLAMA_VERSION) $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(VARIANT) grpc-server
|
||||
endif
|
||||
|
||||
ik-llama-cpp-avx2: llama.cpp
|
||||
cp -rf $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx2-build
|
||||
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx2-build purge
|
||||
$(info ${GREEN}I ik-llama-cpp build info:avx2${RESET})
|
||||
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="ik-llama-cpp-avx2-build" build-ik-llama-cpp-grpc-server
|
||||
cp -rfv $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx2-build/grpc-server ik-llama-cpp-avx2
|
||||
|
||||
ik-llama-cpp-avx512: llama.cpp
|
||||
cp -rf $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx512-build
|
||||
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx512-build purge
|
||||
$(info ${GREEN}I ik-llama-cpp build info:avx512${RESET})
|
||||
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="ik-llama-cpp-avx512-build" build-ik-llama-cpp-grpc-server
|
||||
cp -rfv $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx512-build/grpc-server ik-llama-cpp-avx512
|
||||
|
||||
ik-llama-cpp-avx: llama.cpp
|
||||
cp -rf $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx-build
|
||||
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx-build purge
|
||||
$(info ${GREEN}I ik-llama-cpp build info:avx${RESET})
|
||||
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="ik-llama-cpp-avx-build" build-ik-llama-cpp-grpc-server
|
||||
cp -rfv $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-avx-build/grpc-server ik-llama-cpp-avx
|
||||
|
||||
ik-llama-cpp-fallback: llama.cpp
|
||||
cp -rf $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-fallback-build
|
||||
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-fallback-build purge
|
||||
$(info ${GREEN}I ik-llama-cpp build info:fallback${RESET})
|
||||
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="ik-llama-cpp-fallback-build" build-ik-llama-cpp-grpc-server
|
||||
cp -rfv $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-fallback-build/grpc-server ik-llama-cpp-fallback
|
||||
|
||||
ik-llama-cpp-grpc: llama.cpp
|
||||
cp -rf $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-grpc-build
|
||||
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-grpc-build purge
|
||||
$(info ${GREEN}I ik-llama-cpp build info:grpc${RESET})
|
||||
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="ik-llama-cpp-grpc-build" build-ik-llama-cpp-grpc-server
|
||||
cp -rfv $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-grpc-build/grpc-server ik-llama-cpp-grpc
|
||||
|
||||
ik-llama-cpp-rpc-server: ik-llama-cpp-grpc
|
||||
cp -rf $(CURRENT_MAKEFILE_DIR)/../ik-llama-cpp-grpc-build/llama.cpp/build/bin/rpc-server ik-llama-cpp-rpc-server
|
||||
|
||||
llama.cpp:
|
||||
mkdir -p llama.cpp
|
||||
cd llama.cpp && \
|
||||
git init && \
|
||||
git remote add origin $(LLAMA_REPO) && \
|
||||
git fetch origin && \
|
||||
git checkout -b build $(IK_LLAMA_VERSION) && \
|
||||
git submodule update --init --recursive --depth 1 --single-branch
|
||||
|
||||
llama.cpp/examples/grpc-server: llama.cpp
|
||||
mkdir -p llama.cpp/examples/grpc-server
|
||||
bash prepare.sh
|
||||
|
||||
rebuild:
|
||||
bash prepare.sh
|
||||
rm -rf grpc-server
|
||||
$(MAKE) grpc-server
|
||||
|
||||
package:
|
||||
bash package.sh
|
||||
|
||||
purge:
|
||||
rm -rf llama.cpp/build
|
||||
rm -rf llama.cpp/examples/grpc-server
|
||||
rm -rf grpc-server
|
||||
|
||||
clean: purge
|
||||
rm -rf llama.cpp
|
||||
|
||||
grpc-server: llama.cpp llama.cpp/examples/grpc-server
|
||||
@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
|
||||
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
|
||||
+bash -c "source $(ONEAPI_VARS); \
|
||||
cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release -j $(JOBS) $(TARGET)"
|
||||
else
|
||||
+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release -j $(JOBS) $(TARGET)
|
||||
endif
|
||||
cp llama.cpp/build/bin/grpc-server .
|
||||
2652
backend/cpp/ik-llama-cpp/grpc-server.cpp
Normal file
2652
backend/cpp/ik-llama-cpp/grpc-server.cpp
Normal file
File diff suppressed because it is too large
Load Diff
58
backend/cpp/ik-llama-cpp/package.sh
Normal file
58
backend/cpp/ik-llama-cpp/package.sh
Normal file
@@ -0,0 +1,58 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Script to copy the appropriate libraries based on architecture
|
||||
# This script is used in the final stage of the Dockerfile
|
||||
|
||||
set -e
|
||||
|
||||
CURDIR=$(dirname "$(realpath $0)")
|
||||
REPO_ROOT="${CURDIR}/../../.."
|
||||
|
||||
# Create lib directory
|
||||
mkdir -p $CURDIR/package/lib
|
||||
|
||||
cp -avrf $CURDIR/ik-llama-cpp-* $CURDIR/package/
|
||||
cp -rfv $CURDIR/run.sh $CURDIR/package/
|
||||
|
||||
# Detect architecture and copy appropriate libraries
|
||||
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
|
||||
# x86_64 architecture
|
||||
echo "Detected x86_64 architecture, copying x86_64 libraries..."
|
||||
cp -arfLv /lib64/ld-linux-x86-64.so.2 $CURDIR/package/lib/ld.so
|
||||
cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
|
||||
cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
|
||||
cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
|
||||
cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
|
||||
cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
|
||||
cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
|
||||
cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
|
||||
cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
|
||||
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
|
||||
# ARM64 architecture
|
||||
echo "Detected ARM64 architecture, copying ARM64 libraries..."
|
||||
cp -arfLv /lib/ld-linux-aarch64.so.1 $CURDIR/package/lib/ld.so
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
|
||||
cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
|
||||
else
|
||||
echo "Error: Could not detect architecture"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Package GPU libraries based on BUILD_TYPE
|
||||
# The GPU library packaging script will detect BUILD_TYPE and copy appropriate GPU libraries
|
||||
GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
|
||||
if [ -f "$GPU_LIB_SCRIPT" ]; then
|
||||
echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
|
||||
source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
|
||||
package_gpu_libs
|
||||
fi
|
||||
|
||||
echo "Packaging completed successfully"
|
||||
ls -liah $CURDIR/package/
|
||||
ls -liah $CURDIR/package/lib/
|
||||
@@ -0,0 +1,10 @@
|
||||
--- a/ggml/src/iqk/iqk_common.h
|
||||
+++ b/ggml/src/iqk/iqk_common.h
|
||||
@@ -9,6 +9,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "iqk_config.h"
|
||||
+#include <cstdint>
|
||||
|
||||
#if defined IQK_IMPLEMENT
|
||||
|
||||
49
backend/cpp/ik-llama-cpp/prepare.sh
Normal file
49
backend/cpp/ik-llama-cpp/prepare.sh
Normal file
@@ -0,0 +1,49 @@
|
||||
#!/bin/bash
|
||||
|
||||
## Patches
|
||||
|
||||
## Apply patches from the `patches` directory
|
||||
if [ -d "patches" ]; then
|
||||
for patch in $(ls patches); do
|
||||
echo "Applying patch $patch"
|
||||
patch -d llama.cpp/ -p1 < patches/$patch
|
||||
done
|
||||
fi
|
||||
|
||||
set -e
|
||||
|
||||
cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
|
||||
cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
|
||||
cp -r utils.hpp llama.cpp/examples/grpc-server/
|
||||
cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/examples/grpc-server/
|
||||
|
||||
## Copy clip/llava files for multimodal support (built as myclip library)
|
||||
cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
|
||||
cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
|
||||
cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
|
||||
# Prepend llama.h include to llava.h
|
||||
echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
|
||||
cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
|
||||
# Copy clip-impl.h if it exists
|
||||
if [ -f llama.cpp/examples/llava/clip-impl.h ]; then
|
||||
cp -rfv llama.cpp/examples/llava/clip-impl.h llama.cpp/examples/grpc-server/clip-impl.h
|
||||
fi
|
||||
# Copy stb_image.h
|
||||
if [ -f llama.cpp/vendor/stb/stb_image.h ]; then
|
||||
cp -rfv llama.cpp/vendor/stb/stb_image.h llama.cpp/examples/grpc-server/stb_image.h
|
||||
elif [ -f llama.cpp/common/stb_image.h ]; then
|
||||
cp -rfv llama.cpp/common/stb_image.h llama.cpp/examples/grpc-server/stb_image.h
|
||||
fi
|
||||
|
||||
## Fix API compatibility in llava.cpp (llama_n_embd -> llama_model_n_embd)
|
||||
if [ -f llama.cpp/examples/grpc-server/llava.cpp ]; then
|
||||
sed -i 's/llama_n_embd(/llama_model_n_embd(/g' llama.cpp/examples/grpc-server/llava.cpp
|
||||
fi
|
||||
|
||||
set +e
|
||||
if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then
|
||||
echo "grpc-server already added"
|
||||
else
|
||||
echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
|
||||
fi
|
||||
set -e
|
||||
40
backend/cpp/ik-llama-cpp/run.sh
Normal file
40
backend/cpp/ik-llama-cpp/run.sh
Normal file
@@ -0,0 +1,40 @@
|
||||
#!/bin/bash
|
||||
set -ex
|
||||
|
||||
# Get the absolute current dir where the script is located
|
||||
CURDIR=$(dirname "$(realpath $0)")
|
||||
|
||||
cd /
|
||||
|
||||
echo "CPU info:"
|
||||
grep -e "model\sname" /proc/cpuinfo | head -1
|
||||
grep -e "flags" /proc/cpuinfo | head -1
|
||||
|
||||
# ik_llama.cpp requires AVX2 — default to avx2 binary
|
||||
BINARY=ik-llama-cpp-avx2
|
||||
|
||||
if [ -e $CURDIR/ik-llama-cpp-fallback ] && ! grep -q -e "\savx2\s" /proc/cpuinfo ; then
|
||||
echo "CPU: AVX2 NOT found, using fallback"
|
||||
BINARY=ik-llama-cpp-fallback
|
||||
fi
|
||||
|
||||
# Extend ld library path with the dir where this script is located/lib
|
||||
if [ "$(uname)" == "Darwin" ]; then
|
||||
export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
|
||||
#export DYLD_FALLBACK_LIBRARY_PATH=$CURDIR/lib:$DYLD_FALLBACK_LIBRARY_PATH
|
||||
else
|
||||
export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
|
||||
fi
|
||||
|
||||
# If there is a lib/ld.so, use it
|
||||
if [ -f $CURDIR/lib/ld.so ]; then
|
||||
echo "Using lib/ld.so"
|
||||
echo "Using binary: $BINARY"
|
||||
exec $CURDIR/lib/ld.so $CURDIR/$BINARY "$@"
|
||||
fi
|
||||
|
||||
echo "Using binary: $BINARY"
|
||||
exec $CURDIR/$BINARY "$@"
|
||||
|
||||
# We should never reach this point, however just in case we do, run fallback
|
||||
exec $CURDIR/ik-llama-cpp-fallback "$@"
|
||||
483
backend/cpp/ik-llama-cpp/utils.hpp
Normal file
483
backend/cpp/ik-llama-cpp/utils.hpp
Normal file
@@ -0,0 +1,483 @@
|
||||
// https://github.com/ggerganov/llama.cpp/blob/master/examples/server/utils.hpp
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <mutex>
|
||||
#include <condition_variable>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "json.hpp"
|
||||
|
||||
#include "clip.h"
|
||||
|
||||
using json = nlohmann::json;
|
||||
|
||||
extern bool server_verbose;
|
||||
|
||||
#ifndef SERVER_VERBOSE
|
||||
#define SERVER_VERBOSE 1
|
||||
#endif
|
||||
|
||||
#if SERVER_VERBOSE != 1
|
||||
#define LOG_VERBOSE(MSG, ...)
|
||||
#else
|
||||
#define LOG_VERBOSE(MSG, ...) \
|
||||
do \
|
||||
{ \
|
||||
if (server_verbose) \
|
||||
{ \
|
||||
server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
|
||||
} \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
|
||||
//
|
||||
// parallel
|
||||
//
|
||||
|
||||
enum server_state {
|
||||
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
|
||||
SERVER_STATE_READY, // Server is ready and model is loaded
|
||||
SERVER_STATE_ERROR // An error occurred, load_model failed
|
||||
};
|
||||
|
||||
enum task_type {
|
||||
TASK_TYPE_COMPLETION,
|
||||
TASK_TYPE_CANCEL,
|
||||
TASK_TYPE_NEXT_RESPONSE
|
||||
};
|
||||
|
||||
struct task_server {
|
||||
int id = -1; // to be filled by llama_server_queue
|
||||
int target_id;
|
||||
task_type type;
|
||||
json data;
|
||||
bool infill_mode = false;
|
||||
bool embedding_mode = false;
|
||||
int multitask_id = -1;
|
||||
};
|
||||
|
||||
struct task_result {
|
||||
int id;
|
||||
int multitask_id = -1;
|
||||
bool stop;
|
||||
bool error;
|
||||
json result_json;
|
||||
};
|
||||
|
||||
struct task_multi {
|
||||
int id;
|
||||
std::set<int> subtasks_remaining{};
|
||||
std::vector<task_result> results{};
|
||||
};
|
||||
|
||||
// TODO: can become bool if we can't find use of more states
|
||||
enum slot_state
|
||||
{
|
||||
IDLE,
|
||||
PROCESSING,
|
||||
};
|
||||
|
||||
enum slot_command
|
||||
{
|
||||
NONE,
|
||||
LOAD_PROMPT,
|
||||
RELEASE,
|
||||
};
|
||||
|
||||
struct slot_params
|
||||
{
|
||||
bool stream = true;
|
||||
bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
|
||||
|
||||
uint32_t seed = -1; // RNG seed
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
|
||||
std::vector<std::string> antiprompt;
|
||||
|
||||
json input_prefix;
|
||||
json input_suffix;
|
||||
};
|
||||
|
||||
struct slot_image
|
||||
{
|
||||
int32_t id;
|
||||
|
||||
bool request_encode_image = false;
|
||||
float * image_embedding = nullptr;
|
||||
int32_t image_tokens = 0;
|
||||
|
||||
clip_image_u8 * img_data;
|
||||
|
||||
std::string prefix_prompt; // before of this image
|
||||
};
|
||||
|
||||
// completion token output with probabilities
|
||||
struct completion_token_output
|
||||
{
|
||||
struct token_prob
|
||||
{
|
||||
llama_token tok;
|
||||
float prob;
|
||||
};
|
||||
|
||||
std::vector<token_prob> probs;
|
||||
llama_token tok;
|
||||
std::string text_to_send;
|
||||
};
|
||||
|
||||
static inline void server_log(const char *level, const char *function, int line,
|
||||
const char *message, const nlohmann::ordered_json &extra)
|
||||
{
|
||||
nlohmann::ordered_json log
|
||||
{
|
||||
{"timestamp", time(nullptr)},
|
||||
{"level", level},
|
||||
{"function", function},
|
||||
{"line", line},
|
||||
{"message", message},
|
||||
};
|
||||
|
||||
if (!extra.empty())
|
||||
{
|
||||
log.merge_patch(extra);
|
||||
}
|
||||
|
||||
const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
|
||||
printf("%.*s\n", (int)str.size(), str.data());
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
//
|
||||
// server utils
|
||||
//
|
||||
|
||||
template <typename T>
|
||||
static T json_value(const json &body, const std::string &key, const T &default_value)
|
||||
{
|
||||
// Fallback null to default value
|
||||
return body.contains(key) && !body.at(key).is_null()
|
||||
? body.value(key, default_value)
|
||||
: default_value;
|
||||
}
|
||||
|
||||
inline std::string format_chatml(std::vector<json> messages)
|
||||
{
|
||||
std::ostringstream chatml_msgs;
|
||||
|
||||
for (auto it = messages.begin(); it != messages.end(); ++it) {
|
||||
chatml_msgs << "<|im_start|>"
|
||||
<< json_value(*it, "role", std::string("user")) << '\n';
|
||||
chatml_msgs << json_value(*it, "content", std::string(""))
|
||||
<< "<|im_end|>\n";
|
||||
}
|
||||
|
||||
chatml_msgs << "<|im_start|>assistant" << '\n';
|
||||
|
||||
return chatml_msgs.str();
|
||||
}
|
||||
|
||||
//
|
||||
// work queue utils
|
||||
//
|
||||
|
||||
struct llama_server_queue {
|
||||
int id = 0;
|
||||
std::mutex mutex_tasks;
|
||||
// queues
|
||||
std::vector<task_server> queue_tasks;
|
||||
std::vector<task_server> queue_tasks_deferred;
|
||||
std::vector<task_multi> queue_multitasks;
|
||||
std::condition_variable condition_tasks;
|
||||
// callback functions
|
||||
std::function<void(task_server&)> callback_new_task;
|
||||
std::function<void(task_multi&)> callback_finish_multitask;
|
||||
std::function<void(void)> callback_all_task_finished;
|
||||
|
||||
// Add a new task to the end of the queue
|
||||
int post(task_server task) {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
if (task.id == -1) {
|
||||
task.id = id++;
|
||||
}
|
||||
queue_tasks.push_back(std::move(task));
|
||||
condition_tasks.notify_one();
|
||||
return task.id;
|
||||
}
|
||||
|
||||
// Add a new task, but defer until one slot is available
|
||||
void defer(task_server task) {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
queue_tasks_deferred.push_back(std::move(task));
|
||||
}
|
||||
|
||||
// Get the next id for creating anew task
|
||||
int get_new_id() {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
return id++;
|
||||
}
|
||||
|
||||
// Register function to process a new task
|
||||
void on_new_task(std::function<void(task_server&)> callback) {
|
||||
callback_new_task = callback;
|
||||
}
|
||||
|
||||
// Register function to process a multitask
|
||||
void on_finish_multitask(std::function<void(task_multi&)> callback) {
|
||||
callback_finish_multitask = callback;
|
||||
}
|
||||
|
||||
// Register the function to be called when the batch of tasks is finished
|
||||
void on_all_tasks_finished(std::function<void(void)> callback) {
|
||||
callback_all_task_finished = callback;
|
||||
}
|
||||
|
||||
// Call when the state of one slot is changed
|
||||
void notify_slot_changed() {
|
||||
// move deferred tasks back to main loop
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
for (auto & task : queue_tasks_deferred) {
|
||||
queue_tasks.push_back(std::move(task));
|
||||
}
|
||||
queue_tasks_deferred.clear();
|
||||
}
|
||||
|
||||
// Start the main loop. This call is blocking
|
||||
[[noreturn]]
|
||||
void start_loop() {
|
||||
while (true) {
|
||||
// new task arrived
|
||||
LOG_VERBOSE("have new task", {});
|
||||
{
|
||||
while (true)
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
if (queue_tasks.empty()) {
|
||||
lock.unlock();
|
||||
break;
|
||||
}
|
||||
task_server task = queue_tasks.front();
|
||||
queue_tasks.erase(queue_tasks.begin());
|
||||
lock.unlock();
|
||||
LOG_VERBOSE("callback_new_task", {});
|
||||
callback_new_task(task);
|
||||
}
|
||||
LOG_VERBOSE("callback_all_task_finished", {});
|
||||
// process and update all the multitasks
|
||||
auto queue_iterator = queue_multitasks.begin();
|
||||
while (queue_iterator != queue_multitasks.end())
|
||||
{
|
||||
if (queue_iterator->subtasks_remaining.empty())
|
||||
{
|
||||
// all subtasks done == multitask is done
|
||||
task_multi current_multitask = *queue_iterator;
|
||||
callback_finish_multitask(current_multitask);
|
||||
// remove this multitask
|
||||
queue_iterator = queue_multitasks.erase(queue_iterator);
|
||||
}
|
||||
else
|
||||
{
|
||||
++queue_iterator;
|
||||
}
|
||||
}
|
||||
// all tasks in the current loop is finished
|
||||
callback_all_task_finished();
|
||||
}
|
||||
LOG_VERBOSE("wait for new task", {});
|
||||
// wait for new task
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
if (queue_tasks.empty()) {
|
||||
condition_tasks.wait(lock, [&]{
|
||||
return !queue_tasks.empty();
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// functions to manage multitasks
|
||||
//
|
||||
|
||||
// add a multitask by specifying the id of all subtask (subtask is a task_server)
|
||||
void add_multitask(int multitask_id, std::vector<int>& sub_ids)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||
task_multi multi;
|
||||
multi.id = multitask_id;
|
||||
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
|
||||
queue_multitasks.push_back(multi);
|
||||
}
|
||||
|
||||
// updatethe remaining subtasks, while appending results to multitask
|
||||
void update_multitask(int multitask_id, int subtask_id, task_result& result)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||
for (auto& multitask : queue_multitasks)
|
||||
{
|
||||
if (multitask.id == multitask_id)
|
||||
{
|
||||
multitask.subtasks_remaining.erase(subtask_id);
|
||||
multitask.results.push_back(result);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_server_response {
|
||||
typedef std::function<void(int, int, task_result&)> callback_multitask_t;
|
||||
callback_multitask_t callback_update_multitask;
|
||||
// for keeping track of all tasks waiting for the result
|
||||
std::set<int> waiting_task_ids;
|
||||
// the main result queue
|
||||
std::vector<task_result> queue_results;
|
||||
std::mutex mutex_results;
|
||||
std::condition_variable condition_results;
|
||||
|
||||
void add_waiting_task_id(int task_id) {
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
waiting_task_ids.insert(task_id);
|
||||
}
|
||||
|
||||
void remove_waiting_task_id(int task_id) {
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
waiting_task_ids.erase(task_id);
|
||||
}
|
||||
|
||||
// This function blocks the thread until there is a response for this task_id
|
||||
task_result recv(int task_id) {
|
||||
while (true)
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
condition_results.wait(lock, [&]{
|
||||
return !queue_results.empty();
|
||||
});
|
||||
LOG_VERBOSE("condition_results unblock", {});
|
||||
|
||||
for (int i = 0; i < (int) queue_results.size(); i++)
|
||||
{
|
||||
if (queue_results[i].id == task_id)
|
||||
{
|
||||
assert(queue_results[i].multitask_id == -1);
|
||||
task_result res = queue_results[i];
|
||||
queue_results.erase(queue_results.begin() + i);
|
||||
return res;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// should never reach here
|
||||
}
|
||||
|
||||
// Register the function to update multitask
|
||||
void on_multitask_update(callback_multitask_t callback) {
|
||||
callback_update_multitask = callback;
|
||||
}
|
||||
|
||||
// Send a new result to a waiting task_id
|
||||
void send(task_result result) {
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
LOG_VERBOSE("send new result", {});
|
||||
for (auto& task_id : waiting_task_ids) {
|
||||
// LOG_TEE("waiting task id %i \n", task_id);
|
||||
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
|
||||
if (result.multitask_id == task_id)
|
||||
{
|
||||
LOG_VERBOSE("callback_update_multitask", {});
|
||||
callback_update_multitask(task_id, result.id, result);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (result.id == task_id)
|
||||
{
|
||||
LOG_VERBOSE("queue_results.push_back", {});
|
||||
queue_results.push_back(result);
|
||||
condition_results.notify_one();
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// base64 utils (TODO: move to common in the future)
|
||||
//
|
||||
|
||||
static const std::string base64_chars =
|
||||
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
"abcdefghijklmnopqrstuvwxyz"
|
||||
"0123456789+/";
|
||||
|
||||
static inline bool is_base64(uint8_t c)
|
||||
{
|
||||
return (isalnum(c) || (c == '+') || (c == '/'));
|
||||
}
|
||||
|
||||
static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
|
||||
{
|
||||
int i = 0;
|
||||
int j = 0;
|
||||
int in_ = 0;
|
||||
|
||||
int in_len = encoded_string.size();
|
||||
|
||||
uint8_t char_array_4[4];
|
||||
uint8_t char_array_3[3];
|
||||
|
||||
std::vector<uint8_t> ret;
|
||||
|
||||
while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
|
||||
{
|
||||
char_array_4[i++] = encoded_string[in_]; in_++;
|
||||
if (i == 4)
|
||||
{
|
||||
for (i = 0; i <4; i++)
|
||||
{
|
||||
char_array_4[i] = base64_chars.find(char_array_4[i]);
|
||||
}
|
||||
|
||||
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
|
||||
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
|
||||
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
|
||||
|
||||
for (i = 0; (i < 3); i++)
|
||||
{
|
||||
ret.push_back(char_array_3[i]);
|
||||
}
|
||||
i = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (i)
|
||||
{
|
||||
for (j = i; j <4; j++)
|
||||
{
|
||||
char_array_4[j] = 0;
|
||||
}
|
||||
|
||||
for (j = 0; j <4; j++)
|
||||
{
|
||||
char_array_4[j] = base64_chars.find(char_array_4[j]);
|
||||
}
|
||||
|
||||
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
|
||||
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
|
||||
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
|
||||
|
||||
for (j = 0; (j < i - 1); j++)
|
||||
{
|
||||
ret.push_back(char_array_3[j]);
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
@@ -29,6 +29,20 @@
|
||||
nvidia-cuda-12: "cuda12-llama-cpp"
|
||||
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-llama-cpp"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp"
|
||||
- &ikllamacpp
|
||||
name: "ik-llama-cpp"
|
||||
alias: "ik-llama-cpp"
|
||||
license: mit
|
||||
description: |
|
||||
Fork of llama.cpp optimized for CPU performance by ikawrakow
|
||||
urls:
|
||||
- https://github.com/ikawrakow/ik_llama.cpp
|
||||
tags:
|
||||
- text-to-text
|
||||
- LLM
|
||||
- CPU
|
||||
capabilities:
|
||||
default: "cpu-ik-llama-cpp"
|
||||
- &whispercpp
|
||||
name: "whisper"
|
||||
alias: "whisper"
|
||||
@@ -897,6 +911,10 @@
|
||||
nvidia-cuda-12: "cuda12-llama-cpp-development"
|
||||
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-llama-cpp-development"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp-development"
|
||||
- !!merge <<: *ikllamacpp
|
||||
name: "ik-llama-cpp-development"
|
||||
capabilities:
|
||||
default: "cpu-ik-llama-cpp-development"
|
||||
- !!merge <<: *neutts
|
||||
name: "cpu-neutts"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-neutts"
|
||||
@@ -1327,6 +1345,17 @@
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-llama-cpp"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-nvidia-cuda-13-llama-cpp
|
||||
## ik-llama-cpp
|
||||
- !!merge <<: *ikllamacpp
|
||||
name: "cpu-ik-llama-cpp"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-ik-llama-cpp"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-cpu-ik-llama-cpp
|
||||
- !!merge <<: *ikllamacpp
|
||||
name: "cpu-ik-llama-cpp-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-ik-llama-cpp"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-cpu-ik-llama-cpp
|
||||
## whisper
|
||||
- !!merge <<: *whispercpp
|
||||
name: "nvidia-l4t-arm64-whisper"
|
||||
|
||||
@@ -539,6 +539,47 @@ options:
|
||||
- [llama](https://github.com/ggerganov/llama.cpp)
|
||||
|
||||
|
||||
### ik_llama.cpp
|
||||
|
||||
[ik_llama.cpp](https://github.com/ikawrakow/ik_llama.cpp) is a hard fork of `llama.cpp` by Iwan Kawrakow that focuses on superior CPU and hybrid GPU/CPU performance. It ships additional quantization types (IQK quants), custom quantization mixes, Multi-head Latent Attention (MLA) for DeepSeek models, and fine-grained tensor offload controls — particularly useful for running very large models on commodity CPU hardware.
|
||||
|
||||
{{% notice note %}}
|
||||
|
||||
The `ik-llama-cpp` backend requires a CPU with **AVX2** support. The IQK kernels are not compatible with older CPUs.
|
||||
|
||||
{{% /notice %}}
|
||||
|
||||
#### Features
|
||||
|
||||
The `ik-llama-cpp` backend supports the following features:
|
||||
- [📖 Text generation (GPT)]({{%relref "features/text-generation" %}})
|
||||
- [🧠 Embeddings]({{%relref "features/embeddings" %}})
|
||||
- IQK quantization types for better CPU inference performance
|
||||
- Multimodal models (via clip/llava)
|
||||
|
||||
#### Setup
|
||||
|
||||
The backend is distributed as a separate container image and can be installed from the LocalAI backend gallery, or specified directly in a model configuration. GGUF models loaded with this backend benefit from ik_llama.cpp's optimized CPU kernels — especially useful for MoE models and large quantized models that would otherwise be GPU-bound.
|
||||
|
||||
#### YAML configuration
|
||||
|
||||
To use the `ik-llama-cpp` backend, specify it as the backend in the YAML file:
|
||||
|
||||
```yaml
|
||||
name: my-model
|
||||
backend: ik-llama-cpp
|
||||
parameters:
|
||||
# Relative to the models path
|
||||
model: file.gguf
|
||||
```
|
||||
|
||||
The aliases `ik-llama` and `ik_llama` are also accepted.
|
||||
|
||||
#### Reference
|
||||
|
||||
- [ik_llama.cpp](https://github.com/ikawrakow/ik_llama.cpp)
|
||||
|
||||
|
||||
### vLLM
|
||||
|
||||
[vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference.
|
||||
|
||||
@@ -19,6 +19,7 @@ LocalAI will attempt to automatically load models which are not explicitly confi
|
||||
| Backend | Description | Capability | Embeddings | Streaming | Acceleration |
|
||||
|---------|-------------|------------|------------|-----------|-------------|
|
||||
| [llama.cpp](https://github.com/ggerganov/llama.cpp) | LLM inference in C/C++. Supports LLaMA, Mamba, RWKV, Falcon, Starcoder, GPT-2, [and many others](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description) | GPT, Functions | yes | yes | CPU, CUDA 12/13, ROCm, Intel SYCL, Vulkan, Metal, Jetson L4T |
|
||||
| [ik_llama.cpp](https://github.com/ikawrakow/ik_llama.cpp) | Hard fork of llama.cpp optimized for CPU/hybrid CPU+GPU with IQK quants, custom quant mixes, and MLA for DeepSeek | GPT | yes | yes | CPU (AVX2+) |
|
||||
| [vLLM](https://github.com/vllm-project/vllm) | Fast LLM serving with PagedAttention | GPT | no | no | CUDA 12, ROCm, Intel |
|
||||
| [vLLM Omni](https://github.com/vllm-project/vllm) | Unified multimodal generation (text, image, video, audio) | Multimodal GPT | no | no | CUDA 12, ROCm |
|
||||
| [transformers](https://github.com/huggingface/transformers) | HuggingFace Transformers framework | GPT, Embeddings, Multimodal | yes | yes* | CPU, CUDA 12/13, ROCm, Intel, Metal |
|
||||
|
||||
@@ -14,12 +14,15 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
LLamaCPP = "llama-cpp"
|
||||
LLamaCPP = "llama-cpp"
|
||||
IKLLamaCPP = "ik-llama-cpp"
|
||||
)
|
||||
|
||||
var Aliases = map[string]string{
|
||||
"go-llama": LLamaCPP,
|
||||
"llama": LLamaCPP,
|
||||
"ik_llama": IKLLamaCPP,
|
||||
"ik-llama": IKLLamaCPP,
|
||||
"embedded-store": LocalStoreBackend,
|
||||
"huggingface-embeddings": TransformersBackend,
|
||||
"transformers-musicgen": TransformersBackend,
|
||||
|
||||
@@ -24,6 +24,9 @@ function inferBackendPath(item) {
|
||||
if (item.dockerfile.endsWith("rust")) {
|
||||
return `backend/rust/${item.backend}/`;
|
||||
}
|
||||
if (item.dockerfile.endsWith("ik-llama-cpp")) {
|
||||
return `backend/cpp/ik-llama-cpp/`;
|
||||
}
|
||||
if (item.dockerfile.endsWith("llama-cpp")) {
|
||||
return `backend/cpp/llama-cpp/`;
|
||||
}
|
||||
|
||||
342
tests/e2e-backends/backend_test.go
Normal file
342
tests/e2e-backends/backend_test.go
Normal file
@@ -0,0 +1,342 @@
|
||||
package e2ebackends_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
"github.com/phayes/freeport"
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/credentials/insecure"
|
||||
)
|
||||
|
||||
// Environment variables consumed by the suite.
|
||||
//
|
||||
// Required (one of):
|
||||
//
|
||||
// BACKEND_IMAGE Docker image tag to test (e.g. local-ai-backend:llama-cpp).
|
||||
//
|
||||
// Required model source (one of):
|
||||
//
|
||||
// BACKEND_TEST_MODEL_URL HTTP(S) URL of a model file to download before the test.
|
||||
// BACKEND_TEST_MODEL_FILE Path to an already-available model file (skips download).
|
||||
//
|
||||
// Optional:
|
||||
//
|
||||
// BACKEND_TEST_CAPS Comma-separated list of capabilities to exercise.
|
||||
// Supported values: health, load, predict, stream, embeddings.
|
||||
// Defaults to "health,load,predict,stream".
|
||||
// A backend that only does embeddings would set this to
|
||||
// "health,load,embeddings"; an image/TTS backend that cannot
|
||||
// be driven by a text prompt can set it to "health,load".
|
||||
// BACKEND_TEST_PROMPT Override the prompt used by predict/stream specs.
|
||||
// BACKEND_TEST_CTX_SIZE Override the context size passed to LoadModel (default 512).
|
||||
// BACKEND_TEST_THREADS Override Threads passed to LoadModel (default 4).
|
||||
//
|
||||
// The suite is intentionally model-format-agnostic: it only ever passes the
|
||||
// file path to LoadModel, so GGUF, ONNX, safetensors, .bin etc. all work so
|
||||
// long as the backend under test accepts that format.
|
||||
const (
|
||||
capHealth = "health"
|
||||
capLoad = "load"
|
||||
capPredict = "predict"
|
||||
capStream = "stream"
|
||||
capEmbeddings = "embeddings"
|
||||
|
||||
defaultPrompt = "The capital of France is"
|
||||
streamPrompt = "Once upon a time"
|
||||
)
|
||||
|
||||
func defaultCaps() map[string]bool {
|
||||
return map[string]bool{
|
||||
capHealth: true,
|
||||
capLoad: true,
|
||||
capPredict: true,
|
||||
capStream: true,
|
||||
}
|
||||
}
|
||||
|
||||
// parseCaps reads BACKEND_TEST_CAPS and returns the enabled capability set.
|
||||
// An empty/unset value falls back to defaultCaps().
|
||||
func parseCaps() map[string]bool {
|
||||
raw := strings.TrimSpace(os.Getenv("BACKEND_TEST_CAPS"))
|
||||
if raw == "" {
|
||||
return defaultCaps()
|
||||
}
|
||||
caps := map[string]bool{}
|
||||
for _, part := range strings.Split(raw, ",") {
|
||||
part = strings.TrimSpace(strings.ToLower(part))
|
||||
if part != "" {
|
||||
caps[part] = true
|
||||
}
|
||||
}
|
||||
return caps
|
||||
}
|
||||
|
||||
var _ = Describe("Backend container", Ordered, func() {
|
||||
var (
|
||||
caps map[string]bool
|
||||
workDir string
|
||||
binaryDir string
|
||||
modelFile string
|
||||
addr string
|
||||
serverCmd *exec.Cmd
|
||||
conn *grpc.ClientConn
|
||||
client pb.BackendClient
|
||||
prompt string
|
||||
)
|
||||
|
||||
BeforeAll(func() {
|
||||
image := os.Getenv("BACKEND_IMAGE")
|
||||
Expect(image).NotTo(BeEmpty(), "BACKEND_IMAGE env var must be set (e.g. local-ai-backend:llama-cpp)")
|
||||
|
||||
modelURL := os.Getenv("BACKEND_TEST_MODEL_URL")
|
||||
modelFile = os.Getenv("BACKEND_TEST_MODEL_FILE")
|
||||
Expect(modelURL != "" || modelFile != "").To(BeTrue(),
|
||||
"one of BACKEND_TEST_MODEL_URL or BACKEND_TEST_MODEL_FILE must be set")
|
||||
|
||||
caps = parseCaps()
|
||||
GinkgoWriter.Printf("Testing image=%q with capabilities=%v\n", image, keys(caps))
|
||||
|
||||
prompt = os.Getenv("BACKEND_TEST_PROMPT")
|
||||
if prompt == "" {
|
||||
prompt = defaultPrompt
|
||||
}
|
||||
|
||||
var err error
|
||||
workDir, err = os.MkdirTemp("", "backend-e2e-*")
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
|
||||
// Extract the image filesystem so we can run run.sh directly.
|
||||
binaryDir = filepath.Join(workDir, "rootfs")
|
||||
Expect(os.MkdirAll(binaryDir, 0o755)).To(Succeed())
|
||||
extractImage(image, binaryDir)
|
||||
Expect(filepath.Join(binaryDir, "run.sh")).To(BeAnExistingFile())
|
||||
|
||||
// Download the model once if not provided.
|
||||
if modelFile == "" {
|
||||
modelFile = filepath.Join(workDir, "model.bin")
|
||||
downloadFile(modelURL, modelFile)
|
||||
}
|
||||
|
||||
// Pick a free port and launch the backend.
|
||||
port, err := freeport.GetFreePort()
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
addr = fmt.Sprintf("127.0.0.1:%d", port)
|
||||
|
||||
Expect(os.Chmod(filepath.Join(binaryDir, "run.sh"), 0o755)).To(Succeed())
|
||||
// Mark any other top-level files executable (extraction may strip perms).
|
||||
entries, _ := os.ReadDir(binaryDir)
|
||||
for _, e := range entries {
|
||||
if !e.IsDir() && !strings.HasSuffix(e.Name(), ".sh") {
|
||||
_ = os.Chmod(filepath.Join(binaryDir, e.Name()), 0o755)
|
||||
}
|
||||
}
|
||||
|
||||
serverCmd = exec.Command(filepath.Join(binaryDir, "run.sh"), "--addr="+addr)
|
||||
serverCmd.Stdout = GinkgoWriter
|
||||
serverCmd.Stderr = GinkgoWriter
|
||||
Expect(serverCmd.Start()).To(Succeed())
|
||||
|
||||
// Wait for the gRPC port to accept connections.
|
||||
Eventually(func() error {
|
||||
c, err := net.DialTimeout("tcp", addr, 500*time.Millisecond)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_ = c.Close()
|
||||
return nil
|
||||
}, 30*time.Second, 200*time.Millisecond).Should(Succeed(), "backend did not start")
|
||||
|
||||
conn, err = grpc.Dial(addr,
|
||||
grpc.WithTransportCredentials(insecure.NewCredentials()),
|
||||
grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(50*1024*1024)),
|
||||
)
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
client = pb.NewBackendClient(conn)
|
||||
})
|
||||
|
||||
AfterAll(func() {
|
||||
if conn != nil {
|
||||
_ = conn.Close()
|
||||
}
|
||||
if serverCmd != nil && serverCmd.Process != nil {
|
||||
_ = serverCmd.Process.Kill()
|
||||
_, _ = serverCmd.Process.Wait()
|
||||
}
|
||||
if workDir != "" {
|
||||
_ = os.RemoveAll(workDir)
|
||||
}
|
||||
})
|
||||
|
||||
It("responds to Health", func() {
|
||||
if !caps[capHealth] {
|
||||
Skip("health capability not enabled")
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
res, err := client.Health(ctx, &pb.HealthMessage{})
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
Expect(res.GetMessage()).NotTo(BeEmpty())
|
||||
})
|
||||
|
||||
It("loads the model", func() {
|
||||
if !caps[capLoad] {
|
||||
Skip("load capability not enabled")
|
||||
}
|
||||
ctxSize := envInt32("BACKEND_TEST_CTX_SIZE", 512)
|
||||
threads := envInt32("BACKEND_TEST_THREADS", 4)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
||||
defer cancel()
|
||||
res, err := client.LoadModel(ctx, &pb.ModelOptions{
|
||||
Model: modelFile,
|
||||
ModelFile: modelFile,
|
||||
ContextSize: ctxSize,
|
||||
Threads: threads,
|
||||
NGPULayers: 0,
|
||||
MMap: true,
|
||||
NBatch: 128,
|
||||
})
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
Expect(res.GetSuccess()).To(BeTrue(), "LoadModel failed: %s", res.GetMessage())
|
||||
})
|
||||
|
||||
It("generates output via Predict", func() {
|
||||
if !caps[capPredict] {
|
||||
Skip("predict capability not enabled")
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second)
|
||||
defer cancel()
|
||||
res, err := client.Predict(ctx, &pb.PredictOptions{
|
||||
Prompt: prompt,
|
||||
Tokens: 20,
|
||||
Temperature: 0.1,
|
||||
TopK: 40,
|
||||
TopP: 0.9,
|
||||
})
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
Expect(res.GetMessage()).NotTo(BeEmpty(), "Predict produced empty output")
|
||||
GinkgoWriter.Printf("Predict: %q (tokens=%d, prompt_tokens=%d)\n",
|
||||
res.GetMessage(), res.GetTokens(), res.GetPromptTokens())
|
||||
})
|
||||
|
||||
It("streams output via PredictStream", func() {
|
||||
if !caps[capStream] {
|
||||
Skip("stream capability not enabled")
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second)
|
||||
defer cancel()
|
||||
stream, err := client.PredictStream(ctx, &pb.PredictOptions{
|
||||
Prompt: streamPrompt,
|
||||
Tokens: 20,
|
||||
Temperature: 0.1,
|
||||
TopK: 40,
|
||||
TopP: 0.9,
|
||||
})
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
|
||||
var chunks int
|
||||
var combined string
|
||||
for {
|
||||
msg, err := stream.Recv()
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
if len(msg.GetMessage()) > 0 {
|
||||
chunks++
|
||||
combined += string(msg.GetMessage())
|
||||
}
|
||||
}
|
||||
Expect(chunks).To(BeNumerically(">", 0), "no stream chunks received")
|
||||
GinkgoWriter.Printf("Stream: %d chunks, combined=%q\n", chunks, combined)
|
||||
})
|
||||
|
||||
It("computes embeddings via Embedding", func() {
|
||||
if !caps[capEmbeddings] {
|
||||
Skip("embeddings capability not enabled")
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||||
defer cancel()
|
||||
res, err := client.Embedding(ctx, &pb.PredictOptions{
|
||||
Embeddings: prompt,
|
||||
})
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
Expect(res.GetEmbeddings()).NotTo(BeEmpty(), "Embedding returned empty vector")
|
||||
GinkgoWriter.Printf("Embedding: %d dims\n", len(res.GetEmbeddings()))
|
||||
})
|
||||
})
|
||||
|
||||
// extractImage runs `docker create` + `docker export` to materialise the image
|
||||
// rootfs into dest. Using export (not save) avoids dealing with layer tarballs.
|
||||
func extractImage(image, dest string) {
|
||||
GinkgoHelper()
|
||||
// The backend images have no default ENTRYPOINT/CMD, so docker create fails
|
||||
// unless we override one; run.sh is harmless and guaranteed to exist.
|
||||
create := exec.Command("docker", "create", "--entrypoint=/run.sh", image)
|
||||
out, err := create.CombinedOutput()
|
||||
Expect(err).NotTo(HaveOccurred(), "docker create failed: %s", string(out))
|
||||
cid := strings.TrimSpace(string(out))
|
||||
DeferCleanup(func() {
|
||||
_ = exec.Command("docker", "rm", "-f", cid).Run()
|
||||
})
|
||||
|
||||
// Pipe `docker export <cid>` into `tar -xf - -C dest`.
|
||||
exp := exec.Command("docker", "export", cid)
|
||||
expOut, err := exp.StdoutPipe()
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
exp.Stderr = GinkgoWriter
|
||||
Expect(exp.Start()).To(Succeed())
|
||||
|
||||
tar := exec.Command("tar", "-xf", "-", "-C", dest)
|
||||
tar.Stdin = expOut
|
||||
tar.Stderr = GinkgoWriter
|
||||
Expect(tar.Run()).To(Succeed())
|
||||
Expect(exp.Wait()).To(Succeed())
|
||||
}
|
||||
|
||||
// downloadFile fetches url into dest using curl -L. Used for CI convenience;
|
||||
// local runs can use BACKEND_TEST_MODEL_FILE to skip downloading.
|
||||
func downloadFile(url, dest string) {
|
||||
GinkgoHelper()
|
||||
cmd := exec.Command("curl", "-sSfL", "-o", dest, url)
|
||||
cmd.Stdout = GinkgoWriter
|
||||
cmd.Stderr = GinkgoWriter
|
||||
Expect(cmd.Run()).To(Succeed(), "failed to download %s", url)
|
||||
fi, err := os.Stat(dest)
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
Expect(fi.Size()).To(BeNumerically(">", 1024), "downloaded file is suspiciously small")
|
||||
}
|
||||
|
||||
func envInt32(name string, def int32) int32 {
|
||||
raw := os.Getenv(name)
|
||||
if raw == "" {
|
||||
return def
|
||||
}
|
||||
var v int32
|
||||
_, err := fmt.Sscanf(raw, "%d", &v)
|
||||
if err != nil {
|
||||
return def
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
func keys(m map[string]bool) []string {
|
||||
out := make([]string, 0, len(m))
|
||||
for k, v := range m {
|
||||
if v {
|
||||
out = append(out, k)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
24
tests/e2e-backends/suite_test.go
Normal file
24
tests/e2e-backends/suite_test.go
Normal file
@@ -0,0 +1,24 @@
|
||||
// Package e2ebackends exercises a built backend container image end-to-end over
|
||||
// its gRPC surface.
|
||||
//
|
||||
// The suite is intentionally backend-agnostic: it extracts a Docker image,
|
||||
// launches the bundled run.sh entrypoint, then drives a configurable set of
|
||||
// gRPC calls against the result. Specs are gated by capability flags so that a
|
||||
// non-LLM backend (e.g. image generation, TTS, embeddings-only) can opt in to
|
||||
// only the RPCs it implements.
|
||||
//
|
||||
// Configuration is entirely through environment variables — see backend_test.go
|
||||
// for the full list.
|
||||
package e2ebackends_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
func TestBackendE2E(t *testing.T) {
|
||||
RegisterFailHandler(Fail)
|
||||
RunSpecs(t, "Backend gRPC End-to-End Suite")
|
||||
}
|
||||
Reference in New Issue
Block a user