Compare commits

..

7 Commits

Author SHA1 Message Date
Ettore Di Giacinto
6e11f882f7 feat(turboquant.cpp): add new backend
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-04-03 20:57:15 +00:00
Ettore Di Giacinto
8577bdcebc Update asset links in README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2026-04-03 10:24:08 +02:00
Ettore Di Giacinto
0d489c7a0d Add guided tour and update screenshots section
Updated README to include a guided tour section with links to various assets and details about agents and usage metrics.

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2026-04-03 10:23:03 +02:00
Ettore Di Giacinto
11dc54bda9 fix(docs): commit distribution.md
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-04-03 10:14:13 +02:00
Ettore Di Giacinto
7e0b73deaa fix(docs): fix broken references to distributed mode
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-04-03 09:46:06 +02:00
LocalAI [bot]
c0a023d13d chore: ⬆️ Update ggml-org/llama.cpp to a1cfb645307edc61a89e41557f290f441043d3c2 (#9203)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-04-03 08:30:15 +02:00
Loryan Strant
0d3ae1c295 docs: Update Home Assistant integrations list (#9206)
Update Home Assistant integrations list

Signed-off-by: Loryan Strant <51473494+loryanstrant@users.noreply.github.com>
2026-04-03 08:30:00 +02:00
37 changed files with 373 additions and 92 deletions

View File

@@ -1828,6 +1828,98 @@ jobs:
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
# llama-cpp-tq (TurboQuant fork)
# Every entry in this group sets backend "llama-cpp-tq" and reuses
# ./backend/Dockerfile.llama-cpp with the repository root as build context.
# CPU build (empty build-type), amd64 + arm64
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64,linux/arm64'
tag-latest: 'auto'
tag-suffix: '-cpu-llama-cpp-tq'
runs-on: 'bigger-runner'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
# NVIDIA CUDA 12.8, amd64
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-llama-cpp-tq'
runs-on: 'bigger-runner'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
# NVIDIA CUDA 13.0, amd64
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-13-llama-cpp-tq'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
# NVIDIA L4T, CUDA 13.0, arm64
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'false'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-cuda-13-arm64-llama-cpp-tq'
base-image: "ubuntu:24.04"
runs-on: 'ubuntu-24.04-arm'
ubuntu-version: '2404'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
# AMD ROCm (hipblas) on the rocm/dev-ubuntu-24.04:6.4.4 base image, amd64
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-rocm-hipblas-llama-cpp-tq'
runs-on: 'ubuntu-latest'
base-image: "rocm/dev-ubuntu-24.04:6.4.4"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
# NVIDIA L4T, CUDA 12.0, arm64, on the l4t-jetpack r36.4.0 base image.
# This is the only entry in the group with ubuntu-version '2204'.
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'false'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-arm64-llama-cpp-tq'
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
runs-on: 'ubuntu-24.04-arm'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2204'
# Vulkan, amd64 + arm64
- build-type: 'vulkan'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64,linux/arm64'
tag-latest: 'auto'
tag-suffix: '-gpu-vulkan-llama-cpp-tq'
runs-on: 'bigger-runner'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
# Stablediffusion-ggml
- build-type: ''
cuda-major-version: ""

View File

@@ -14,6 +14,11 @@ jobs:
variable: "LLAMA_VERSION"
branch: "master"
file: "backend/cpp/llama-cpp/Makefile"
# TurboQuant fork of llama.cpp, pinned in its own Makefile.
# It uses the same variable name (LLAMA_VERSION) as the ggml-org/llama.cpp
# entry above, and the update PR branch is derived from the variable name.
# branch_suffix is appended to that branch name; without it both entries
# would produce the branch "update/LLAMA_VERSION".
- repository: "TheTom/llama-cpp-turboquant"
variable: "LLAMA_VERSION"
branch: "master"
file: "backend/cpp/llama-cpp-tq/Makefile"
branch_suffix: "-tq"
- repository: "ggml-org/whisper.cpp"
variable: "WHISPER_CPP_VERSION"
branch: "master"
@@ -60,7 +65,7 @@ jobs:
push-to-fork: ci-forks/LocalAI
commit-message: ':arrow_up: Update ${{ matrix.repository }}'
title: 'chore: :arrow_up: Update ${{ matrix.repository }} to `${{ steps.bump.outputs.commit }}`'
branch: "update/${{ matrix.variable }}"
branch: "update/${{ matrix.variable }}${{ matrix.branch_suffix }}"
body: ${{ steps.bump.outputs.message }}
signoff: true

1
.gitignore vendored
View File

@@ -9,6 +9,7 @@ prepare-sources
/backend/cpp/llama-cpp/llama.cpp
/backend/cpp/llama-*
!backend/cpp/llama-cpp
!backend/cpp/llama-cpp-tq
/backends
/backend-images
/result.yaml

View File

@@ -544,8 +544,9 @@ backend-images:
mkdir -p backend-images
# Backend metadata: BACKEND_NAME | DOCKERFILE_TYPE | BUILD_CONTEXT | PROGRESS_FLAG | NEEDS_BACKEND_ARG
# llama-cpp is special - uses llama-cpp Dockerfile and doesn't need BACKEND arg
# llama-cpp and forks - use llama-cpp Dockerfile
BACKEND_LLAMA_CPP = llama-cpp|llama-cpp|.|false|false
BACKEND_LLAMA_CPP_TQ = llama-cpp-tq|llama-cpp|.|false|true
# Golang backends
BACKEND_PIPER = piper|golang|.|false|true
@@ -609,6 +610,7 @@ endef
# Generate all docker-build targets
$(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP)))
$(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP_TQ)))
$(eval $(call generate-docker-build-target,$(BACKEND_PIPER)))
$(eval $(call generate-docker-build-target,$(BACKEND_LOCAL_STORE)))
$(eval $(call generate-docker-build-target,$(BACKEND_HUGGINGFACE)))

View File

@@ -42,16 +42,38 @@ Created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
> [:book: Documentation](https://localai.io/) | [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) | [💻 Quickstart](https://localai.io/basics/getting_started/) | [🖼️ Models](https://models.localai.io/) | [❓FAQ](https://localai.io/faq/)
## Screenshots
### Chat, Model gallery
## Guided tour
https://github.com/user-attachments/assets/08cbb692-57da-48f7-963d-2e7b43883c18
### Agents
<details>
<summary>
Click to see more!
</summary>
#### User and auth
https://github.com/user-attachments/assets/228fa9ad-81a3-4d43-bfb9-31557e14a36c
#### Agents
https://github.com/user-attachments/assets/6270b331-e21d-4087-a540-6290006b381a
#### Usage metrics per user
https://github.com/user-attachments/assets/cbb03379-23b4-4e3d-bd26-d152f057007f
#### Fine-tuning and Quantization
https://github.com/user-attachments/assets/5ba4ace9-d3df-4795-b7d4-b0b404ea71ee
#### WebRTC
https://github.com/user-attachments/assets/ed88e34c-fed3-4b83-8a67-4716a9feeb7b
</details>
## Quickstart
### macOS

View File

@@ -58,7 +58,9 @@ ARG CUDA_DOCKER_ARCH
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
ARG CMAKE_ARGS
ENV CMAKE_ARGS=${CMAKE_ARGS}
ARG BACKEND=rerankers
ARG BACKEND=llama-cpp
ARG LLAMA_BACKEND_DIR=${BACKEND}
ENV LLAMA_BACKEND_DIR=${LLAMA_BACKEND_DIR}
ARG BUILD_TYPE
ENV BUILD_TYPE=${BUILD_TYPE}
ARG CUDA_MAJOR_VERSION
@@ -255,32 +257,27 @@ if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
CUDA_ARCH_ESC="${CUDA_DOCKER_ARCH//;/\\;}"
export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
echo "CMAKE_ARGS(env) = ${CMAKE_ARGS}"
rm -rf /LocalAI/backend/cpp/llama-cpp-*-build
rm -rf /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR}-*-build
fi
cd /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR}
if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
cd /LocalAI/backend/cpp/llama-cpp
make llama-cpp-fallback
make llama-cpp-grpc
make llama-cpp-rpc-server
make ARCH=aarch64 build-variants
else
cd /LocalAI/backend/cpp/llama-cpp
make llama-cpp-avx
make llama-cpp-avx2
make llama-cpp-avx512
make llama-cpp-fallback
make llama-cpp-grpc
make llama-cpp-rpc-server
make build-variants
fi
EOT
# Copy libraries using a script to handle architecture differences
RUN make -BC /LocalAI/backend/cpp/llama-cpp package
RUN make -BC /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR} package
FROM scratch
ARG BACKEND=llama-cpp
ARG LLAMA_BACKEND_DIR=${BACKEND}
# Copy all available binaries (the build process only creates the appropriate ones for the target architecture)
COPY --from=builder /LocalAI/backend/cpp/llama-cpp/package/. ./
COPY --from=builder /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR}/package/. ./

View File

@@ -0,0 +1,6 @@
# llama-cpp-tq: thin wrapper that builds the TurboQuant fork by reusing the
# llama-cpp backend's Makefile. The ?= defaults below are assigned before the
# include, so the included Makefile's own ?= assignments for the same
# variables (LLAMA_VERSION, LLAMA_REPO, BACKEND_NAME, SHARED_DIR) do not
# replace them.
# NOTE(review): LLAMA_VERSION defaults to a branch name rather than a commit
# hash, so a build is not reproducible until the version is pinned.
LLAMA_VERSION?=master
LLAMA_REPO?=https://github.com/TheTom/llama-cpp-turboquant
BACKEND_NAME?=llama-cpp-tq
# Directory that provides the shared prepare.sh / package.sh scripts.
SHARED_DIR?=$(CURDIR)/../llama-cpp
# The path is relative to the working directory, so run make from this directory.
include ../llama-cpp/Makefile

View File

@@ -59,6 +59,11 @@ add_library(hw_grpc_proto
add_executable(${TARGET} grpc-server.cpp json.hpp httplib.h)
# Enable autoparser support if the header exists (not present in all llama.cpp forks).
# Two locations are probed:
#   - this directory, in case the header was copied in with the server sources;
#   - <llama.cpp root>/common, reachable because ${CMAKE_SOURCE_DIR} is the
#     top-level source directory of the build.
# Probing only the current directory would leave HAS_AUTOPARSER undefined when
# the header is not among the files copied here, which silently compiles the
# autoparser analysis out of grpc-server.cpp.
# NOTE(review): upstream llama.cpp appears to ship the header as
# common/chat-auto-parser.h; confirm against the pinned LLAMA_VERSION.
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/chat-auto-parser.h"
   OR EXISTS "${CMAKE_SOURCE_DIR}/common/chat-auto-parser.h")
    target_compile_definitions(${TARGET} PRIVATE HAS_AUTOPARSER)
endif()
target_include_directories(${TARGET} PRIVATE ../llava)
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})

View File

@@ -1,6 +1,10 @@
LLAMA_VERSION?=95a6ebabb277c4cc18247e7bc2a5502133caca63
LLAMA_VERSION?=a1cfb645307edc61a89e41557f290f441043d3c2
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
BACKEND_NAME?=llama-cpp
SHARED_DIR?=$(CURDIR)
GRPC_SERVER_DIR?=tools/grpc-server
SERVER_SOURCE_DIR?=tools/server
CMAKE_ARGS?=
BUILD_TYPE?=
@@ -67,6 +71,17 @@ ifeq ($(BUILD_TYPE),sycl_f32)
-DCMAKE_CXX_FLAGS="-fsycl"
endif
# Variants to build for each architecture (can be overridden by forks).
# Each name in these lists must be a target defined in this Makefile.
X86_64_VARIANTS ?= llama-cpp-avx llama-cpp-avx2 llama-cpp-avx512 llama-cpp-fallback llama-cpp-grpc llama-cpp-rpc-server
ARM64_VARIANTS ?= llama-cpp-fallback llama-cpp-grpc llama-cpp-rpc-server

# build-variants: build every variant for the selected architecture, in list
# order, stopping at the first failure.
#   make build-variants               -> builds X86_64_VARIANTS
#   make ARCH=aarch64 build-variants  -> builds ARM64_VARIANTS
# The target produces no file called "build-variants", so it is declared phony;
# otherwise a stray file or directory with that name would make it a no-op.
.PHONY: build-variants
build-variants:
ifeq ($(ARCH),aarch64)
	@for v in $(ARM64_VARIANTS); do $(MAKE) $$v || exit 1; done
else
	@for v in $(X86_64_VARIANTS); do $(MAKE) $$v || exit 1; done
endif
INSTALLED_PACKAGES=$(CURDIR)/../grpc/installed_packages
INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
@@ -90,42 +105,42 @@ else
endif
llama-cpp-avx2: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build purge
$(info ${GREEN}I llama-cpp build info:avx2${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx2-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build/grpc-server llama-cpp-avx2
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="$(BACKEND_NAME)-avx2-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build/grpc-server llama-cpp-avx2
llama-cpp-avx512: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build purge
$(info ${GREEN}I llama-cpp build info:avx512${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx512-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build/grpc-server llama-cpp-avx512
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="$(BACKEND_NAME)-avx512-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build/grpc-server llama-cpp-avx512
llama-cpp-avx: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build purge
$(info ${GREEN}I llama-cpp build info:avx${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-avx-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build/grpc-server llama-cpp-avx
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="$(BACKEND_NAME)-avx-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build/grpc-server llama-cpp-avx
llama-cpp-fallback: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build purge
$(info ${GREEN}I llama-cpp build info:fallback${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-fallback-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build/grpc-server llama-cpp-fallback
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="$(BACKEND_NAME)-fallback-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build/grpc-server llama-cpp-fallback
llama-cpp-grpc: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build purge
$(info ${GREEN}I llama-cpp build info:grpc${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-cpp-grpc-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/grpc-server llama-cpp-grpc
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="$(BACKEND_NAME)-grpc-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build/grpc-server llama-cpp-grpc
llama-cpp-rpc-server: llama-cpp-grpc
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server
llama.cpp:
mkdir -p llama.cpp
@@ -133,30 +148,30 @@ llama.cpp:
git init && \
git remote add origin $(LLAMA_REPO) && \
git fetch origin && \
git checkout -b build $(LLAMA_VERSION) && \
(git checkout -b build $(LLAMA_VERSION) || git checkout -b build origin/$(LLAMA_VERSION)) && \
git submodule update --init --recursive --depth 1 --single-branch
llama.cpp/tools/grpc-server: llama.cpp
mkdir -p llama.cpp/tools/grpc-server
bash prepare.sh
llama.cpp/$(GRPC_SERVER_DIR): llama.cpp
mkdir -p llama.cpp/$(GRPC_SERVER_DIR)
SHARED_DIR=$(SHARED_DIR) SERVER_SOURCE_DIR=$(SERVER_SOURCE_DIR) GRPC_SERVER_DIR=$(GRPC_SERVER_DIR) bash $(SHARED_DIR)/prepare.sh
rebuild:
bash prepare.sh
SHARED_DIR=$(SHARED_DIR) SERVER_SOURCE_DIR=$(SERVER_SOURCE_DIR) GRPC_SERVER_DIR=$(GRPC_SERVER_DIR) bash $(SHARED_DIR)/prepare.sh
rm -rf grpc-server
$(MAKE) grpc-server
package:
bash package.sh
bash $(SHARED_DIR)/package.sh
purge:
rm -rf llama.cpp/build
rm -rf llama.cpp/tools/grpc-server
rm -rf llama.cpp/$(GRPC_SERVER_DIR)
rm -rf grpc-server
clean: purge
rm -rf llama.cpp
grpc-server: llama.cpp llama.cpp/tools/grpc-server
grpc-server: llama.cpp llama.cpp/$(GRPC_SERVER_DIR)
@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+bash -c "source $(ONEAPI_VARS); \

View File

@@ -17,7 +17,9 @@
#include "backend.pb.h"
#include "backend.grpc.pb.h"
#include "common.h"
#ifdef HAS_AUTOPARSER
#include "chat-auto-parser.h"
#endif
#include <getopt.h>
#include <grpcpp/ext/proto_server_reflection_plugin.h>
#include <grpcpp/grpcpp.h>
@@ -2665,6 +2667,7 @@ public:
response->set_rendered_template(rendered_template);
#ifdef HAS_AUTOPARSER
// Run differential template analysis to detect tool format markers
if (params_base.use_jinja) {
try {
@@ -2770,6 +2773,7 @@ public:
SRV_WRN("ModelMetadata: failed to run autoparser analysis: %s\n", e.what());
}
}
#endif
return grpc::Status::OK;
}

View File

@@ -5,14 +5,21 @@
set -e
CURDIR=$(dirname "$(realpath $0)")
REPO_ROOT="${CURDIR}/../../.."
# Use working directory (not script location) so forks that share this script work correctly
CURDIR=$(pwd)
SCRIPT_DIR=$(dirname "$(realpath $0)")
REPO_ROOT="${SCRIPT_DIR}/../../.."
# Create lib directory
mkdir -p $CURDIR/package/lib
cp -avrf $CURDIR/llama-cpp-* $CURDIR/package/
cp -rfv $CURDIR/run.sh $CURDIR/package/
# Copy run.sh — prefer local copy, fall back to shared dir (script location)
if [ -f "$CURDIR/run.sh" ]; then
cp -rfv $CURDIR/run.sh $CURDIR/package/
else
cp -rfv $SCRIPT_DIR/run.sh $CURDIR/package/
fi
# Detect architecture and copy appropriate libraries
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then

View File

@@ -1,31 +1,43 @@
#!/bin/bash
## Patches
SHARED_DIR="${SHARED_DIR:-.}"
SERVER_SOURCE_DIR="${SERVER_SOURCE_DIR:-tools/server}"
GRPC_SERVER_DIR="${GRPC_SERVER_DIR:-tools/grpc-server}"
## Apply patches from the `patches` directory
if [ -d "patches" ]; then
for patch in $(ls patches); do
echo "Applying patch $patch"
patch -d llama.cpp/ -p1 < patches/$patch
done
done
fi
set -e
for file in $(ls llama.cpp/tools/server/); do
cp -rfv llama.cpp/tools/server/$file llama.cpp/tools/grpc-server/
# Copy server source files into grpc-server build directory
for file in $(ls llama.cpp/${SERVER_SOURCE_DIR}/); do
cp -rfv llama.cpp/${SERVER_SOURCE_DIR}/$file llama.cpp/${GRPC_SERVER_DIR}/
done
cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/tools/grpc-server/
cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/tools/grpc-server/
# Copy build files — prefer local overrides, fall back to SHARED_DIR
for f in CMakeLists.txt grpc-server.cpp; do
if [ -f "$f" ]; then
cp -r "$f" llama.cpp/${GRPC_SERVER_DIR}/
else
cp -r "$SHARED_DIR/$f" llama.cpp/${GRPC_SERVER_DIR}/
fi
done
cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/${GRPC_SERVER_DIR}/
cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/${GRPC_SERVER_DIR}/
# Add grpc-server subdirectory to the parent CMakeLists.txt
PARENT_CMAKELISTS="llama.cpp/$(dirname ${GRPC_SERVER_DIR})/CMakeLists.txt"
set +e
if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then
if grep -q "grpc-server" "$PARENT_CMAKELISTS"; then
echo "grpc-server already added"
else
echo "add_subdirectory(grpc-server)" >> llama.cpp/tools/CMakeLists.txt
echo "add_subdirectory(grpc-server)" >> "$PARENT_CMAKELISTS"
fi
set -e

View File

@@ -29,6 +29,34 @@
nvidia-cuda-12: "cuda12-llama-cpp"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-llama-cpp"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp"
# Meta entry for the TurboQuant llama.cpp fork. The llamacpp_tq anchor is
# reused by the concrete per-platform entries via `!!merge <<: *llamacpp_tq`.
- &llamacpp_tq
name: "llama-cpp-tq"
alias: "llama-cpp-tq"
license: mit
description: |
TurboQuant llama.cpp fork - quantization research
urls:
- https://github.com/TheTom/llama-cpp-turboquant
tags:
- text-to-text
- LLM
- CPU
- GPU
- Metal
- CUDA
- HIP
# Each value below names one of the concrete llama-cpp-tq entries in this file.
# NOTE(review): the CI build matrix for llama-cpp-tq lists CPU, CUDA 12/13,
# L4T, ROCm and Vulkan jobs only. Confirm that images for
# "intel-sycl-f16-llama-cpp-tq" and "metal-llama-cpp-tq" are actually
# published before advertising them here.
capabilities:
default: "cpu-llama-cpp-tq"
nvidia: "cuda12-llama-cpp-tq"
intel: "intel-sycl-f16-llama-cpp-tq"
amd: "rocm-llama-cpp-tq"
metal: "metal-llama-cpp-tq"
vulkan: "vulkan-llama-cpp-tq"
nvidia-l4t: "nvidia-l4t-arm64-llama-cpp-tq"
nvidia-cuda-13: "cuda13-llama-cpp-tq"
nvidia-cuda-12: "cuda12-llama-cpp-tq"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-llama-cpp-tq"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp-tq"
- &whispercpp
name: "whisper"
alias: "whisper"
@@ -1252,6 +1280,57 @@
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-llama-cpp"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-llama-cpp
# llama-cpp-tq (TurboQuant) concrete backends
# Each entry merges the shared metadata from the llamacpp_tq anchor and sets
# its own name, image uri and mirror list.
# NOTE(review): only `latest-` image tags are listed in this group; there are
# no `master-` counterparts. Confirm that this is intentional.
- !!merge <<: *llamacpp_tq
name: "cpu-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-cpu-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "cuda12-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-12-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "cuda13-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-13-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "rocm-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-rocm-hipblas-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "intel-sycl-f16-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-intel-sycl-f16-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "intel-sycl-f32-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-intel-sycl-f32-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "vulkan-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-vulkan-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "metal-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "nvidia-l4t-arm64-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-arm64-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "cuda13-nvidia-l4t-arm64-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-llama-cpp-tq
## whisper
- !!merge <<: *whispercpp
name: "nvidia-l4t-arm64-whisper"

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "GPU acceleration"
title = "GPU Acceleration"
weight = 9
url = "/features/gpu-acceleration/"
+++

View File

@@ -27,8 +27,7 @@ LocalAI provides a comprehensive set of features for running AI models locally.
- **[Realtime API](openai-realtime/)** - Low-latency multi-modal conversations (voice+text) over WebSocket
- **[Constrained Grammars](constrained_grammars/)** - Control model output format with BNF grammars
- **[GPU Acceleration](GPU-acceleration/)** - Optimize performance with GPU support
- **[Distributed Inference](distributed_inferencing/)** - Scale inference across multiple nodes
- **[Distributed Mode](distributed-mode/)** - Horizontal scaling with PostgreSQL, NATS, and remote backend nodes
- **[Distribution](distribution/)** - Scale inference across multiple nodes (P2P federation or production distributed mode)
- **[P2P API](p2p/)** - Monitor and manage P2P worker and federated nodes
- **[Model Context Protocol (MCP)](mcp/)** - Enable agentic capabilities with MCP integration
- **[Agents](agents/)** - Autonomous AI agents with tools, knowledge base, and skills

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "🤖 Agents"
title = "Agents"
weight = 21
url = '/features/agents'
+++

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "🔈 Audio to text"
title = "Audio to Text"
weight = 16
url = "/features/audio-to-text/"
+++

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "🔐 Authentication & Authorization"
title = "Authentication & Authorization"
weight = 26
url = '/features/authentication'
+++

View File

@@ -1,5 +1,5 @@
---
title: "⚙️ Backends"
title: "Backends"
description: "Learn how to use, manage, and develop backends in LocalAI"
weight: 4
url: "/backends/"

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "✍️ Constrained Grammars"
title = "Constrained Grammars"
weight = 15
url = "/features/constrained_grammars/"
+++

View File

@@ -5,7 +5,7 @@ weight = 14
url = "/features/distributed-mode/"
+++
Distributed mode enables horizontal scaling of LocalAI across multiple machines using **PostgreSQL** for state and node registry, and **NATS** for real-time coordination. Unlike the [P2P/federation approach](/features/distribute/), distributed mode is designed for production deployments and Kubernetes environments where you need centralized management, health monitoring, and deterministic routing.
Distributed mode enables horizontal scaling of LocalAI across multiple machines using **PostgreSQL** for state and node registry, and **NATS** for real-time coordination. Unlike the [P2P/federation approach]({{% relref "features/distributed_inferencing" %}}), distributed mode is designed for production deployments and Kubernetes environments where you need centralized management, health monitoring, and deterministic routing.
{{% notice note %}}
Distributed mode requires authentication enabled with a **PostgreSQL** database — SQLite is not supported. This is because the node registry, job store, and other distributed state are stored in PostgreSQL tables.

View File

@@ -1,12 +1,12 @@
+++
disableToc = false
title = "🆕🖧 Distributed Inference"
title = "P2P / Federated Inference"
weight = 15
url = "/features/distribute/"
+++
{{% notice tip %}}
Looking for production-grade horizontal scaling with PostgreSQL and NATS? See [Distributed Mode](/features/distributed-mode/).
Looking for production-grade horizontal scaling with PostgreSQL and NATS? See [Distributed Mode]({{% relref "features/distributed-mode" %}}).
{{% /notice %}}
This functionality enables LocalAI to distribute inference requests across multiple worker nodes, improving efficiency and performance. Nodes are discovered automatically and connect to each other over P2P using a shared token, which ensures that communication between the nodes of the network is secure and private.

View File

@@ -0,0 +1,34 @@
+++
disableToc = false
title = "Distribution"
weight = 13
url = "/features/distribution/"
+++
LocalAI supports distributing inference workloads across multiple machines. There are two approaches, each suited to different use cases:
## Distributed Mode (PostgreSQL + NATS)
Production-grade horizontal scaling with centralized management. Frontends are stateless LocalAI instances behind a load balancer; workers self-register and receive backends dynamically via NATS. State lives in PostgreSQL.
**Best for:** production deployments, Kubernetes, managed infrastructure.
[Read more]({{% relref "features/distributed-mode" %}})
## P2P / Federated Inference
Peer-to-peer networking via libp2p. Share a token to form a cluster with automatic discovery — no central server required. Supports federated load balancing and worker-mode weight sharding.
**Best for:** ad-hoc clusters, community sharing, quick experimentation.
[Read more]({{% relref "features/distributed_inferencing" %}})
## Quick Comparison
| | P2P / Federation | Distributed Mode |
|---|---|---|
| **Discovery** | Automatic via libp2p token | Self-registration to frontend URL |
| **State storage** | In-memory / ledger | PostgreSQL |
| **Coordination** | Gossip protocol | NATS messaging |
| **Node management** | Automatic | REST API + WebUI |
| **Setup complexity** | Minimal (share a token) | Requires PostgreSQL + NATS |

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "🧠 Embeddings"
title = "Embeddings"
weight = 13
url = "/features/embeddings/"
+++

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "🥽 GPT Vision"
title = "GPT Vision"
weight = 14
url = "/features/gpt-vision/"
+++

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "🎨 Image generation"
title = "Image Generation"
weight = 12
url = "/features/image-generation/"
+++

View File

@@ -1,5 +1,5 @@
+++
title = "🔗 Model Context Protocol (MCP)"
title = "Model Context Protocol (MCP)"
weight = 20
toc = true
description = "Agentic capabilities with Model Context Protocol integration"

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "🖼️ Model gallery"
title = "Model Gallery"
weight = 18
url = '/models'
+++

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "🔍 Object detection"
title = "Object Detection"
weight = 13
url = "/features/object-detection/"
+++

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "🔥 OpenAI functions and tools"
title = "OpenAI Functions and Tools"
weight = 17
url = "/features/openai-functions/"
+++

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "📈 Reranker"
title = "Reranker"
weight = 11
url = "/features/reranker/"
+++

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "⚙️ Runtime Settings"
title = "Runtime Settings"
weight = 25
url = '/features/runtime-settings'
+++

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "💾 Stores"
title = "Stores"
weight = 18
url = '/stores'
+++

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "📖 Text generation (GPT)"
title = "Text Generation (GPT)"
weight = 10
url = "/features/text-generation/"
+++

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "🗣 Text to audio (TTS)"
title = "Text to Audio (TTS)"
weight = 11
url = "/features/text-to-audio/"
+++

View File

@@ -119,7 +119,7 @@ For production deployments or when you need more compute, LocalAI supports distr
- **P2P federation**: Connect multiple LocalAI instances for load-balanced inference
- **Model sharding**: Split large models across multiple machines
See the **Nodes** page in the web interface or the [Distribution docs]({{% relref "features/distribute" %}}) for setup instructions.
See the **Nodes** page in the web interface or the [Distribution docs]({{% relref "features/distribution" %}}) for setup instructions.
## What's Next?

View File

@@ -72,9 +72,10 @@ Feel free to open up a Pull request (by clicking at the "Edit page" below) to ge
### Home Automation
- [hass-openai-custom-conversation](https://github.com/drndos/hass-openai-custom-conversation) — Home Assistant integration
- [ha-llmvision](https://github.com/valentinfrlch/ha-llmvision) — Home Assistant LLM Vision
- [HA-LocalAI-Monitor](https://github.com/loryanstrant/HA-LocalAI-Monitor) — Home Assistant monitoring
- [Extended OpenAI Conversation](https://github.com/jekalmin/extended_openai_conversation) — Conversation agent for Home Assistant that supports a custom OpenAI endpoint
- [LLM Vision](https://github.com/valentinfrlch/ha-llmvision) — Image & video feed analysis for Home Assistant
- [OpenAI TTS Speech Service](https://github.com/sfortis/openai_tts) — OpenAI TTS custom component for Home Assistant
- [LocalAI Monitor](https://github.com/loryanstrant/HA-LocalAI-Monitor) — Monitor & control of LocalAI from Home Assistant
- Nextcloud [integration plugin](https://apps.nextcloud.com/apps/integration_openai) and [AI assistant](https://apps.nextcloud.com/apps/assistant)
### Automation & DevOps