Compare commits

..

3 Commits

Author SHA1 Message Date
Ettore Di Giacinto
659636195c deterministic builds
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-04-01 19:45:31 +00:00
Ettore Di Giacinto
a7a142b651 refactor, macOS fixes
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-04-01 19:42:16 +00:00
Ettore Di Giacinto
e502e51d78 feat(llama.cpp): add turboquant support
This PR adds the patchset from the great work of @TheTom in
https://github.com/TheTom/llama-cpp-turboquant and creates a pipeline
that updates the patches against upstream automatically.

It also creates the necessary scaffolding for doing this with other patch
sources.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-04-01 17:57:03 +00:00
45 changed files with 270 additions and 436 deletions

View File

@@ -133,7 +133,6 @@ func getRealReadme(ctx context.Context, repository string) (string, error) {
result, err := cogito.ExecuteTools(llm, fragment,
cogito.WithIterations(3),
cogito.WithMaxAttempts(3),
cogito.DisableSinkState,
cogito.WithTools(&HFReadmeTool{client: hfapi.NewClient()}))
if err != nil {
return "", err

View File

@@ -79,20 +79,7 @@ func generateYAMLEntry(model ProcessedModel, quantization string) string {
description = cleanTextContent(description)
formattedDescription := formatTextContent(description)
// Strip name and description from config file since they are
// already present at the gallery entry level and should not
// appear under overrides.
configFileContent := modelConfig.ConfigFile
var cfgMap map[string]any
if err := yaml.Unmarshal([]byte(configFileContent), &cfgMap); err == nil {
delete(cfgMap, "name")
delete(cfgMap, "description")
if cleaned, err := yaml.Marshal(cfgMap); err == nil {
configFileContent = string(cleaned)
}
}
configFile := formatTextContent(configFileContent)
configFile := formatTextContent(modelConfig.ConfigFile)
filesYAML, _ := yaml.Marshal(modelConfig.Files)

View File

@@ -17,7 +17,7 @@ func runSyntheticMode() error {
fmt.Printf("Generating %d synthetic models for testing...\n", numModels)
var models []ProcessedModel
for range numModels {
for i := range numModels {
model := generator.GenerateProcessedModel()
models = append(models, model)
fmt.Printf("Generated synthetic model: %s\n", model.ModelID)

View File

@@ -1828,98 +1828,6 @@ jobs:
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
# llama-cpp-tq (TurboQuant fork)
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64,linux/arm64'
tag-latest: 'auto'
tag-suffix: '-cpu-llama-cpp-tq'
runs-on: 'bigger-runner'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-llama-cpp-tq'
runs-on: 'bigger-runner'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-13-llama-cpp-tq'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'false'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-cuda-13-arm64-llama-cpp-tq'
base-image: "ubuntu:24.04"
runs-on: 'ubuntu-24.04-arm'
ubuntu-version: '2404'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-rocm-hipblas-llama-cpp-tq'
runs-on: 'ubuntu-latest'
base-image: "rocm/dev-ubuntu-24.04:6.4.4"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'false'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-arm64-llama-cpp-tq'
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
runs-on: 'ubuntu-24.04-arm'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2204'
- build-type: 'vulkan'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64,linux/arm64'
tag-latest: 'auto'
tag-suffix: '-gpu-vulkan-llama-cpp-tq'
runs-on: 'bigger-runner'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
# Stablediffusion-ggml
- build-type: ''
cuda-major-version: ""

View File

@@ -15,10 +15,9 @@ jobs:
branch: "master"
file: "backend/cpp/llama-cpp/Makefile"
- repository: "TheTom/llama-cpp-turboquant"
variable: "LLAMA_VERSION"
branch: "master"
file: "backend/cpp/llama-cpp-tq/Makefile"
branch_suffix: "-tq"
variable: "TURBOQUANT_VERSION"
branch: "feature/turboquant-kv-cache"
file: "backend/cpp/llama-cpp/Makefile"
- repository: "ggml-org/whisper.cpp"
variable: "WHISPER_CPP_VERSION"
branch: "master"
@@ -65,9 +64,6 @@ jobs:
push-to-fork: ci-forks/LocalAI
commit-message: ':arrow_up: Update ${{ matrix.repository }}'
title: 'chore: :arrow_up: Update ${{ matrix.repository }} to `${{ steps.bump.outputs.commit }}`'
branch: "update/${{ matrix.variable }}${{ matrix.branch_suffix }}"
branch: "update/${{ matrix.variable }}"
body: ${{ steps.bump.outputs.message }}
signoff: true

View File

@@ -55,7 +55,7 @@ jobs:
- name: Run gallery agent
env:
#OPENAI_MODEL: ${{ secrets.OPENAI_MODEL }}
OPENAI_MODEL: Qwen3.5-2B-GGUF
OPENAI_MODE: Qwen3.5-2B-GGUF
OPENAI_BASE_URL: "http://localhost:8080"
OPENAI_KEY: ${{ secrets.OPENAI_KEY }}
#OPENAI_BASE_URL: ${{ secrets.OPENAI_BASE_URL }}

1
.gitignore vendored
View File

@@ -9,7 +9,6 @@ prepare-sources
/backend/cpp/llama-cpp/llama.cpp
/backend/cpp/llama-*
!backend/cpp/llama-cpp
!backend/cpp/llama-cpp-tq
/backends
/backend-images
/result.yaml

View File

@@ -544,9 +544,8 @@ backend-images:
mkdir -p backend-images
# Backend metadata: BACKEND_NAME | DOCKERFILE_TYPE | BUILD_CONTEXT | PROGRESS_FLAG | NEEDS_BACKEND_ARG
# llama-cpp and forks - use llama-cpp Dockerfile
# llama-cpp is special - uses llama-cpp Dockerfile and doesn't need BACKEND arg
BACKEND_LLAMA_CPP = llama-cpp|llama-cpp|.|false|false
BACKEND_LLAMA_CPP_TQ = llama-cpp-tq|llama-cpp|.|false|true
# Golang backends
BACKEND_PIPER = piper|golang|.|false|true
@@ -610,7 +609,6 @@ endef
# Generate all docker-build targets
$(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP)))
$(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP_TQ)))
$(eval $(call generate-docker-build-target,$(BACKEND_PIPER)))
$(eval $(call generate-docker-build-target,$(BACKEND_LOCAL_STORE)))
$(eval $(call generate-docker-build-target,$(BACKEND_HUGGINGFACE)))

View File

@@ -42,38 +42,16 @@ Created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
> [:book: Documentation](https://localai.io/) | [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) | [💻 Quickstart](https://localai.io/basics/getting_started/) | [🖼️ Models](https://models.localai.io/) | [❓FAQ](https://localai.io/faq/)
## Guided tour
## Screenshots
### Chat, Model gallery
https://github.com/user-attachments/assets/08cbb692-57da-48f7-963d-2e7b43883c18
<details>
<summary>
Click to see more!
</summary>
#### User and auth
https://github.com/user-attachments/assets/228fa9ad-81a3-4d43-bfb9-31557e14a36c
#### Agents
### Agents
https://github.com/user-attachments/assets/6270b331-e21d-4087-a540-6290006b381a
#### Usage metrics per user
https://github.com/user-attachments/assets/cbb03379-23b4-4e3d-bd26-d152f057007f
#### Fine-tuning and Quantization
https://github.com/user-attachments/assets/5ba4ace9-d3df-4795-b7d4-b0b404ea71ee
#### WebRTC
https://github.com/user-attachments/assets/ed88e34c-fed3-4b83-8a67-4716a9feeb7b
</details>
## Quickstart
### macOS

View File

@@ -58,9 +58,7 @@ ARG CUDA_DOCKER_ARCH
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
ARG CMAKE_ARGS
ENV CMAKE_ARGS=${CMAKE_ARGS}
ARG BACKEND=llama-cpp
ARG LLAMA_BACKEND_DIR=${BACKEND}
ENV LLAMA_BACKEND_DIR=${LLAMA_BACKEND_DIR}
ARG BACKEND=rerankers
ARG BUILD_TYPE
ENV BUILD_TYPE=${BUILD_TYPE}
ARG CUDA_MAJOR_VERSION
@@ -257,27 +255,32 @@ if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
CUDA_ARCH_ESC="${CUDA_DOCKER_ARCH//;/\\;}"
export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
echo "CMAKE_ARGS(env) = ${CMAKE_ARGS}"
rm -rf /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR}-*-build
rm -rf /LocalAI/backend/cpp/llama-cpp-*-build
fi
cd /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR}
if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
make ARCH=aarch64 build-variants
cd /LocalAI/backend/cpp/llama-cpp
make llama-cpp-fallback
make llama-cpp-grpc
make llama-cpp-rpc-server
else
make build-variants
cd /LocalAI/backend/cpp/llama-cpp
make llama-cpp-avx
make llama-cpp-avx2
make llama-cpp-avx512
make llama-cpp-fallback
make llama-cpp-grpc
make llama-cpp-rpc-server
fi
EOT
# Copy libraries using a script to handle architecture differences
RUN make -BC /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR} package
RUN make -BC /LocalAI/backend/cpp/llama-cpp package
FROM scratch
ARG BACKEND=llama-cpp
ARG LLAMA_BACKEND_DIR=${BACKEND}
# Copy all available binaries (the build process only creates the appropriate ones for the target architecture)
COPY --from=builder /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR}/package/. ./
COPY --from=builder /LocalAI/backend/cpp/llama-cpp/package/. ./

View File

@@ -1,6 +0,0 @@
LLAMA_VERSION?=master
LLAMA_REPO?=https://github.com/TheTom/llama-cpp-turboquant
BACKEND_NAME?=llama-cpp-tq
SHARED_DIR?=$(CURDIR)/../llama-cpp
include ../llama-cpp/Makefile

View File

@@ -59,11 +59,6 @@ add_library(hw_grpc_proto
add_executable(${TARGET} grpc-server.cpp json.hpp httplib.h)
# Enable autoparser support if the header exists (not present in all llama.cpp forks)
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/chat-auto-parser.h")
target_compile_definitions(${TARGET} PRIVATE HAS_AUTOPARSER)
endif()
target_include_directories(${TARGET} PRIVATE ../llava)
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})

View File

@@ -1,10 +1,8 @@
LLAMA_VERSION?=a1cfb645307edc61a89e41557f290f441043d3c2
LLAMA_VERSION?=0fcb3760b2b9a3a496ef14621a7e4dad7a8df90f
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
BACKEND_NAME?=llama-cpp
SHARED_DIR?=$(CURDIR)
GRPC_SERVER_DIR?=tools/grpc-server
SERVER_SOURCE_DIR?=tools/server
TURBOQUANT_VERSION?=8ad0f00e9a38df6c29fc10363341dde300f92ae4
CMAKE_ARGS?=
BUILD_TYPE?=
@@ -71,17 +69,6 @@ ifeq ($(BUILD_TYPE),sycl_f32)
-DCMAKE_CXX_FLAGS="-fsycl"
endif
# Variants to build for each architecture (can be overridden by forks)
X86_64_VARIANTS ?= llama-cpp-avx llama-cpp-avx2 llama-cpp-avx512 llama-cpp-fallback llama-cpp-grpc llama-cpp-rpc-server
ARM64_VARIANTS ?= llama-cpp-fallback llama-cpp-grpc llama-cpp-rpc-server
build-variants:
ifeq ($(ARCH),aarch64)
@for v in $(ARM64_VARIANTS); do $(MAKE) $$v || exit 1; done
else
@for v in $(X86_64_VARIANTS); do $(MAKE) $$v || exit 1; done
endif
INSTALLED_PACKAGES=$(CURDIR)/../grpc/installed_packages
INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
@@ -105,42 +92,42 @@ else
endif
llama-cpp-avx2: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build purge
$(info ${GREEN}I llama-cpp build info:avx2${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="$(BACKEND_NAME)-avx2-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build/grpc-server llama-cpp-avx2
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx2-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build/grpc-server llama-cpp-avx2
llama-cpp-avx512: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build purge
$(info ${GREEN}I llama-cpp build info:avx512${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="$(BACKEND_NAME)-avx512-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build/grpc-server llama-cpp-avx512
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx512-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build/grpc-server llama-cpp-avx512
llama-cpp-avx: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build purge
$(info ${GREEN}I llama-cpp build info:avx${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="$(BACKEND_NAME)-avx-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build/grpc-server llama-cpp-avx
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-avx-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build/grpc-server llama-cpp-avx
llama-cpp-fallback: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build purge
$(info ${GREEN}I llama-cpp build info:fallback${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="$(BACKEND_NAME)-fallback-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build/grpc-server llama-cpp-fallback
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-fallback-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build/grpc-server llama-cpp-fallback
llama-cpp-grpc: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge
$(info ${GREEN}I llama-cpp build info:grpc${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="$(BACKEND_NAME)-grpc-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build/grpc-server llama-cpp-grpc
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-cpp-grpc-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/grpc-server llama-cpp-grpc
llama-cpp-rpc-server: llama-cpp-grpc
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server
llama.cpp:
mkdir -p llama.cpp
@@ -148,30 +135,30 @@ llama.cpp:
git init && \
git remote add origin $(LLAMA_REPO) && \
git fetch origin && \
(git checkout -b build $(LLAMA_VERSION) || git checkout -b build origin/$(LLAMA_VERSION)) && \
git checkout -b build $(LLAMA_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
llama.cpp/$(GRPC_SERVER_DIR): llama.cpp
mkdir -p llama.cpp/$(GRPC_SERVER_DIR)
SHARED_DIR=$(SHARED_DIR) SERVER_SOURCE_DIR=$(SERVER_SOURCE_DIR) GRPC_SERVER_DIR=$(GRPC_SERVER_DIR) bash $(SHARED_DIR)/prepare.sh
llama.cpp/tools/grpc-server: llama.cpp
mkdir -p llama.cpp/tools/grpc-server
bash prepare.sh
rebuild:
SHARED_DIR=$(SHARED_DIR) SERVER_SOURCE_DIR=$(SERVER_SOURCE_DIR) GRPC_SERVER_DIR=$(GRPC_SERVER_DIR) bash $(SHARED_DIR)/prepare.sh
bash prepare.sh
rm -rf grpc-server
$(MAKE) grpc-server
package:
bash $(SHARED_DIR)/package.sh
bash package.sh
purge:
rm -rf llama.cpp/build
rm -rf llama.cpp/$(GRPC_SERVER_DIR)
rm -rf llama.cpp/tools/grpc-server
rm -rf grpc-server
clean: purge
rm -rf llama.cpp
grpc-server: llama.cpp llama.cpp/$(GRPC_SERVER_DIR)
grpc-server: llama.cpp llama.cpp/tools/grpc-server
@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+bash -c "source $(ONEAPI_VARS); \

View File

@@ -17,9 +17,7 @@
#include "backend.pb.h"
#include "backend.grpc.pb.h"
#include "common.h"
#ifdef HAS_AUTOPARSER
#include "chat-auto-parser.h"
#endif
#include <getopt.h>
#include <grpcpp/ext/proto_server_reflection_plugin.h>
#include <grpcpp/grpcpp.h>
@@ -2667,7 +2665,6 @@ public:
response->set_rendered_template(rendered_template);
#ifdef HAS_AUTOPARSER
// Run differential template analysis to detect tool format markers
if (params_base.use_jinja) {
try {
@@ -2773,7 +2770,6 @@ public:
SRV_WRN("ModelMetadata: failed to run autoparser analysis: %s\n", e.what());
}
}
#endif
return grpc::Status::OK;
}

View File

@@ -5,21 +5,14 @@
set -e
# Use working directory (not script location) so forks that share this script work correctly
CURDIR=$(pwd)
SCRIPT_DIR=$(dirname "$(realpath $0)")
REPO_ROOT="${SCRIPT_DIR}/../../.."
CURDIR=$(dirname "$(realpath $0)")
REPO_ROOT="${CURDIR}/../../.."
# Create lib directory
mkdir -p $CURDIR/package/lib
cp -avrf $CURDIR/llama-cpp-* $CURDIR/package/
# Copy run.sh — prefer local copy, fall back to shared dir (script location)
if [ -f "$CURDIR/run.sh" ]; then
cp -rfv $CURDIR/run.sh $CURDIR/package/
else
cp -rfv $SCRIPT_DIR/run.sh $CURDIR/package/
fi
cp -rfv $CURDIR/run.sh $CURDIR/package/
# Detect architecture and copy appropriate libraries
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then

View File

@@ -0,0 +1,14 @@
# Patch sources for the llama-cpp backend.
# Each source declares a fork whose commits are extracted as patches
# and applied on top of upstream llama.cpp during the build.
# See scripts/patch_utils/apply_patches.sh for the generic patch engine.
#
# version_var: Makefile variable with the pinned fork commit SHA
# base_var: Makefile variable with the upstream base commit SHA
# Both are read from version_file (relative to backend dir) to compute the diff.
sources:
- name: turboquant
repo: https://github.com/TheTom/llama-cpp-turboquant.git
version_var: TURBOQUANT_VERSION
base_var: LLAMA_VERSION
version_file: Makefile

View File

@@ -1,43 +1,26 @@
#!/bin/bash
SHARED_DIR="${SHARED_DIR:-.}"
SERVER_SOURCE_DIR="${SERVER_SOURCE_DIR:-tools/server}"
GRPC_SERVER_DIR="${GRPC_SERVER_DIR:-tools/grpc-server}"
## Apply patches from the `patches` directory
if [ -d "patches" ]; then
for patch in $(ls patches); do
echo "Applying patch $patch"
patch -d llama.cpp/ -p1 < patches/$patch
done
fi
set -e
# Copy server source files into grpc-server build directory
for file in $(ls llama.cpp/${SERVER_SOURCE_DIR}/); do
cp -rfv llama.cpp/${SERVER_SOURCE_DIR}/$file llama.cpp/${GRPC_SERVER_DIR}/
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
REPO_ROOT="$SCRIPT_DIR/../../.."
## Apply patches from sources and/or local .patch files
"$REPO_ROOT/scripts/patch_utils/apply_patches.sh" "$SCRIPT_DIR" llama.cpp
## Copy server files into grpc-server build directory
for file in $(ls llama.cpp/tools/server/); do
cp -rfv llama.cpp/tools/server/$file llama.cpp/tools/grpc-server/
done
# Copy build files — prefer local overrides, fall back to SHARED_DIR
for f in CMakeLists.txt grpc-server.cpp; do
if [ -f "$f" ]; then
cp -r "$f" llama.cpp/${GRPC_SERVER_DIR}/
else
cp -r "$SHARED_DIR/$f" llama.cpp/${GRPC_SERVER_DIR}/
fi
done
cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/${GRPC_SERVER_DIR}/
cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/${GRPC_SERVER_DIR}/
# Add grpc-server subdirectory to the parent CMakeLists.txt
PARENT_CMAKELISTS="llama.cpp/$(dirname ${GRPC_SERVER_DIR})/CMakeLists.txt"
cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/tools/grpc-server/
cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/tools/grpc-server/
set +e
if grep -q "grpc-server" "$PARENT_CMAKELISTS"; then
if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then
echo "grpc-server already added"
else
echo "add_subdirectory(grpc-server)" >> "$PARENT_CMAKELISTS"
echo "add_subdirectory(grpc-server)" >> llama.cpp/tools/CMakeLists.txt
fi
set -e

View File

@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
# stablediffusion.cpp (ggml)
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=87ecb95cbc65dc8e58e3d88f4f4a59a0939796f5
STABLEDIFFUSION_GGML_VERSION?=09b12d5f6d51d862749e8e0ee8baac8f012089e2
CMAKE_ARGS+=-DGGML_MAX_NAME=128

View File

@@ -29,34 +29,6 @@
nvidia-cuda-12: "cuda12-llama-cpp"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-llama-cpp"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp"
- &llamacpp_tq
name: "llama-cpp-tq"
alias: "llama-cpp-tq"
license: mit
description: |
TurboQuant llama.cpp fork - quantization research
urls:
- https://github.com/TheTom/llama-cpp-turboquant
tags:
- text-to-text
- LLM
- CPU
- GPU
- Metal
- CUDA
- HIP
capabilities:
default: "cpu-llama-cpp-tq"
nvidia: "cuda12-llama-cpp-tq"
intel: "intel-sycl-f16-llama-cpp-tq"
amd: "rocm-llama-cpp-tq"
metal: "metal-llama-cpp-tq"
vulkan: "vulkan-llama-cpp-tq"
nvidia-l4t: "nvidia-l4t-arm64-llama-cpp-tq"
nvidia-cuda-13: "cuda13-llama-cpp-tq"
nvidia-cuda-12: "cuda12-llama-cpp-tq"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-llama-cpp-tq"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp-tq"
- &whispercpp
name: "whisper"
alias: "whisper"
@@ -1280,57 +1252,6 @@
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-llama-cpp"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-llama-cpp
# llama-cpp-tq (TurboQuant) concrete backends
- !!merge <<: *llamacpp_tq
name: "cpu-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-cpu-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "cuda12-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-12-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "cuda13-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-13-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "rocm-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-rocm-hipblas-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "intel-sycl-f16-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-intel-sycl-f16-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "intel-sycl-f32-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-intel-sycl-f32-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "vulkan-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-vulkan-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "metal-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "nvidia-l4t-arm64-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-arm64-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "cuda13-nvidia-l4t-arm64-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-llama-cpp-tq
## whisper
- !!merge <<: *whispercpp
name: "nvidia-l4t-arm64-whisper"

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "GPU Acceleration"
title = "GPU acceleration"
weight = 9
url = "/features/gpu-acceleration/"
+++

View File

@@ -27,7 +27,8 @@ LocalAI provides a comprehensive set of features for running AI models locally.
- **[Realtime API](openai-realtime/)** - Low-latency multi-modal conversations (voice+text) over WebSocket
- **[Constrained Grammars](constrained_grammars/)** - Control model output format with BNF grammars
- **[GPU Acceleration](GPU-acceleration/)** - Optimize performance with GPU support
- **[Distribution](distribution/)** - Scale inference across multiple nodes (P2P federation or production distributed mode)
- **[Distributed Inference](distributed_inferencing/)** - Scale inference across multiple nodes
- **[Distributed Mode](distributed-mode/)** - Horizontal scaling with PostgreSQL, NATS, and remote backend nodes
- **[P2P API](p2p/)** - Monitor and manage P2P worker and federated nodes
- **[Model Context Protocol (MCP)](mcp/)** - Enable agentic capabilities with MCP integration
- **[Agents](agents/)** - Autonomous AI agents with tools, knowledge base, and skills

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "Agents"
title = "🤖 Agents"
weight = 21
url = '/features/agents'
+++

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "Audio to Text"
title = "🔈 Audio to text"
weight = 16
url = "/features/audio-to-text/"
+++

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "Authentication & Authorization"
title = "🔐 Authentication & Authorization"
weight = 26
url = '/features/authentication'
+++

View File

@@ -1,5 +1,5 @@
---
title: "Backends"
title: "⚙️ Backends"
description: "Learn how to use, manage, and develop backends in LocalAI"
weight: 4
url: "/backends/"

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "Constrained Grammars"
title = "✍️ Constrained Grammars"
weight = 15
url = "/features/constrained_grammars/"
+++

View File

@@ -5,7 +5,7 @@ weight = 14
url = "/features/distributed-mode/"
+++
Distributed mode enables horizontal scaling of LocalAI across multiple machines using **PostgreSQL** for state and node registry, and **NATS** for real-time coordination. Unlike the [P2P/federation approach]({{% relref "features/distributed_inferencing" %}}), distributed mode is designed for production deployments and Kubernetes environments where you need centralized management, health monitoring, and deterministic routing.
Distributed mode enables horizontal scaling of LocalAI across multiple machines using **PostgreSQL** for state and node registry, and **NATS** for real-time coordination. Unlike the [P2P/federation approach](/features/distribute/), distributed mode is designed for production deployments and Kubernetes environments where you need centralized management, health monitoring, and deterministic routing.
{{% notice note %}}
Distributed mode requires authentication enabled with a **PostgreSQL** database — SQLite is not supported. This is because the node registry, job store, and other distributed state are stored in PostgreSQL tables.

View File

@@ -1,12 +1,12 @@
+++
disableToc = false
title = "P2P / Federated Inference"
title = "🆕🖧 Distributed Inference"
weight = 15
url = "/features/distribute/"
+++
{{% notice tip %}}
Looking for production-grade horizontal scaling with PostgreSQL and NATS? See [Distributed Mode]({{% relref "features/distributed-mode" %}}).
Looking for production-grade horizontal scaling with PostgreSQL and NATS? See [Distributed Mode](/features/distributed-mode/).
{{% /notice %}}
This functionality enables LocalAI to distribute inference requests across multiple worker nodes, improving efficiency and performance. Nodes are automatically discovered and connect via p2p by using a shared token which makes sure the communication is secure and private between the nodes of the network.

View File

@@ -1,34 +0,0 @@
+++
disableToc = false
title = "Distribution"
weight = 13
url = "/features/distribution/"
+++
LocalAI supports distributing inference workloads across multiple machines. There are two approaches, each suited to different use cases:
## Distributed Mode (PostgreSQL + NATS)
Production-grade horizontal scaling with centralized management. Frontends are stateless LocalAI instances behind a load balancer; workers self-register and receive backends dynamically via NATS. State lives in PostgreSQL.
**Best for:** production deployments, Kubernetes, managed infrastructure.
[Read more]({{% relref "features/distributed-mode" %}})
## P2P / Federated Inference
Peer-to-peer networking via libp2p. Share a token to form a cluster with automatic discovery — no central server required. Supports federated load balancing and worker-mode weight sharding.
**Best for:** ad-hoc clusters, community sharing, quick experimentation.
[Read more]({{% relref "features/distributed_inferencing" %}})
## Quick Comparison
| | P2P / Federation | Distributed Mode |
|---|---|---|
| **Discovery** | Automatic via libp2p token | Self-registration to frontend URL |
| **State storage** | In-memory / ledger | PostgreSQL |
| **Coordination** | Gossip protocol | NATS messaging |
| **Node management** | Automatic | REST API + WebUI |
| **Setup complexity** | Minimal (share a token) | Requires PostgreSQL + NATS |

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "Embeddings"
title = "🧠 Embeddings"
weight = 13
url = "/features/embeddings/"
+++

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "GPT Vision"
title = "🥽 GPT Vision"
weight = 14
url = "/features/gpt-vision/"
+++

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "Image Generation"
title = "🎨 Image generation"
weight = 12
url = "/features/image-generation/"
+++

View File

@@ -1,5 +1,5 @@
+++
title = "Model Context Protocol (MCP)"
title = "🔗 Model Context Protocol (MCP)"
weight = 20
toc = true
description = "Agentic capabilities with Model Context Protocol integration"

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "Model Gallery"
title = "🖼️ Model gallery"
weight = 18
url = '/models'
+++

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "Object Detection"
title = "🔍 Object detection"
weight = 13
url = "/features/object-detection/"
+++

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "OpenAI Functions and Tools"
title = "🔥 OpenAI functions and tools"
weight = 17
url = "/features/openai-functions/"
+++

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "Reranker"
title = "📈 Reranker"
weight = 11
url = "/features/reranker/"
+++

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "Runtime Settings"
title = "⚙️ Runtime Settings"
weight = 25
url = '/features/runtime-settings'
+++

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "Stores"
title = "💾 Stores"
weight = 18
url = '/stores'
+++

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "Text Generation (GPT)"
title = "📖 Text generation (GPT)"
weight = 10
url = "/features/text-generation/"
+++

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "Text to Audio (TTS)"
title = "🗣 Text to audio (TTS)"
weight = 11
url = "/features/text-to-audio/"
+++

View File

@@ -119,7 +119,7 @@ For production deployments or when you need more compute, LocalAI supports distr
- **P2P federation**: Connect multiple LocalAI instances for load-balanced inference
- **Model sharding**: Split large models across multiple machines
See the **Nodes** page in the web interface or the [Distribution docs]({{% relref "features/distribution" %}}) for setup instructions.
See the **Nodes** page in the web interface or the [Distribution docs]({{% relref "features/distribute" %}}) for setup instructions.
## What's Next?

View File

@@ -72,10 +72,9 @@ Feel free to open up a Pull request (by clicking at the "Edit page" below) to ge
### Home Automation
- [Extended OpenAI Conversation](https://github.com/jekalmin/extended_openai_conversation) — Conversation agent for Home Assistant that supports a custom OpenAI endpoint
- [LLM Vision](https://github.com/valentinfrlch/ha-llmvision) — Image & video feed analysis for Home Assistant
- [OpenAI TTS Speech Service](https://github.com/sfortis/openai_tts) — OpenAI TTS custom component for Home Assistant
- [LocalAI Monitor](https://github.com/loryanstrant/HA-LocalAI-Monitor) — Monitor & control of LocalAI from Home Assistant
- [hass-openai-custom-conversation](https://github.com/drndos/hass-openai-custom-conversation) — Home Assistant integration
- [ha-llmvision](https://github.com/valentinfrlch/ha-llmvision) — Home Assistant LLM Vision
- [HA-LocalAI-Monitor](https://github.com/loryanstrant/HA-LocalAI-Monitor) — Home Assistant monitoring
- Nextcloud [integration plugin](https://apps.nextcloud.com/apps/integration_openai) and [AI assistant](https://apps.nextcloud.com/apps/assistant)
### Automation & DevOps

View File

@@ -1,38 +1,4 @@
---
- name: "qwen3.5-35b-a3b-apex"
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
urls:
- https://huggingface.co/mudler/Qwen3.5-35B-A3B-APEX-GGUF
description: |
Describe the model in a clear and concise way that can be shared in a model gallery.
overrides:
backend: llama-cpp
function:
automatic_tool_parsing_fallback: true
grammar:
disable: true
known_usecases:
- chat
mmproj: llama-cpp/mmproj/Qwen3.5-35B-A3B-APEX-GGUF/mmproj-F16.gguf
options:
- use_jinja:true
parameters:
min_p: 0
model: llama-cpp/models/Qwen3.5-35B-A3B-APEX-GGUF/Qwen3.5-35B-A3B-APEX-Quality.gguf
presence_penalty: 1.5
repeat_penalty: 1
temperature: 0.7
top_k: 20
top_p: 0.8
template:
use_tokenizer_template: true
files:
- filename: llama-cpp/mmproj/Qwen3.5-35B-A3B-APEX-GGUF/mmproj-F16.gguf
sha256: a516ab92e8240da4734d68352bdfba84c16e830ee40010b8fac80d69c77272ff
uri: https://huggingface.co/mudler/Qwen3.5-35B-A3B-APEX-GGUF/resolve/main/mmproj-F16.gguf
- filename: llama-cpp/models/Qwen3.5-35B-A3B-APEX-GGUF/Qwen3.5-35B-A3B-APEX-Quality.gguf
sha256: 50887b60c77ee5c95bc3657814ae993abcab7b2d71868b9af1e84d6badd09a57
uri: https://huggingface.co/mudler/Qwen3.5-35B-A3B-APEX-GGUF/resolve/main/Qwen3.5-35B-A3B-APEX-Quality.gguf
- name: "qwen_qwen3.5-35b-a3b"
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
urls:

View File

@@ -0,0 +1,151 @@
#!/bin/bash
# apply_patches.sh — Generic patch fetcher and applier for any backend.
#
# Usage: ./apply_patches.sh <source-dir> <target-dir>
#
# <source-dir> Directory containing a patches/ folder (with optional sources.yaml)
# <target-dir> The cloned upstream repo to patch (e.g., llama.cpp/)
#
# Behavior (idempotent):
# 1. If patches/sources.yaml exists and yq is available, for each source:
# - If patches/<name>/ already has .patch files: skip fetching (vendored)
# - Otherwise: clone the fork at a pinned SHA, diff against the pinned
# upstream SHA, and generate patches
# 2. Apply all patches (skips already-applied ones)
# 3. Fails fast on any patch application error
#
# sources.yaml fields:
# name — subdirectory name for this source's patches
# repo — fork git URL
# version_var — Makefile variable holding the pinned fork commit SHA
# base_var — Makefile variable holding the pinned upstream commit SHA
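# version_file — Makefile path (relative to <source-dir>, i.e. the backend dir)
# upstream_repo — (optional) upstream git URL, fetched when the base commit is missing from the fork clone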
# Exit on the first unhandled command failure (errexit).
set -e
# Use /tmp for patch temp files to avoid macOS long-path issues.
# TMPDIR_OVERRIDE, when set and non-empty, replaces the /tmp default; any TMPDIR
# inherited from the caller is overwritten either way.
export TMPDIR="${TMPDIR_OVERRIDE:-/tmp}"
# read_makefile_var NAME FILE
#
# Print the value of the first `NAME?=value` assignment found in FILE.
#   NAME  Makefile variable name (used as a grep pattern prefix)
#   FILE  path of the Makefile to scan
#
# Everything after the first '=' is returned, so values that themselves contain
# '=' are not truncated. Surrounding whitespace (including a trailing CR from
# CRLF files) is stripped, and whitespace between NAME and '?=' is tolerated.
# Prints nothing and returns 0 when the variable is not found; callers detect
# that case by testing for an empty result.
read_makefile_var() {
    local var_name="$1"
    local makefile="$2"
    local line value

    # In a basic regex '?' is a literal character, so this matches "NAME?=".
    line=$(grep -m1 "^${var_name}[[:space:]]*?=" "$makefile") || return 0

    value="${line#*=}"
    # Trim leading, then trailing, whitespace without spawning a subprocess.
    value="${value#"${value%%[![:space:]]*}"}"
    value="${value%"${value##*[![:space:]]}"}"
    printf '%s\n' "$value"
}
# apply_one_patch TARGET_DIR PATCH_FILE LABEL
#
# Apply PATCH_FILE (strip level 1) inside TARGET_DIR unless it is already there.
#   TARGET_DIR  directory handed to `patch -d`
#   PATCH_FILE  patch to apply, fed to `patch` on stdin
#   LABEL       short name used in progress messages
#
# A patch counts as "already applied" when a reverse dry run succeeds. Returns 0
# after applying or skipping; terminates the whole script with status 1 when the
# forward application fails.
apply_one_patch() {
    local dest="$1" patchfile="$2" tag="$3"

    if ! patch -d "$dest" -p1 --reverse --dry-run < "$patchfile" >/dev/null 2>&1; then
        # The reverse dry run failed, so the patch is not in the tree yet.
        echo " Applying: $tag"
        if patch -d "$dest" -p1 --forward < "$patchfile"; then
            return 0
        fi
        echo "FAILED: $patchfile"
        exit 1
    fi

    echo " Already applied, skipping: $tag"
    return 0
}
# _format_fork_patches NAME FORK_DIR FORK_SHA BASE_SHA UPSTREAM_URL OUT_DIR
#
# Inside the clone at FORK_DIR, check out FORK_SHA and write one patch per
# commit in BASE_SHA..FORK_SHA into OUT_DIR. Returns non-zero when a required
# git step fails. Every git call uses `git -C`, so the caller's working
# directory is never changed.
_format_fork_patches() {
    local name="$1" fork_dir="$2" fork_sha="$3" base_sha="$4" upstream_url="$5" out_dir="$6"
    local patch_count

    # The pinned fork commit may not be reachable from the cloned refs, so try
    # fetching it by SHA. Failure is tolerated: the checkout below is the real test.
    git -C "$fork_dir" fetch origin "$fork_sha" 2>&1 || true
    git -C "$fork_dir" checkout "$fork_sha" 2>&1 || return 1

    # The base commit must be present in the object store to compute the range.
    # If it is missing from the fork clone, fetch it from the configured upstream.
    if ! git -C "$fork_dir" cat-file -e "$base_sha" 2>/dev/null; then
        echo " Base commit not in fork history — fetching from upstream"
        if [ -n "$upstream_url" ] && [ "$upstream_url" != "null" ]; then
            git -C "$fork_dir" remote add upstream "$upstream_url" 2>/dev/null || true
            git -C "$fork_dir" fetch upstream 2>&1 || return 1
        else
            echo "WARNING: no upstream_repo configured for '$name' — base commit ${base_sha:0:12} unavailable"
        fi
    fi

    # An unresolvable range counts as zero commits, so no patches are written.
    patch_count=$(git -C "$fork_dir" rev-list --count "$base_sha".."$fork_sha" 2>/dev/null || echo "0")
    echo " $patch_count commits in diff"
    if [ "$patch_count" -gt 0 ]; then
        mkdir -p "$out_dir" || return 1
        git -C "$fork_dir" format-patch "$base_sha".."$fork_sha" -o "$out_dir/" >/dev/null 2>&1 || return 1
        echo " Generated $patch_count patches in patches/$name/"
    fi
    return 0
}

# _generate_source_patches NAME REPO FORK_SHA BASE_SHA UPSTREAM_URL OUT_DIR
#
# Clone REPO into a temporary directory, generate the patches, and always remove
# the temporary clone afterwards. A failed clone is only a warning (returns 0);
# a failure after a successful clone returns non-zero.
_generate_source_patches() {
    local name="$1" repo="$2" fork_sha="$3" base_sha="$4" upstream_url="$5" out_dir="$6"
    local clone_root status=0

    clone_root=$(mktemp -d) || return 1

    if git clone "$repo" "$clone_root/fork" 2>&1; then
        _format_fork_patches "$name" "$clone_root/fork" "$fork_sha" "$base_sha" "$upstream_url" "$out_dir" || status=1
    else
        echo "WARNING: Failed to clone $repo — skipping source '$name'"
    fi

    rm -rf "$clone_root"
    return "$status"
}

# apply_patches SOURCE_DIR TARGET_DIR
#
# Generate any missing patch sets described by SOURCE_DIR/patches/sources.yaml,
# then apply every patch under SOURCE_DIR/patches to TARGET_DIR.
#   SOURCE_DIR  directory containing the patches/ folder
#   TARGET_DIR  checkout to patch; may be relative to the caller's cwd
#
# Returns 0 when there is no patches/ directory. Terminates the script with
# status 1 when patch generation or application fails.
apply_patches() {
    local SOURCE_DIR TARGET_DIR PATCHES_DIR

    # Declared separately from the assignment so that a failing `cd` is not
    # masked by the exit status of `local`.
    if ! SOURCE_DIR="$(cd "$1" && pwd)"; then
        echo "WARNING: source directory '$1' not found — nothing to patch."
        return 0
    fi
    TARGET_DIR="$2"
    PATCHES_DIR="$SOURCE_DIR/patches"

    if [ ! -d "$PATCHES_DIR" ]; then
        return 0
    fi

    # Phase 1: Generate missing patches from fork sources
    local SOURCES_FILE="$PATCHES_DIR/sources.yaml"
    if [ -f "$SOURCES_FILE" ] && command -v yq &>/dev/null; then
        local SOURCE_COUNT i
        SOURCE_COUNT=$(yq '.sources | length' "$SOURCES_FILE")
        # Arithmetic loop instead of `seq 0 N-1`: BSD/macOS seq counts down
        # when the end is below the start, which would run the body for an
        # empty source list.
        for ((i = 0; i < SOURCE_COUNT; i++)); do
            local NAME REPO VERSION_VAR BASE_VAR VERSION_FILE UPSTREAM_URL
            NAME=$(yq ".sources[$i].name" "$SOURCES_FILE")
            REPO=$(yq ".sources[$i].repo" "$SOURCES_FILE")
            VERSION_VAR=$(yq ".sources[$i].version_var" "$SOURCES_FILE")
            BASE_VAR=$(yq ".sources[$i].base_var" "$SOURCES_FILE")
            VERSION_FILE=$(yq ".sources[$i].version_file" "$SOURCES_FILE")
            UPSTREAM_URL=$(yq ".sources[$i].upstream_repo // \"\"" "$SOURCES_FILE")

            local MAKEFILE="$SOURCE_DIR/$VERSION_FILE"
            local FORK_SHA BASE_SHA
            FORK_SHA=$(read_makefile_var "$VERSION_VAR" "$MAKEFILE")
            BASE_SHA=$(read_makefile_var "$BASE_VAR" "$MAKEFILE")
            if [ -z "$FORK_SHA" ] || [ -z "$BASE_SHA" ]; then
                echo "WARNING: Could not read $VERSION_VAR or $BASE_VAR from $MAKEFILE — skipping '$NAME'"
                continue
            fi

            # Count vendored patches with a glob so paths containing spaces work.
            local SOURCE_PATCH_DIR="$PATCHES_DIR/$NAME"
            local EXISTING=0 existing_patch
            for existing_patch in "$SOURCE_PATCH_DIR"/*.patch; do
                if [ -e "$existing_patch" ]; then
                    EXISTING=$((EXISTING + 1))
                fi
            done

            if [ "$EXISTING" -gt 0 ]; then
                echo "Patches [$NAME]: $EXISTING patches already present — skipping fetch."
            else
                echo "Patches [$NAME]: generating from $REPO"
                echo " base (upstream): ${BASE_SHA:0:12}"
                echo " head (fork): ${FORK_SHA:0:12}"
                if ! _generate_source_patches "$NAME" "$REPO" "$FORK_SHA" "$BASE_SHA" "$UPSTREAM_URL" "$SOURCE_PATCH_DIR"; then
                    echo "FAILED: could not generate patches for '$NAME' from $REPO"
                    exit 1
                fi
            fi
        done
    elif [ -f "$SOURCES_FILE" ]; then
        echo "WARNING: yq not found — skipping source-based patch generation."
    fi

    # Phase 2: Apply patches (subdirectories first, then top-level).
    # Directories are read line by line and patches are expanded with globs so
    # that paths containing spaces are not word-split.
    local source_dir p
    while IFS= read -r source_dir; do
        for p in "$source_dir"/*.patch; do
            [ -e "$p" ] || continue
            apply_one_patch "$TARGET_DIR" "$p" "$(basename "$source_dir")/$(basename "$p")"
        done
    done < <(find "$PATCHES_DIR" -mindepth 1 -maxdepth 1 -type d | sort)

    for p in "$PATCHES_DIR"/*.patch; do
        [ -e "$p" ] || continue
        apply_one_patch "$TARGET_DIR" "$p" "$(basename "$p")"
    done
}
# Entry point: both <source-dir> and <target-dir> are required. With fewer than
# two arguments, print the usage line and exit with status 1.
if [ $# -ge 2 ]; then
    apply_patches "$1" "$2"
else
    echo "Usage: $0 <source-dir> <target-dir>"
    exit 1
fi