Compare commits

..

11 Commits

Author SHA1 Message Date
Ettore Di Giacinto
6e11f882f7 feat(turboquant.cpp): add new backend
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-04-03 20:57:15 +00:00
Ettore Di Giacinto
8577bdcebc Update asset links in README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2026-04-03 10:24:08 +02:00
Ettore Di Giacinto
0d489c7a0d Add guided tour and update screenshots section
Updated README to include a guided tour section with links to various assets and details about agents and usage metrics.

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2026-04-03 10:23:03 +02:00
Ettore Di Giacinto
11dc54bda9 fix(docs): commit distribution.md
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-04-03 10:14:13 +02:00
Ettore Di Giacinto
7e0b73deaa fix(docs): fix broken references to distributed mode
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-04-03 09:46:06 +02:00
LocalAI [bot]
c0a023d13d chore: ⬆️ Update ggml-org/llama.cpp to a1cfb645307edc61a89e41557f290f441043d3c2 (#9203)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-04-03 08:30:15 +02:00
Loryan Strant
0d3ae1c295 docs: Update Home Assistant integrations list (#9206)
Update Home Assistant integrations list

Signed-off-by: Loryan Strant <51473494+loryanstrant@users.noreply.github.com>
2026-04-03 08:30:00 +02:00
LocalAI [bot]
e9f10f2f50 chore(model gallery): 🤖 add 1 new models via gallery agent (#9202)
chore(model gallery): 🤖 add new models via gallery agent

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-04-02 21:22:19 +02:00
Ettore Di Giacinto
b95b0b72ff chore(ci): fix gallery agent
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-04-02 18:02:18 +00:00
LocalAI [bot]
26f1b94f4d chore: ⬆️ Update ggml-org/llama.cpp to 95a6ebabb277c4cc18247e7bc2a5502133caca63 (#9199)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-04-02 08:53:16 +02:00
LocalAI [bot]
2d40725ca2 chore: ⬆️ Update leejet/stable-diffusion.cpp to 87ecb95cbc65dc8e58e3d88f4f4a59a0939796f5 (#9200)
⬆️ Update leejet/stable-diffusion.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-04-02 08:53:04 +02:00
45 changed files with 436 additions and 270 deletions

View File

@@ -133,6 +133,7 @@ func getRealReadme(ctx context.Context, repository string) (string, error) {
result, err := cogito.ExecuteTools(llm, fragment,
cogito.WithIterations(3),
cogito.WithMaxAttempts(3),
cogito.DisableSinkState,
cogito.WithTools(&HFReadmeTool{client: hfapi.NewClient()}))
if err != nil {
return "", err

View File

@@ -79,7 +79,20 @@ func generateYAMLEntry(model ProcessedModel, quantization string) string {
description = cleanTextContent(description)
formattedDescription := formatTextContent(description)
configFile := formatTextContent(modelConfig.ConfigFile)
// Strip name and description from config file since they are
// already present at the gallery entry level and should not
// appear under overrides.
configFileContent := modelConfig.ConfigFile
var cfgMap map[string]any
if err := yaml.Unmarshal([]byte(configFileContent), &cfgMap); err == nil {
delete(cfgMap, "name")
delete(cfgMap, "description")
if cleaned, err := yaml.Marshal(cfgMap); err == nil {
configFileContent = string(cleaned)
}
}
configFile := formatTextContent(configFileContent)
filesYAML, _ := yaml.Marshal(modelConfig.Files)

View File

@@ -17,7 +17,7 @@ func runSyntheticMode() error {
fmt.Printf("Generating %d synthetic models for testing...\n", numModels)
var models []ProcessedModel
for i := range numModels {
for range numModels {
model := generator.GenerateProcessedModel()
models = append(models, model)
fmt.Printf("Generated synthetic model: %s\n", model.ModelID)

View File

@@ -1828,6 +1828,98 @@ jobs:
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
# llama-cpp-tq (TurboQuant fork)
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64,linux/arm64'
tag-latest: 'auto'
tag-suffix: '-cpu-llama-cpp-tq'
runs-on: 'bigger-runner'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-llama-cpp-tq'
runs-on: 'bigger-runner'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-13-llama-cpp-tq'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'false'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-cuda-13-arm64-llama-cpp-tq'
base-image: "ubuntu:24.04"
runs-on: 'ubuntu-24.04-arm'
ubuntu-version: '2404'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-rocm-hipblas-llama-cpp-tq'
runs-on: 'ubuntu-latest'
base-image: "rocm/dev-ubuntu-24.04:6.4.4"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'false'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-arm64-llama-cpp-tq'
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
runs-on: 'ubuntu-24.04-arm'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2204'
- build-type: 'vulkan'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64,linux/arm64'
tag-latest: 'auto'
tag-suffix: '-gpu-vulkan-llama-cpp-tq'
runs-on: 'bigger-runner'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
# Stablediffusion-ggml
- build-type: ''
cuda-major-version: ""

View File

@@ -15,9 +15,10 @@ jobs:
branch: "master"
file: "backend/cpp/llama-cpp/Makefile"
- repository: "TheTom/llama-cpp-turboquant"
variable: "TURBOQUANT_VERSION"
branch: "feature/turboquant-kv-cache"
file: "backend/cpp/llama-cpp/Makefile"
variable: "LLAMA_VERSION"
branch: "master"
file: "backend/cpp/llama-cpp-tq/Makefile"
branch_suffix: "-tq"
- repository: "ggml-org/whisper.cpp"
variable: "WHISPER_CPP_VERSION"
branch: "master"
@@ -64,6 +65,9 @@ jobs:
push-to-fork: ci-forks/LocalAI
commit-message: ':arrow_up: Update ${{ matrix.repository }}'
title: 'chore: :arrow_up: Update ${{ matrix.repository }} to `${{ steps.bump.outputs.commit }}`'
branch: "update/${{ matrix.variable }}"
branch: "update/${{ matrix.variable }}${{ matrix.branch_suffix }}"
body: ${{ steps.bump.outputs.message }}
signoff: true

View File

@@ -55,7 +55,7 @@ jobs:
- name: Run gallery agent
env:
#OPENAI_MODEL: ${{ secrets.OPENAI_MODEL }}
OPENAI_MODE: Qwen3.5-2B-GGUF
OPENAI_MODEL: Qwen3.5-2B-GGUF
OPENAI_BASE_URL: "http://localhost:8080"
OPENAI_KEY: ${{ secrets.OPENAI_KEY }}
#OPENAI_BASE_URL: ${{ secrets.OPENAI_BASE_URL }}

1
.gitignore vendored
View File

@@ -9,6 +9,7 @@ prepare-sources
/backend/cpp/llama-cpp/llama.cpp
/backend/cpp/llama-*
!backend/cpp/llama-cpp
!backend/cpp/llama-cpp-tq
/backends
/backend-images
/result.yaml

View File

@@ -544,8 +544,9 @@ backend-images:
mkdir -p backend-images
# Backend metadata: BACKEND_NAME | DOCKERFILE_TYPE | BUILD_CONTEXT | PROGRESS_FLAG | NEEDS_BACKEND_ARG
# llama-cpp is special - uses llama-cpp Dockerfile and doesn't need BACKEND arg
# llama-cpp and forks - use llama-cpp Dockerfile
BACKEND_LLAMA_CPP = llama-cpp|llama-cpp|.|false|false
BACKEND_LLAMA_CPP_TQ = llama-cpp-tq|llama-cpp|.|false|true
# Golang backends
BACKEND_PIPER = piper|golang|.|false|true
@@ -609,6 +610,7 @@ endef
# Generate all docker-build targets
$(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP)))
$(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP_TQ)))
$(eval $(call generate-docker-build-target,$(BACKEND_PIPER)))
$(eval $(call generate-docker-build-target,$(BACKEND_LOCAL_STORE)))
$(eval $(call generate-docker-build-target,$(BACKEND_HUGGINGFACE)))

View File

@@ -42,16 +42,38 @@ Created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
> [:book: Documentation](https://localai.io/) | [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) | [💻 Quickstart](https://localai.io/basics/getting_started/) | [🖼️ Models](https://models.localai.io/) | [❓FAQ](https://localai.io/faq/)
## Screenshots
### Chat, Model gallery
## Guided tour
https://github.com/user-attachments/assets/08cbb692-57da-48f7-963d-2e7b43883c18
### Agents
<details>
<summary>
Click to see more!
</summary>
#### User and auth
https://github.com/user-attachments/assets/228fa9ad-81a3-4d43-bfb9-31557e14a36c
#### Agents
https://github.com/user-attachments/assets/6270b331-e21d-4087-a540-6290006b381a
#### Usage metrics per user
https://github.com/user-attachments/assets/cbb03379-23b4-4e3d-bd26-d152f057007f
#### Fine-tuning and Quantization
https://github.com/user-attachments/assets/5ba4ace9-d3df-4795-b7d4-b0b404ea71ee
#### WebRTC
https://github.com/user-attachments/assets/ed88e34c-fed3-4b83-8a67-4716a9feeb7b
</details>
## Quickstart
### macOS

View File

@@ -58,7 +58,9 @@ ARG CUDA_DOCKER_ARCH
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
ARG CMAKE_ARGS
ENV CMAKE_ARGS=${CMAKE_ARGS}
ARG BACKEND=rerankers
ARG BACKEND=llama-cpp
ARG LLAMA_BACKEND_DIR=${BACKEND}
ENV LLAMA_BACKEND_DIR=${LLAMA_BACKEND_DIR}
ARG BUILD_TYPE
ENV BUILD_TYPE=${BUILD_TYPE}
ARG CUDA_MAJOR_VERSION
@@ -255,32 +257,27 @@ if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
CUDA_ARCH_ESC="${CUDA_DOCKER_ARCH//;/\\;}"
export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
echo "CMAKE_ARGS(env) = ${CMAKE_ARGS}"
rm -rf /LocalAI/backend/cpp/llama-cpp-*-build
rm -rf /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR}-*-build
fi
cd /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR}
if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
cd /LocalAI/backend/cpp/llama-cpp
make llama-cpp-fallback
make llama-cpp-grpc
make llama-cpp-rpc-server
make ARCH=aarch64 build-variants
else
cd /LocalAI/backend/cpp/llama-cpp
make llama-cpp-avx
make llama-cpp-avx2
make llama-cpp-avx512
make llama-cpp-fallback
make llama-cpp-grpc
make llama-cpp-rpc-server
make build-variants
fi
EOT
# Copy libraries using a script to handle architecture differences
RUN make -BC /LocalAI/backend/cpp/llama-cpp package
RUN make -BC /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR} package
FROM scratch
ARG BACKEND=llama-cpp
ARG LLAMA_BACKEND_DIR=${BACKEND}
# Copy all available binaries (the build process only creates the appropriate ones for the target architecture)
COPY --from=builder /LocalAI/backend/cpp/llama-cpp/package/. ./
COPY --from=builder /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR}/package/. ./

View File

@@ -0,0 +1,6 @@
LLAMA_VERSION?=master
LLAMA_REPO?=https://github.com/TheTom/llama-cpp-turboquant
BACKEND_NAME?=llama-cpp-tq
SHARED_DIR?=$(CURDIR)/../llama-cpp
include ../llama-cpp/Makefile

View File

@@ -59,6 +59,11 @@ add_library(hw_grpc_proto
add_executable(${TARGET} grpc-server.cpp json.hpp httplib.h)
# Enable autoparser support if the header exists (not present in all llama.cpp forks)
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/chat-auto-parser.h")
target_compile_definitions(${TARGET} PRIVATE HAS_AUTOPARSER)
endif()
target_include_directories(${TARGET} PRIVATE ../llava)
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})

View File

@@ -1,8 +1,10 @@
LLAMA_VERSION?=0fcb3760b2b9a3a496ef14621a7e4dad7a8df90f
LLAMA_VERSION?=a1cfb645307edc61a89e41557f290f441043d3c2
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
TURBOQUANT_VERSION?=8ad0f00e9a38df6c29fc10363341dde300f92ae4
BACKEND_NAME?=llama-cpp
SHARED_DIR?=$(CURDIR)
GRPC_SERVER_DIR?=tools/grpc-server
SERVER_SOURCE_DIR?=tools/server
CMAKE_ARGS?=
BUILD_TYPE?=
@@ -69,6 +71,17 @@ ifeq ($(BUILD_TYPE),sycl_f32)
-DCMAKE_CXX_FLAGS="-fsycl"
endif
# Variants to build for each architecture (can be overridden by forks)
X86_64_VARIANTS ?= llama-cpp-avx llama-cpp-avx2 llama-cpp-avx512 llama-cpp-fallback llama-cpp-grpc llama-cpp-rpc-server
ARM64_VARIANTS ?= llama-cpp-fallback llama-cpp-grpc llama-cpp-rpc-server
build-variants:
ifeq ($(ARCH),aarch64)
@for v in $(ARM64_VARIANTS); do $(MAKE) $$v || exit 1; done
else
@for v in $(X86_64_VARIANTS); do $(MAKE) $$v || exit 1; done
endif
INSTALLED_PACKAGES=$(CURDIR)/../grpc/installed_packages
INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
@@ -92,42 +105,42 @@ else
endif
llama-cpp-avx2: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build purge
$(info ${GREEN}I llama-cpp build info:avx2${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx2-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build/grpc-server llama-cpp-avx2
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="$(BACKEND_NAME)-avx2-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build/grpc-server llama-cpp-avx2
llama-cpp-avx512: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build purge
$(info ${GREEN}I llama-cpp build info:avx512${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx512-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build/grpc-server llama-cpp-avx512
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="$(BACKEND_NAME)-avx512-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build/grpc-server llama-cpp-avx512
llama-cpp-avx: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build purge
$(info ${GREEN}I llama-cpp build info:avx${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-avx-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build/grpc-server llama-cpp-avx
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="$(BACKEND_NAME)-avx-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build/grpc-server llama-cpp-avx
llama-cpp-fallback: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build purge
$(info ${GREEN}I llama-cpp build info:fallback${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-fallback-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build/grpc-server llama-cpp-fallback
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="$(BACKEND_NAME)-fallback-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build/grpc-server llama-cpp-fallback
llama-cpp-grpc: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build purge
$(info ${GREEN}I llama-cpp build info:grpc${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-cpp-grpc-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/grpc-server llama-cpp-grpc
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="$(BACKEND_NAME)-grpc-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build/grpc-server llama-cpp-grpc
llama-cpp-rpc-server: llama-cpp-grpc
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server
llama.cpp:
mkdir -p llama.cpp
@@ -135,30 +148,30 @@ llama.cpp:
git init && \
git remote add origin $(LLAMA_REPO) && \
git fetch origin && \
git checkout -b build $(LLAMA_VERSION) && \
(git checkout -b build $(LLAMA_VERSION) || git checkout -b build origin/$(LLAMA_VERSION)) && \
git submodule update --init --recursive --depth 1 --single-branch
llama.cpp/tools/grpc-server: llama.cpp
mkdir -p llama.cpp/tools/grpc-server
bash prepare.sh
llama.cpp/$(GRPC_SERVER_DIR): llama.cpp
mkdir -p llama.cpp/$(GRPC_SERVER_DIR)
SHARED_DIR=$(SHARED_DIR) SERVER_SOURCE_DIR=$(SERVER_SOURCE_DIR) GRPC_SERVER_DIR=$(GRPC_SERVER_DIR) bash $(SHARED_DIR)/prepare.sh
rebuild:
bash prepare.sh
SHARED_DIR=$(SHARED_DIR) SERVER_SOURCE_DIR=$(SERVER_SOURCE_DIR) GRPC_SERVER_DIR=$(GRPC_SERVER_DIR) bash $(SHARED_DIR)/prepare.sh
rm -rf grpc-server
$(MAKE) grpc-server
package:
bash package.sh
bash $(SHARED_DIR)/package.sh
purge:
rm -rf llama.cpp/build
rm -rf llama.cpp/tools/grpc-server
rm -rf llama.cpp/$(GRPC_SERVER_DIR)
rm -rf grpc-server
clean: purge
rm -rf llama.cpp
grpc-server: llama.cpp llama.cpp/tools/grpc-server
grpc-server: llama.cpp llama.cpp/$(GRPC_SERVER_DIR)
@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+bash -c "source $(ONEAPI_VARS); \

View File

@@ -17,7 +17,9 @@
#include "backend.pb.h"
#include "backend.grpc.pb.h"
#include "common.h"
#ifdef HAS_AUTOPARSER
#include "chat-auto-parser.h"
#endif
#include <getopt.h>
#include <grpcpp/ext/proto_server_reflection_plugin.h>
#include <grpcpp/grpcpp.h>
@@ -2665,6 +2667,7 @@ public:
response->set_rendered_template(rendered_template);
#ifdef HAS_AUTOPARSER
// Run differential template analysis to detect tool format markers
if (params_base.use_jinja) {
try {
@@ -2770,6 +2773,7 @@ public:
SRV_WRN("ModelMetadata: failed to run autoparser analysis: %s\n", e.what());
}
}
#endif
return grpc::Status::OK;
}

View File

@@ -5,14 +5,21 @@
set -e
CURDIR=$(dirname "$(realpath $0)")
REPO_ROOT="${CURDIR}/../../.."
# Use working directory (not script location) so forks that share this script work correctly
CURDIR=$(pwd)
SCRIPT_DIR=$(dirname "$(realpath $0)")
REPO_ROOT="${SCRIPT_DIR}/../../.."
# Create lib directory
mkdir -p $CURDIR/package/lib
cp -avrf $CURDIR/llama-cpp-* $CURDIR/package/
cp -rfv $CURDIR/run.sh $CURDIR/package/
# Copy run.sh — prefer local copy, fall back to shared dir (script location)
if [ -f "$CURDIR/run.sh" ]; then
cp -rfv $CURDIR/run.sh $CURDIR/package/
else
cp -rfv $SCRIPT_DIR/run.sh $CURDIR/package/
fi
# Detect architecture and copy appropriate libraries
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then

View File

@@ -1,14 +0,0 @@
# Patch sources for the llama-cpp backend.
# Each source declares a fork whose commits are extracted as patches
# and applied on top of upstream llama.cpp during the build.
# See scripts/patch_utils/apply_patches.sh for the generic patch engine.
#
# version_var: Makefile variable with the pinned fork commit SHA
# base_var: Makefile variable with the upstream base commit SHA
# Both are read from version_file (relative to backend dir) to compute the diff.
sources:
- name: turboquant
repo: https://github.com/TheTom/llama-cpp-turboquant.git
version_var: TURBOQUANT_VERSION
base_var: LLAMA_VERSION
version_file: Makefile

View File

@@ -1,26 +1,43 @@
#!/bin/bash
SHARED_DIR="${SHARED_DIR:-.}"
SERVER_SOURCE_DIR="${SERVER_SOURCE_DIR:-tools/server}"
GRPC_SERVER_DIR="${GRPC_SERVER_DIR:-tools/grpc-server}"
## Apply patches from the `patches` directory
if [ -d "patches" ]; then
for patch in $(ls patches); do
echo "Applying patch $patch"
patch -d llama.cpp/ -p1 < patches/$patch
done
fi
set -e
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
REPO_ROOT="$SCRIPT_DIR/../../.."
## Apply patches from sources and/or local .patch files
"$REPO_ROOT/scripts/patch_utils/apply_patches.sh" "$SCRIPT_DIR" llama.cpp
## Copy server files into grpc-server build directory
for file in $(ls llama.cpp/tools/server/); do
cp -rfv llama.cpp/tools/server/$file llama.cpp/tools/grpc-server/
# Copy server source files into grpc-server build directory
for file in $(ls llama.cpp/${SERVER_SOURCE_DIR}/); do
cp -rfv llama.cpp/${SERVER_SOURCE_DIR}/$file llama.cpp/${GRPC_SERVER_DIR}/
done
cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/tools/grpc-server/
cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/tools/grpc-server/
# Copy build files — prefer local overrides, fall back to SHARED_DIR
for f in CMakeLists.txt grpc-server.cpp; do
if [ -f "$f" ]; then
cp -r "$f" llama.cpp/${GRPC_SERVER_DIR}/
else
cp -r "$SHARED_DIR/$f" llama.cpp/${GRPC_SERVER_DIR}/
fi
done
cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/${GRPC_SERVER_DIR}/
cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/${GRPC_SERVER_DIR}/
# Add grpc-server subdirectory to the parent CMakeLists.txt
PARENT_CMAKELISTS="llama.cpp/$(dirname ${GRPC_SERVER_DIR})/CMakeLists.txt"
set +e
if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then
if grep -q "grpc-server" "$PARENT_CMAKELISTS"; then
echo "grpc-server already added"
else
echo "add_subdirectory(grpc-server)" >> llama.cpp/tools/CMakeLists.txt
echo "add_subdirectory(grpc-server)" >> "$PARENT_CMAKELISTS"
fi
set -e

View File

@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
# stablediffusion.cpp (ggml)
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=09b12d5f6d51d862749e8e0ee8baac8f012089e2
STABLEDIFFUSION_GGML_VERSION?=87ecb95cbc65dc8e58e3d88f4f4a59a0939796f5
CMAKE_ARGS+=-DGGML_MAX_NAME=128

View File

@@ -29,6 +29,34 @@
nvidia-cuda-12: "cuda12-llama-cpp"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-llama-cpp"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp"
- &llamacpp_tq
name: "llama-cpp-tq"
alias: "llama-cpp-tq"
license: mit
description: |
TurboQuant llama.cpp fork - quantization research
urls:
- https://github.com/TheTom/llama-cpp-turboquant
tags:
- text-to-text
- LLM
- CPU
- GPU
- Metal
- CUDA
- HIP
capabilities:
default: "cpu-llama-cpp-tq"
nvidia: "cuda12-llama-cpp-tq"
intel: "intel-sycl-f16-llama-cpp-tq"
amd: "rocm-llama-cpp-tq"
metal: "metal-llama-cpp-tq"
vulkan: "vulkan-llama-cpp-tq"
nvidia-l4t: "nvidia-l4t-arm64-llama-cpp-tq"
nvidia-cuda-13: "cuda13-llama-cpp-tq"
nvidia-cuda-12: "cuda12-llama-cpp-tq"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-llama-cpp-tq"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp-tq"
- &whispercpp
name: "whisper"
alias: "whisper"
@@ -1252,6 +1280,57 @@
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-llama-cpp"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-llama-cpp
# llama-cpp-tq (TurboQuant) concrete backends
- !!merge <<: *llamacpp_tq
name: "cpu-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-cpu-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "cuda12-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-12-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "cuda13-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-13-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "rocm-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-rocm-hipblas-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "intel-sycl-f16-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-intel-sycl-f16-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "intel-sycl-f32-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-intel-sycl-f32-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "vulkan-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-gpu-vulkan-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "metal-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "nvidia-l4t-arm64-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-arm64-llama-cpp-tq
- !!merge <<: *llamacpp_tq
name: "cuda13-nvidia-l4t-arm64-llama-cpp-tq"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-llama-cpp-tq"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-llama-cpp-tq
## whisper
- !!merge <<: *whispercpp
name: "nvidia-l4t-arm64-whisper"

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "GPU acceleration"
title = "GPU Acceleration"
weight = 9
url = "/features/gpu-acceleration/"
+++

View File

@@ -27,8 +27,7 @@ LocalAI provides a comprehensive set of features for running AI models locally.
- **[Realtime API](openai-realtime/)** - Low-latency multi-modal conversations (voice+text) over WebSocket
- **[Constrained Grammars](constrained_grammars/)** - Control model output format with BNF grammars
- **[GPU Acceleration](GPU-acceleration/)** - Optimize performance with GPU support
- **[Distributed Inference](distributed_inferencing/)** - Scale inference across multiple nodes
- **[Distributed Mode](distributed-mode/)** - Horizontal scaling with PostgreSQL, NATS, and remote backend nodes
- **[Distribution](distribution/)** - Scale inference across multiple nodes (P2P federation or production distributed mode)
- **[P2P API](p2p/)** - Monitor and manage P2P worker and federated nodes
- **[Model Context Protocol (MCP)](mcp/)** - Enable agentic capabilities with MCP integration
- **[Agents](agents/)** - Autonomous AI agents with tools, knowledge base, and skills

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "🤖 Agents"
title = "Agents"
weight = 21
url = '/features/agents'
+++

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "🔈 Audio to text"
title = "Audio to Text"
weight = 16
url = "/features/audio-to-text/"
+++

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "🔐 Authentication & Authorization"
title = "Authentication & Authorization"
weight = 26
url = '/features/authentication'
+++

View File

@@ -1,5 +1,5 @@
---
title: "⚙️ Backends"
title: "Backends"
description: "Learn how to use, manage, and develop backends in LocalAI"
weight: 4
url: "/backends/"

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "✍️ Constrained Grammars"
title = "Constrained Grammars"
weight = 15
url = "/features/constrained_grammars/"
+++

View File

@@ -5,7 +5,7 @@ weight = 14
url = "/features/distributed-mode/"
+++
Distributed mode enables horizontal scaling of LocalAI across multiple machines using **PostgreSQL** for state and node registry, and **NATS** for real-time coordination. Unlike the [P2P/federation approach](/features/distribute/), distributed mode is designed for production deployments and Kubernetes environments where you need centralized management, health monitoring, and deterministic routing.
Distributed mode enables horizontal scaling of LocalAI across multiple machines using **PostgreSQL** for state and node registry, and **NATS** for real-time coordination. Unlike the [P2P/federation approach]({{% relref "features/distributed_inferencing" %}}), distributed mode is designed for production deployments and Kubernetes environments where you need centralized management, health monitoring, and deterministic routing.
{{% notice note %}}
Distributed mode requires authentication enabled with a **PostgreSQL** database — SQLite is not supported. This is because the node registry, job store, and other distributed state are stored in PostgreSQL tables.

View File

@@ -1,12 +1,12 @@
+++
disableToc = false
title = "🆕🖧 Distributed Inference"
title = "P2P / Federated Inference"
weight = 15
url = "/features/distribute/"
+++
{{% notice tip %}}
Looking for production-grade horizontal scaling with PostgreSQL and NATS? See [Distributed Mode](/features/distributed-mode/).
Looking for production-grade horizontal scaling with PostgreSQL and NATS? See [Distributed Mode]({{% relref "features/distributed-mode" %}}).
{{% /notice %}}
This functionality enables LocalAI to distribute inference requests across multiple worker nodes, improving efficiency and performance. Nodes are automatically discovered and connect via p2p by using a shared token which makes sure the communication is secure and private between the nodes of the network.

View File

@@ -0,0 +1,34 @@
+++
disableToc = false
title = "Distribution"
weight = 13
url = "/features/distribution/"
+++
LocalAI supports distributing inference workloads across multiple machines. There are two approaches, each suited to different use cases:
## Distributed Mode (PostgreSQL + NATS)
Production-grade horizontal scaling with centralized management. Frontends are stateless LocalAI instances behind a load balancer; workers self-register and receive backends dynamically via NATS. State lives in PostgreSQL.
**Best for:** production deployments, Kubernetes, managed infrastructure.
[Read more]({{% relref "features/distributed-mode" %}})
## P2P / Federated Inference
Peer-to-peer networking via libp2p. Share a token to form a cluster with automatic discovery — no central server required. Supports federated load balancing and worker-mode weight sharding.
**Best for:** ad-hoc clusters, community sharing, quick experimentation.
[Read more]({{% relref "features/distributed_inferencing" %}})
## Quick Comparison
| | P2P / Federation | Distributed Mode |
|---|---|---|
| **Discovery** | Automatic via libp2p token | Self-registration to frontend URL |
| **State storage** | In-memory / ledger | PostgreSQL |
| **Coordination** | Gossip protocol | NATS messaging |
| **Node management** | Automatic | REST API + WebUI |
| **Setup complexity** | Minimal (share a token) | Requires PostgreSQL + NATS |

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "🧠 Embeddings"
title = "Embeddings"
weight = 13
url = "/features/embeddings/"
+++

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "🥽 GPT Vision"
title = "GPT Vision"
weight = 14
url = "/features/gpt-vision/"
+++

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "🎨 Image generation"
title = "Image Generation"
weight = 12
url = "/features/image-generation/"
+++

View File

@@ -1,5 +1,5 @@
+++
title = "🔗 Model Context Protocol (MCP)"
title = "Model Context Protocol (MCP)"
weight = 20
toc = true
description = "Agentic capabilities with Model Context Protocol integration"

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "🖼️ Model gallery"
title = "Model Gallery"
weight = 18
url = '/models'
+++

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "🔍 Object detection"
title = "Object Detection"
weight = 13
url = "/features/object-detection/"
+++

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "🔥 OpenAI functions and tools"
title = "OpenAI Functions and Tools"
weight = 17
url = "/features/openai-functions/"
+++

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "📈 Reranker"
title = "Reranker"
weight = 11
url = "/features/reranker/"
+++

View File

@@ -1,6 +1,6 @@
+++
disableToc = false
title = "⚙️ Runtime Settings"
title = "Runtime Settings"
weight = 25
url = '/features/runtime-settings'
+++

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "💾 Stores"
title = "Stores"
weight = 18
url = '/stores'
+++

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "📖 Text generation (GPT)"
title = "Text Generation (GPT)"
weight = 10
url = "/features/text-generation/"
+++

View File

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "🗣 Text to audio (TTS)"
title = "Text to Audio (TTS)"
weight = 11
url = "/features/text-to-audio/"
+++

View File

@@ -119,7 +119,7 @@ For production deployments or when you need more compute, LocalAI supports distr
- **P2P federation**: Connect multiple LocalAI instances for load-balanced inference
- **Model sharding**: Split large models across multiple machines
See the **Nodes** page in the web interface or the [Distribution docs]({{% relref "features/distribute" %}}) for setup instructions.
See the **Nodes** page in the web interface or the [Distribution docs]({{% relref "features/distribution" %}}) for setup instructions.
## What's Next?

View File

@@ -72,9 +72,10 @@ Feel free to open up a Pull request (by clicking at the "Edit page" below) to ge
### Home Automation
- [hass-openai-custom-conversation](https://github.com/drndos/hass-openai-custom-conversation) — Home Assistant integration
- [ha-llmvision](https://github.com/valentinfrlch/ha-llmvision) — Home Assistant LLM Vision
- [HA-LocalAI-Monitor](https://github.com/loryanstrant/HA-LocalAI-Monitor) — Home Assistant monitoring
- [Extended OpenAI Conversation](https://github.com/jekalmin/extended_openai_conversation) — Conversation agent for Home Assistant that supports a custom OpenAI endpoint
- [LLM Vision](https://github.com/valentinfrlch/ha-llmvision) — Image & video feed analysis for Home Assistant
- [OpenAI TTS Speech Service](https://github.com/sfortis/openai_tts) - OpenAI TTS custom component for Home Assistant
- [LocalAI Monitor](https://github.com/loryanstrant/HA-LocalAI-Monitor) — Monitor & control of LocalAI from Home Assistant
- Nextcloud [integration plugin](https://apps.nextcloud.com/apps/integration_openai) and [AI assistant](https://apps.nextcloud.com/apps/assistant)
### Automation & DevOps

View File

@@ -1,4 +1,38 @@
---
- name: "qwen3.5-35b-a3b-apex"
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
urls:
- https://huggingface.co/mudler/Qwen3.5-35B-A3B-APEX-GGUF
description: |
Describe the model in a clear and concise way that can be shared in a model gallery.
overrides:
backend: llama-cpp
function:
automatic_tool_parsing_fallback: true
grammar:
disable: true
known_usecases:
- chat
mmproj: llama-cpp/mmproj/Qwen3.5-35B-A3B-APEX-GGUF/mmproj-F16.gguf
options:
- use_jinja:true
parameters:
min_p: 0
model: llama-cpp/models/Qwen3.5-35B-A3B-APEX-GGUF/Qwen3.5-35B-A3B-APEX-Quality.gguf
presence_penalty: 1.5
repeat_penalty: 1
temperature: 0.7
top_k: 20
top_p: 0.8
template:
use_tokenizer_template: true
files:
- filename: llama-cpp/mmproj/Qwen3.5-35B-A3B-APEX-GGUF/mmproj-F16.gguf
sha256: a516ab92e8240da4734d68352bdfba84c16e830ee40010b8fac80d69c77272ff
uri: https://huggingface.co/mudler/Qwen3.5-35B-A3B-APEX-GGUF/resolve/main/mmproj-F16.gguf
- filename: llama-cpp/models/Qwen3.5-35B-A3B-APEX-GGUF/Qwen3.5-35B-A3B-APEX-Quality.gguf
sha256: 50887b60c77ee5c95bc3657814ae993abcab7b2d71868b9af1e84d6badd09a57
uri: https://huggingface.co/mudler/Qwen3.5-35B-A3B-APEX-GGUF/resolve/main/Qwen3.5-35B-A3B-APEX-Quality.gguf
- name: "qwen_qwen3.5-35b-a3b"
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
urls:

View File

@@ -1,151 +0,0 @@
#!/bin/bash
# apply_patches.sh — Generic patch fetcher and applier for any backend.
#
# Usage: ./apply_patches.sh <source-dir> <target-dir>
#
# <source-dir> Directory containing a patches/ folder (with optional sources.yaml)
# <target-dir> The cloned upstream repo to patch (e.g., llama.cpp/)
#
# Behavior (idempotent):
# 1. If patches/sources.yaml exists and yq is available, for each source:
# - If patches/<name>/ already has .patch files: skip fetching (vendored)
# - Otherwise: clone the fork at a pinned SHA, diff against the pinned
# upstream SHA, and generate patches
# 2. Apply all patches (skips already-applied ones)
# 3. Fails fast on any patch application error
#
# sources.yaml fields:
# name — subdirectory name for this source's patches
# repo — fork git URL
# version_var — Makefile variable holding the pinned fork commit SHA
# base_var — Makefile variable holding the pinned upstream commit SHA
# version_file — Makefile path (relative to backend dir)
set -e
# Use /tmp for patch temp files to avoid macOS long-path issues
export TMPDIR="${TMPDIR_OVERRIDE:-/tmp}"
read_makefile_var() {
grep -m1 "^${1}?=" "$2" | cut -d'=' -f2
}
apply_one_patch() {
local target_dir="$1"
local patch_file="$2"
local label="$3"
if patch -d "$target_dir" -p1 --reverse --dry-run < "$patch_file" >/dev/null 2>&1; then
echo " Already applied, skipping: $label"
return 0
fi
echo " Applying: $label"
patch -d "$target_dir" -p1 --forward < "$patch_file" || { echo "FAILED: $patch_file"; exit 1; }
}
apply_patches() {
local SOURCE_DIR="$(cd "$1" && pwd)"
local TARGET_DIR="$2"
local PATCHES_DIR="$SOURCE_DIR/patches"
if [ ! -d "$PATCHES_DIR" ]; then
return 0
fi
# Phase 1: Generate missing patches from fork sources
if [ -f "$PATCHES_DIR/sources.yaml" ] && command -v yq &>/dev/null; then
local SOURCE_COUNT
SOURCE_COUNT=$(yq '.sources | length' "$PATCHES_DIR/sources.yaml")
for i in $(seq 0 $((SOURCE_COUNT - 1))); do
local NAME REPO VERSION_VAR BASE_VAR VERSION_FILE
NAME=$(yq ".sources[$i].name" "$PATCHES_DIR/sources.yaml")
REPO=$(yq ".sources[$i].repo" "$PATCHES_DIR/sources.yaml")
VERSION_VAR=$(yq ".sources[$i].version_var" "$PATCHES_DIR/sources.yaml")
BASE_VAR=$(yq ".sources[$i].base_var" "$PATCHES_DIR/sources.yaml")
VERSION_FILE=$(yq ".sources[$i].version_file" "$PATCHES_DIR/sources.yaml")
local MAKEFILE="$SOURCE_DIR/$VERSION_FILE"
local FORK_SHA BASE_SHA
FORK_SHA=$(read_makefile_var "$VERSION_VAR" "$MAKEFILE")
BASE_SHA=$(read_makefile_var "$BASE_VAR" "$MAKEFILE")
if [ -z "$FORK_SHA" ] || [ -z "$BASE_SHA" ]; then
echo "WARNING: Could not read $VERSION_VAR or $BASE_VAR from $MAKEFILE — skipping '$NAME'"
continue
fi
local SOURCE_PATCH_DIR="$PATCHES_DIR/$NAME"
local EXISTING
EXISTING=$(ls "$SOURCE_PATCH_DIR"/*.patch 2>/dev/null | wc -l)
if [ "$EXISTING" -gt 0 ]; then
echo "Patches [$NAME]: $EXISTING patches already present — skipping fetch."
else
echo "Patches [$NAME]: generating from $REPO"
echo " base (upstream): ${BASE_SHA:0:12}"
echo " head (fork): ${FORK_SHA:0:12}"
local TMPDIR_CLONE
TMPDIR_CLONE=$(mktemp -d)
if git clone "$REPO" "$TMPDIR_CLONE/fork" 2>&1; then
cd "$TMPDIR_CLONE/fork"
# Fetch the upstream base commit (may not be in the fork's history)
git fetch origin "$FORK_SHA" 2>&1 || true
git checkout "$FORK_SHA" 2>&1
# We need the base commit in the history to compute the diff.
# If the fork is a real GitHub fork, it shares history with upstream.
# Otherwise, fetch it explicitly.
if ! git cat-file -e "$BASE_SHA" 2>/dev/null; then
echo " Base commit not in fork history — fetching from upstream"
local UPSTREAM_URL
# Derive upstream URL from base_var context or use llama.cpp default
UPSTREAM_URL=$(yq ".sources[$i].upstream_repo // \"\"" "$PATCHES_DIR/sources.yaml")
if [ -n "$UPSTREAM_URL" ] && [ "$UPSTREAM_URL" != "null" ]; then
git remote add upstream "$UPSTREAM_URL" 2>/dev/null || true
git fetch upstream 2>&1
fi
fi
local PATCH_COUNT
PATCH_COUNT=$(git rev-list --count "$BASE_SHA".."$FORK_SHA" 2>/dev/null || echo "0")
echo " $PATCH_COUNT commits in diff"
if [ "$PATCH_COUNT" -gt 0 ]; then
mkdir -p "$SOURCE_PATCH_DIR"
git format-patch "$BASE_SHA".."$FORK_SHA" -o "$SOURCE_PATCH_DIR/" >/dev/null 2>&1
echo " Generated $PATCH_COUNT patches in patches/$NAME/"
fi
cd "$SOURCE_DIR"
else
echo "WARNING: Failed to clone $REPO — skipping source '$NAME'"
fi
rm -rf "$TMPDIR_CLONE"
fi
done
elif [ -f "$PATCHES_DIR/sources.yaml" ]; then
echo "WARNING: yq not found — skipping source-based patch generation."
fi
# Phase 2: Apply patches (subdirectories first, then top-level)
for source_dir in $(find "$PATCHES_DIR" -mindepth 1 -maxdepth 1 -type d | sort); do
for p in $(ls "$source_dir"/*.patch 2>/dev/null | sort); do
apply_one_patch "$TARGET_DIR" "$p" "$(basename "$source_dir")/$(basename "$p")"
done
done
for p in $(ls "$PATCHES_DIR"/*.patch 2>/dev/null | sort); do
apply_one_patch "$TARGET_DIR" "$p" "$(basename "$p")"
done
}
# Run with arguments
if [ $# -lt 2 ]; then
echo "Usage: $0 <source-dir> <target-dir>"
exit 1
fi
apply_patches "$1" "$2"