feat(turboquant.cpp): add new backend

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Update asset links in README.md
2026-07-05 22:09:02 -04:00 · 2026-04-03 20:57:15 +00:00 · 2026-04-03 10:24:08 +02:00 · 2026-04-03 10:23:03 +02:00 · 2026-04-03 10:14:13 +02:00 · 2026-04-03 09:46:06 +02:00
45 changed files with 436 additions and 270 deletions
--- a/.github/gallery-agent/agent.go
+++ b/.github/gallery-agent/agent.go
@@ -133,6 +133,7 @@ func getRealReadme(ctx context.Context, repository string) (string, error) {
 	result, err := cogito.ExecuteTools(llm, fragment,
 		cogito.WithIterations(3),
 		cogito.WithMaxAttempts(3),
+		cogito.DisableSinkState,
 		cogito.WithTools(&HFReadmeTool{client: hfapi.NewClient()}))
 	if err != nil {
 		return "", err
--- a/.github/gallery-agent/gallery.go
+++ b/.github/gallery-agent/gallery.go
@@ -79,7 +79,20 @@ func generateYAMLEntry(model ProcessedModel, quantization string) string {
 	description = cleanTextContent(description)
 	formattedDescription := formatTextContent(description)

-	configFile := formatTextContent(modelConfig.ConfigFile)
+	// Strip name and description from config file since they are
+	// already present at the gallery entry level and should not
+	// appear under overrides.
+	configFileContent := modelConfig.ConfigFile
+	var cfgMap map[string]any
+	if err := yaml.Unmarshal([]byte(configFileContent), &cfgMap); err == nil {
+		delete(cfgMap, "name")
+		delete(cfgMap, "description")
+		if cleaned, err := yaml.Marshal(cfgMap); err == nil {
+			configFileContent = string(cleaned)
+		}
+	}
+
+	configFile := formatTextContent(configFileContent)

 	filesYAML, _ := yaml.Marshal(modelConfig.Files)

--- a/.github/gallery-agent/testing.go
+++ b/.github/gallery-agent/testing.go
@@ -17,7 +17,7 @@ func runSyntheticMode() error {
 	fmt.Printf("Generating %d synthetic models for testing...\n", numModels)

 	var models []ProcessedModel
-	for i := range numModels {
+	for range numModels {
 		model := generator.GenerateProcessedModel()
 		models = append(models, model)
 		fmt.Printf("Generated synthetic model: %s\n", model.ModelID)
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
@@ -1828,6 +1828,98 @@ jobs:
            dockerfile: "./backend/Dockerfile.llama-cpp"
            context: "./"
            ubuntu-version: '2404'
+          # llama-cpp-tq (TurboQuant fork)
+          - build-type: ''
+            cuda-major-version: ""
+            cuda-minor-version: ""
+            platforms: 'linux/amd64,linux/arm64'
+            tag-latest: 'auto'
+            tag-suffix: '-cpu-llama-cpp-tq'
+            runs-on: 'bigger-runner'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            backend: "llama-cpp-tq"
+            dockerfile: "./backend/Dockerfile.llama-cpp"
+            context: "./"
+            ubuntu-version: '2404'
+          - build-type: 'cublas'
+            cuda-major-version: "12"
+            cuda-minor-version: "8"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-12-llama-cpp-tq'
+            runs-on: 'bigger-runner'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            backend: "llama-cpp-tq"
+            dockerfile: "./backend/Dockerfile.llama-cpp"
+            context: "./"
+            ubuntu-version: '2404'
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-13-llama-cpp-tq'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            backend: "llama-cpp-tq"
+            dockerfile: "./backend/Dockerfile.llama-cpp"
+            context: "./"
+            ubuntu-version: '2404'
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/arm64'
+            skip-drivers: 'false'
+            tag-latest: 'auto'
+            tag-suffix: '-nvidia-l4t-cuda-13-arm64-llama-cpp-tq'
+            base-image: "ubuntu:24.04"
+            runs-on: 'ubuntu-24.04-arm'
+            ubuntu-version: '2404'
+            backend: "llama-cpp-tq"
+            dockerfile: "./backend/Dockerfile.llama-cpp"
+            context: "./"
+          - build-type: 'hipblas'
+            cuda-major-version: ""
+            cuda-minor-version: ""
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-rocm-hipblas-llama-cpp-tq'
+            runs-on: 'ubuntu-latest'
+            base-image: "rocm/dev-ubuntu-24.04:6.4.4"
+            skip-drivers: 'false'
+            backend: "llama-cpp-tq"
+            dockerfile: "./backend/Dockerfile.llama-cpp"
+            context: "./"
+            ubuntu-version: '2404'
+          - build-type: 'cublas'
+            cuda-major-version: "12"
+            cuda-minor-version: "0"
+            platforms: 'linux/arm64'
+            skip-drivers: 'false'
+            tag-latest: 'auto'
+            tag-suffix: '-nvidia-l4t-arm64-llama-cpp-tq'
+            base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
+            runs-on: 'ubuntu-24.04-arm'
+            backend: "llama-cpp-tq"
+            dockerfile: "./backend/Dockerfile.llama-cpp"
+            context: "./"
+            ubuntu-version: '2204'
+          - build-type: 'vulkan'
+            cuda-major-version: ""
+            cuda-minor-version: ""
+            platforms: 'linux/amd64,linux/arm64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-vulkan-llama-cpp-tq'
+            runs-on: 'bigger-runner'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            backend: "llama-cpp-tq"
+            dockerfile: "./backend/Dockerfile.llama-cpp"
+            context: "./"
+            ubuntu-version: '2404'
          # Stablediffusion-ggml
          - build-type: ''
            cuda-major-version: ""
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -15,9 +15,10 @@ jobs:
            branch: "master"
            file: "backend/cpp/llama-cpp/Makefile"
          - repository: "TheTom/llama-cpp-turboquant"
-            variable: "TURBOQUANT_VERSION"
-            branch: "feature/turboquant-kv-cache"
-            file: "backend/cpp/llama-cpp/Makefile"
+            variable: "LLAMA_VERSION"
+            branch: "master"
+            file: "backend/cpp/llama-cpp-tq/Makefile"
+            branch_suffix: "-tq"
          - repository: "ggml-org/whisper.cpp"
            variable: "WHISPER_CPP_VERSION"
            branch: "master"
@@ -64,6 +65,9 @@ jobs:
          push-to-fork: ci-forks/LocalAI
          commit-message: ':arrow_up: Update ${{ matrix.repository }}'
          title: 'chore: :arrow_up: Update ${{ matrix.repository }} to `${{ steps.bump.outputs.commit }}`'
-          branch: "update/${{ matrix.variable }}"
+          branch: "update/${{ matrix.variable }}${{ matrix.branch_suffix }}"
          body: ${{ steps.bump.outputs.message }}
          signoff: true
+
+
+
--- a/.github/workflows/gallery-agent.yaml
+++ b/.github/workflows/gallery-agent.yaml
@@ -55,7 +55,7 @@ jobs:
      - name: Run gallery agent
        env:
          #OPENAI_MODEL: ${{ secrets.OPENAI_MODEL }}
-          OPENAI_MODE: Qwen3.5-2B-GGUF
+          OPENAI_MODEL: Qwen3.5-2B-GGUF
          OPENAI_BASE_URL: "http://localhost:8080"
          OPENAI_KEY: ${{ secrets.OPENAI_KEY }}
          #OPENAI_BASE_URL: ${{ secrets.OPENAI_BASE_URL }}
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@ prepare-sources
 /backend/cpp/llama-cpp/llama.cpp
 /backend/cpp/llama-*
 !backend/cpp/llama-cpp
+!backend/cpp/llama-cpp-tq
 /backends
 /backend-images
 /result.yaml
--- a/Makefile
+++ b/Makefile
@@ -544,8 +544,9 @@ backend-images:
 	mkdir -p backend-images

 # Backend metadata: BACKEND_NAME | DOCKERFILE_TYPE | BUILD_CONTEXT | PROGRESS_FLAG | NEEDS_BACKEND_ARG
-# llama-cpp is special - uses llama-cpp Dockerfile and doesn't need BACKEND arg
+# llama-cpp and forks - use llama-cpp Dockerfile
 BACKEND_LLAMA_CPP = llama-cpp|llama-cpp|.|false|false
+BACKEND_LLAMA_CPP_TQ = llama-cpp-tq|llama-cpp|.|false|true

 # Golang backends
 BACKEND_PIPER = piper|golang|.|false|true
@@ -609,6 +610,7 @@ endef

 # Generate all docker-build targets
 $(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP)))
+$(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP_TQ)))
 $(eval $(call generate-docker-build-target,$(BACKEND_PIPER)))
 $(eval $(call generate-docker-build-target,$(BACKEND_LOCAL_STORE)))
 $(eval $(call generate-docker-build-target,$(BACKEND_HUGGINGFACE)))
--- a/README.md
+++ b/README.md
@@ -42,16 +42,38 @@ Created and maintained by [Ettore Di Giacinto](https://github.com/mudler).

 > [:book: Documentation](https://localai.io/) | [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) | [💻 Quickstart](https://localai.io/basics/getting_started/) | [🖼️ Models](https://models.localai.io/) | [❓FAQ](https://localai.io/faq/)

-## Screenshots
-
-### Chat, Model gallery
+## Guided tour

 https://github.com/user-attachments/assets/08cbb692-57da-48f7-963d-2e7b43883c18

-### Agents
+<details>
+
+<summary>
+Click to see more!
+</summary>
+
+#### User and auth
+
+https://github.com/user-attachments/assets/228fa9ad-81a3-4d43-bfb9-31557e14a36c
+
+#### Agents

 https://github.com/user-attachments/assets/6270b331-e21d-4087-a540-6290006b381a

+#### Usage metrics per user
+
+https://github.com/user-attachments/assets/cbb03379-23b4-4e3d-bd26-d152f057007f
+
+#### Fine-tuning and Quantization
+
+https://github.com/user-attachments/assets/5ba4ace9-d3df-4795-b7d4-b0b404ea71ee
+
+#### WebRTC
+
+https://github.com/user-attachments/assets/ed88e34c-fed3-4b83-8a67-4716a9feeb7b
+
+</details>
+
 ## Quickstart

 ### macOS
--- a/backend/Dockerfile.llama-cpp
+++ b/backend/Dockerfile.llama-cpp
@@ -58,7 +58,9 @@ ARG CUDA_DOCKER_ARCH
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 ARG CMAKE_ARGS
 ENV CMAKE_ARGS=${CMAKE_ARGS}
-ARG BACKEND=rerankers
+ARG BACKEND=llama-cpp
+ARG LLAMA_BACKEND_DIR=${BACKEND}
+ENV LLAMA_BACKEND_DIR=${LLAMA_BACKEND_DIR}
 ARG BUILD_TYPE
 ENV BUILD_TYPE=${BUILD_TYPE}
 ARG CUDA_MAJOR_VERSION
@@ -255,32 +257,27 @@ if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
  CUDA_ARCH_ESC="${CUDA_DOCKER_ARCH//;/\\;}"
  export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
  echo "CMAKE_ARGS(env) = ${CMAKE_ARGS}"
-  rm -rf /LocalAI/backend/cpp/llama-cpp-*-build
+  rm -rf /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR}-*-build
 fi

+cd /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR}
+
 if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
-  cd /LocalAI/backend/cpp/llama-cpp
-  make llama-cpp-fallback
-  make llama-cpp-grpc
-  make llama-cpp-rpc-server
+  make ARCH=aarch64 build-variants
 else
-  cd /LocalAI/backend/cpp/llama-cpp
-  make llama-cpp-avx
-  make llama-cpp-avx2
-  make llama-cpp-avx512
-  make llama-cpp-fallback
-  make llama-cpp-grpc
-  make llama-cpp-rpc-server
+  make build-variants
 fi
 EOT


 # Copy libraries using a script to handle architecture differences
-RUN make -BC /LocalAI/backend/cpp/llama-cpp package
+RUN make -BC /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR} package


 FROM scratch

+ARG BACKEND=llama-cpp
+ARG LLAMA_BACKEND_DIR=${BACKEND}

 # Copy all available binaries (the build process only creates the appropriate ones for the target architecture)
-COPY --from=builder /LocalAI/backend/cpp/llama-cpp/package/. ./
+COPY --from=builder /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR}/package/. ./
--- a/backend/cpp/llama-cpp-tq/Makefile
+++ b/backend/cpp/llama-cpp-tq/Makefile
@@ -0,0 +1,6 @@
+LLAMA_VERSION?=master
+LLAMA_REPO?=https://github.com/TheTom/llama-cpp-turboquant
+BACKEND_NAME?=llama-cpp-tq
+SHARED_DIR?=$(CURDIR)/../llama-cpp
+
+include ../llama-cpp/Makefile
--- a/backend/cpp/llama-cpp/CMakeLists.txt
+++ b/backend/cpp/llama-cpp/CMakeLists.txt
@@ -59,6 +59,11 @@ add_library(hw_grpc_proto

 add_executable(${TARGET} grpc-server.cpp json.hpp httplib.h)

+# Enable autoparser support only if the llama.cpp tree ships common/chat-auto-parser.h (not present in all llama.cpp forks)
+if(EXISTS "${CMAKE_SOURCE_DIR}/common/chat-auto-parser.h")
+    target_compile_definitions(${TARGET} PRIVATE HAS_AUTOPARSER)
+endif()
+
 target_include_directories(${TARGET} PRIVATE ../llava)
 target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})

--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,8 +1,10 @@

-LLAMA_VERSION?=0fcb3760b2b9a3a496ef14621a7e4dad7a8df90f
+LLAMA_VERSION?=a1cfb645307edc61a89e41557f290f441043d3c2
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
-
-TURBOQUANT_VERSION?=8ad0f00e9a38df6c29fc10363341dde300f92ae4
+BACKEND_NAME?=llama-cpp
+SHARED_DIR?=$(CURDIR)
+GRPC_SERVER_DIR?=tools/grpc-server
+SERVER_SOURCE_DIR?=tools/server

 CMAKE_ARGS?=
 BUILD_TYPE?=
@@ -69,6 +71,17 @@ ifeq ($(BUILD_TYPE),sycl_f32)
 		-DCMAKE_CXX_FLAGS="-fsycl"
 endif

+# Variants to build for each architecture (can be overridden by forks)
+X86_64_VARIANTS ?= llama-cpp-avx llama-cpp-avx2 llama-cpp-avx512 llama-cpp-fallback llama-cpp-grpc llama-cpp-rpc-server
+ARM64_VARIANTS ?= llama-cpp-fallback llama-cpp-grpc llama-cpp-rpc-server
+
+build-variants:
+ifeq ($(ARCH),aarch64)
+	@for v in $(ARM64_VARIANTS); do $(MAKE) $$v || exit 1; done
+else
+	@for v in $(X86_64_VARIANTS); do $(MAKE) $$v || exit 1; done
+endif
+
 INSTALLED_PACKAGES=$(CURDIR)/../grpc/installed_packages
 INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
 ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
@@ -92,42 +105,42 @@ else
 endif

 llama-cpp-avx2: llama.cpp
-	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build
-	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build purge
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build purge
 	$(info ${GREEN}I llama-cpp build info:avx2${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx2-build" build-llama-cpp-grpc-server
-	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build/grpc-server llama-cpp-avx2
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="$(BACKEND_NAME)-avx2-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build/grpc-server llama-cpp-avx2

 llama-cpp-avx512: llama.cpp
-	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build
-	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build purge
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build purge
 	$(info ${GREEN}I llama-cpp build info:avx512${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx512-build" build-llama-cpp-grpc-server
-	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build/grpc-server llama-cpp-avx512
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="$(BACKEND_NAME)-avx512-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build/grpc-server llama-cpp-avx512

 llama-cpp-avx: llama.cpp
-	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build
-	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build purge
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build purge
 	$(info ${GREEN}I llama-cpp build info:avx${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-avx-build" build-llama-cpp-grpc-server
-	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build/grpc-server llama-cpp-avx
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="$(BACKEND_NAME)-avx-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build/grpc-server llama-cpp-avx

 llama-cpp-fallback: llama.cpp
-	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build
-	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build purge
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build purge
 	$(info ${GREEN}I llama-cpp build info:fallback${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-fallback-build" build-llama-cpp-grpc-server
-	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build/grpc-server llama-cpp-fallback
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="$(BACKEND_NAME)-fallback-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build/grpc-server llama-cpp-fallback

 llama-cpp-grpc: llama.cpp
-	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build
-	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build purge
 	$(info ${GREEN}I llama-cpp build info:grpc${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-cpp-grpc-build" build-llama-cpp-grpc-server
-	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/grpc-server llama-cpp-grpc
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="$(BACKEND_NAME)-grpc-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build/grpc-server llama-cpp-grpc

 llama-cpp-rpc-server: llama-cpp-grpc
-	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server

 llama.cpp:
 	mkdir -p llama.cpp
@@ -135,30 +148,30 @@ llama.cpp:
 	git init && \
 	git remote add origin $(LLAMA_REPO)  && \
 	git fetch origin && \
-	git checkout -b build $(LLAMA_VERSION) && \
+	(git checkout -b build $(LLAMA_VERSION) || git checkout -b build origin/$(LLAMA_VERSION)) && \
 	git submodule update --init --recursive --depth 1 --single-branch

-llama.cpp/tools/grpc-server: llama.cpp
-	mkdir -p llama.cpp/tools/grpc-server
-	bash prepare.sh
+llama.cpp/$(GRPC_SERVER_DIR): llama.cpp
+	mkdir -p llama.cpp/$(GRPC_SERVER_DIR)
+	SHARED_DIR=$(SHARED_DIR) SERVER_SOURCE_DIR=$(SERVER_SOURCE_DIR) GRPC_SERVER_DIR=$(GRPC_SERVER_DIR) bash $(SHARED_DIR)/prepare.sh

 rebuild:
-	bash prepare.sh
+	SHARED_DIR=$(SHARED_DIR) SERVER_SOURCE_DIR=$(SERVER_SOURCE_DIR) GRPC_SERVER_DIR=$(GRPC_SERVER_DIR) bash $(SHARED_DIR)/prepare.sh
 	rm -rf grpc-server
 	$(MAKE) grpc-server

 package:
-	bash package.sh
+	bash $(SHARED_DIR)/package.sh

 purge:
 	rm -rf llama.cpp/build
-	rm -rf llama.cpp/tools/grpc-server
+	rm -rf llama.cpp/$(GRPC_SERVER_DIR)
 	rm -rf grpc-server

 clean: purge
 	rm -rf llama.cpp

-grpc-server: llama.cpp llama.cpp/tools/grpc-server
+grpc-server: llama.cpp llama.cpp/$(GRPC_SERVER_DIR)
 	@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 	+bash -c "source $(ONEAPI_VARS); \
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -17,7 +17,9 @@
 #include "backend.pb.h"
 #include "backend.grpc.pb.h"
 #include "common.h"
+#ifdef HAS_AUTOPARSER
 #include "chat-auto-parser.h"
+#endif
 #include <getopt.h>
 #include <grpcpp/ext/proto_server_reflection_plugin.h>
 #include <grpcpp/grpcpp.h>
@@ -2665,6 +2667,7 @@ public:
        
        response->set_rendered_template(rendered_template);

+#ifdef HAS_AUTOPARSER
        // Run differential template analysis to detect tool format markers
        if (params_base.use_jinja) {
            try {
@@ -2770,6 +2773,7 @@ public:
                SRV_WRN("ModelMetadata: failed to run autoparser analysis: %s\n", e.what());
            }
        }
+#endif

        return grpc::Status::OK;
    }
--- a/backend/cpp/llama-cpp/package.sh
+++ b/backend/cpp/llama-cpp/package.sh
@@ -5,14 +5,21 @@

 set -e

-CURDIR=$(dirname "$(realpath $0)")
-REPO_ROOT="${CURDIR}/../../.."
+# Use working directory (not script location) so forks that share this script work correctly
+CURDIR=$(pwd)
+SCRIPT_DIR=$(dirname "$(realpath $0)")
+REPO_ROOT="${SCRIPT_DIR}/../../.."

 # Create lib directory
 mkdir -p $CURDIR/package/lib

 cp -avrf $CURDIR/llama-cpp-* $CURDIR/package/
-cp -rfv $CURDIR/run.sh $CURDIR/package/
+# Copy run.sh — prefer local copy, fall back to shared dir (script location)
+if [ -f "$CURDIR/run.sh" ]; then
+    cp -rfv $CURDIR/run.sh $CURDIR/package/
+else
+    cp -rfv $SCRIPT_DIR/run.sh $CURDIR/package/
+fi

 # Detect architecture and copy appropriate libraries
 if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
--- a/backend/cpp/llama-cpp/patches/sources.yaml
+++ b/backend/cpp/llama-cpp/patches/sources.yaml
@@ -1,14 +0,0 @@
-# Patch sources for the llama-cpp backend.
-# Each source declares a fork whose commits are extracted as patches
-# and applied on top of upstream llama.cpp during the build.
-# See scripts/patch_utils/apply_patches.sh for the generic patch engine.
-#
-# version_var: Makefile variable with the pinned fork commit SHA
-# base_var:    Makefile variable with the upstream base commit SHA
-# Both are read from version_file (relative to backend dir) to compute the diff.
-sources:
-  - name: turboquant
-    repo: https://github.com/TheTom/llama-cpp-turboquant.git
-    version_var: TURBOQUANT_VERSION
-    base_var: LLAMA_VERSION
-    version_file: Makefile
--- a/backend/cpp/llama-cpp/prepare.sh
+++ b/backend/cpp/llama-cpp/prepare.sh
@@ -1,26 +1,43 @@
 #!/bin/bash
+
+SHARED_DIR="${SHARED_DIR:-.}"
+SERVER_SOURCE_DIR="${SERVER_SOURCE_DIR:-tools/server}"
+GRPC_SERVER_DIR="${GRPC_SERVER_DIR:-tools/grpc-server}"
+
+## Apply patches from the `patches` directory
+if [ -d "patches" ]; then
+    for patch in $(ls patches); do
+        echo "Applying patch $patch"
+        patch -d llama.cpp/ -p1 < patches/$patch
+    done
+fi
+
 set -e

-SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
-REPO_ROOT="$SCRIPT_DIR/../../.."
-
-## Apply patches from sources and/or local .patch files
-"$REPO_ROOT/scripts/patch_utils/apply_patches.sh" "$SCRIPT_DIR" llama.cpp
-
-## Copy server files into grpc-server build directory
-for file in $(ls llama.cpp/tools/server/); do
-    cp -rfv llama.cpp/tools/server/$file llama.cpp/tools/grpc-server/
+# Copy server source files into grpc-server build directory
+for file in $(ls llama.cpp/${SERVER_SOURCE_DIR}/); do
+    cp -rfv llama.cpp/${SERVER_SOURCE_DIR}/$file llama.cpp/${GRPC_SERVER_DIR}/
 done

-cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
-cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
-cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/tools/grpc-server/
-cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/tools/grpc-server/
+# Copy build files — prefer local overrides, fall back to SHARED_DIR
+for f in CMakeLists.txt grpc-server.cpp; do
+    if [ -f "$f" ]; then
+        cp -r "$f" llama.cpp/${GRPC_SERVER_DIR}/
+    else
+        cp -r "$SHARED_DIR/$f" llama.cpp/${GRPC_SERVER_DIR}/
+    fi
+done
+
+cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/${GRPC_SERVER_DIR}/
+cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/${GRPC_SERVER_DIR}/
+
+# Add grpc-server subdirectory to the parent CMakeLists.txt
+PARENT_CMAKELISTS="llama.cpp/$(dirname ${GRPC_SERVER_DIR})/CMakeLists.txt"

 set +e
-if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then
+if grep -q "grpc-server" "$PARENT_CMAKELISTS"; then
    echo "grpc-server already added"
 else
-    echo "add_subdirectory(grpc-server)" >> llama.cpp/tools/CMakeLists.txt
+    echo "add_subdirectory(grpc-server)" >> "$PARENT_CMAKELISTS"
 fi
 set -e
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=09b12d5f6d51d862749e8e0ee8baac8f012089e2
+STABLEDIFFUSION_GGML_VERSION?=87ecb95cbc65dc8e58e3d88f4f4a59a0939796f5

 CMAKE_ARGS+=-DGGML_MAX_NAME=128

--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -29,6 +29,34 @@
    nvidia-cuda-12: "cuda12-llama-cpp"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-llama-cpp"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp"
+- &llamacpp_tq
+  name: "llama-cpp-tq"
+  alias: "llama-cpp-tq"
+  license: mit
+  description: |
+    TurboQuant llama.cpp fork - quantization research
+  urls:
+    - https://github.com/TheTom/llama-cpp-turboquant
+  tags:
+    - text-to-text
+    - LLM
+    - CPU
+    - GPU
+    - Metal
+    - CUDA
+    - HIP
+  capabilities:
+    default: "cpu-llama-cpp-tq"
+    nvidia: "cuda12-llama-cpp-tq"
+    intel: "intel-sycl-f16-llama-cpp-tq"
+    amd: "rocm-llama-cpp-tq"
+    metal: "metal-llama-cpp-tq"
+    vulkan: "vulkan-llama-cpp-tq"
+    nvidia-l4t: "nvidia-l4t-arm64-llama-cpp-tq"
+    nvidia-cuda-13: "cuda13-llama-cpp-tq"
+    nvidia-cuda-12: "cuda12-llama-cpp-tq"
+    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-llama-cpp-tq"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp-tq"
 - &whispercpp
  name: "whisper"
  alias: "whisper"
@@ -1252,6 +1280,57 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-llama-cpp"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-13-llama-cpp
+# llama-cpp-tq (TurboQuant) concrete backends
+- !!merge <<: *llamacpp_tq
+  name: "cpu-llama-cpp-tq"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-llama-cpp-tq"
+  mirrors:
+    - localai/localai-backends:latest-cpu-llama-cpp-tq
+- !!merge <<: *llamacpp_tq
+  name: "cuda12-llama-cpp-tq"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-llama-cpp-tq"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-12-llama-cpp-tq
+- !!merge <<: *llamacpp_tq
+  name: "cuda13-llama-cpp-tq"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-llama-cpp-tq"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-13-llama-cpp-tq
+- !!merge <<: *llamacpp_tq
+  name: "rocm-llama-cpp-tq"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-llama-cpp-tq"
+  mirrors:
+    - localai/localai-backends:latest-gpu-rocm-hipblas-llama-cpp-tq
+- !!merge <<: *llamacpp_tq
+  name: "intel-sycl-f16-llama-cpp-tq"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-llama-cpp-tq"
+  mirrors:
+    - localai/localai-backends:latest-gpu-intel-sycl-f16-llama-cpp-tq
+- !!merge <<: *llamacpp_tq
+  name: "intel-sycl-f32-llama-cpp-tq"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-llama-cpp-tq"
+  mirrors:
+    - localai/localai-backends:latest-gpu-intel-sycl-f32-llama-cpp-tq
+- !!merge <<: *llamacpp_tq
+  name: "vulkan-llama-cpp-tq"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-llama-cpp-tq"
+  mirrors:
+    - localai/localai-backends:latest-gpu-vulkan-llama-cpp-tq
+- !!merge <<: *llamacpp_tq
+  name: "metal-llama-cpp-tq"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-llama-cpp-tq"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-llama-cpp-tq
+- !!merge <<: *llamacpp_tq
+  name: "nvidia-l4t-arm64-llama-cpp-tq"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-llama-cpp-tq"
+  mirrors:
+    - localai/localai-backends:latest-nvidia-l4t-arm64-llama-cpp-tq
+- !!merge <<: *llamacpp_tq
+  name: "cuda13-nvidia-l4t-arm64-llama-cpp-tq"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-llama-cpp-tq"
+  mirrors:
+    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-llama-cpp-tq
 ## whisper
 - !!merge <<: *whispercpp
  name: "nvidia-l4t-arm64-whisper"
--- a/docs/content/features/GPU-acceleration.md
+++ b/docs/content/features/GPU-acceleration.md
@@ -1,6 +1,6 @@
 +++
 disableToc = false
-title = "⚡ GPU acceleration"
+title = "GPU Acceleration"
 weight = 9
 url = "/features/gpu-acceleration/"
 +++
--- a/docs/content/features/_index.en.md
+++ b/docs/content/features/_index.en.md
@@ -27,8 +27,7 @@ LocalAI provides a comprehensive set of features for running AI models locally.
 - **[Realtime API](openai-realtime/)** - Low-latency multi-modal conversations (voice+text) over WebSocket
 - **[Constrained Grammars](constrained_grammars/)** - Control model output format with BNF grammars
 - **[GPU Acceleration](GPU-acceleration/)** - Optimize performance with GPU support
-- **[Distributed Inference](distributed_inferencing/)** - Scale inference across multiple nodes
-- **[Distributed Mode](distributed-mode/)** - Horizontal scaling with PostgreSQL, NATS, and remote backend nodes
+- **[Distribution](distribution/)** - Scale inference across multiple nodes (P2P federation or production distributed mode)
 - **[P2P API](p2p/)** - Monitor and manage P2P worker and federated nodes
 - **[Model Context Protocol (MCP)](mcp/)** - Enable agentic capabilities with MCP integration
 - **[Agents](agents/)** - Autonomous AI agents with tools, knowledge base, and skills
--- a/docs/content/features/agents.md
+++ b/docs/content/features/agents.md
@@ -1,6 +1,6 @@
 +++
 disableToc = false
-title = "🤖 Agents"
+title = "Agents"
 weight = 21
 url = '/features/agents'
 +++
--- a/docs/content/features/audio-to-text.md
+++ b/docs/content/features/audio-to-text.md
@@ -1,6 +1,6 @@
 +++
 disableToc = false
-title = "🔈 Audio to text"
+title = "Audio to Text"
 weight = 16
 url = "/features/audio-to-text/"
 +++
--- a/docs/content/features/authentication.md
+++ b/docs/content/features/authentication.md
@@ -1,6 +1,6 @@
 +++
 disableToc = false
-title = "🔐 Authentication & Authorization"
+title = "Authentication & Authorization"
 weight = 26
 url = '/features/authentication'
 +++
--- a/docs/content/features/backends.md
+++ b/docs/content/features/backends.md
@@ -1,5 +1,5 @@
 ---
-title: "⚙️ Backends"
+title: "Backends"
 description: "Learn how to use, manage, and develop backends in LocalAI"
 weight: 4
 url: "/backends/"
--- a/docs/content/features/constrained_grammars.md
+++ b/docs/content/features/constrained_grammars.md
@@ -1,6 +1,6 @@
 +++
 disableToc = false
-title = "✍️ Constrained Grammars"
+title = "Constrained Grammars"
 weight = 15
 url = "/features/constrained_grammars/"
 +++
--- a/docs/content/features/distributed-mode.md
+++ b/docs/content/features/distributed-mode.md
@@ -5,7 +5,7 @@ weight = 14
 url = "/features/distributed-mode/"
 +++

-Distributed mode enables horizontal scaling of LocalAI across multiple machines using **PostgreSQL** for state and node registry, and **NATS** for real-time coordination. Unlike the [P2P/federation approach](/features/distribute/), distributed mode is designed for production deployments and Kubernetes environments where you need centralized management, health monitoring, and deterministic routing.
+Distributed mode enables horizontal scaling of LocalAI across multiple machines using **PostgreSQL** for state and node registry, and **NATS** for real-time coordination. Unlike the [P2P/federation approach]({{% relref "features/distributed_inferencing" %}}), distributed mode is designed for production deployments and Kubernetes environments where you need centralized management, health monitoring, and deterministic routing.

 {{% notice note %}}
 Distributed mode requires authentication enabled with a **PostgreSQL** database — SQLite is not supported. This is because the node registry, job store, and other distributed state are stored in PostgreSQL tables.
--- a/docs/content/features/distributed_inferencing.md
+++ b/docs/content/features/distributed_inferencing.md
@@ -1,12 +1,12 @@
 +++
 disableToc = false
-title = "🆕🖧 Distributed Inference"
+title = "P2P / Federated Inference"
 weight = 15
 url = "/features/distribute/"
 +++

 {{% notice tip %}}
-Looking for production-grade horizontal scaling with PostgreSQL and NATS? See [Distributed Mode](/features/distributed-mode/).
+Looking for production-grade horizontal scaling with PostgreSQL and NATS? See [Distributed Mode]({{% relref "features/distributed-mode" %}}).
 {{% /notice %}}

 This functionality enables LocalAI to distribute inference requests across multiple worker nodes, improving efficiency and performance. Nodes are automatically discovered and connect to each other over P2P using a shared token, which ensures that communication between the nodes of the network is secure and private.
--- a/docs/content/features/distribution.md
+++ b/docs/content/features/distribution.md
@@ -0,0 +1,34 @@
+++
+disableToc = false
+title = "Distribution"
+weight = 13
+url = "/features/distribution/"
+++
+
+LocalAI supports distributing inference workloads across multiple machines. There are two approaches, each suited to different use cases:
+
+## Distributed Mode (PostgreSQL + NATS)
+
+Production-grade horizontal scaling with centralized management. Frontends are stateless LocalAI instances behind a load balancer; workers self-register and receive backends dynamically via NATS. State lives in PostgreSQL.
+
+**Best for:** production deployments, Kubernetes, managed infrastructure.
+
+[Read more]({{% relref "features/distributed-mode" %}})
+
+## P2P / Federated Inference
+
+Peer-to-peer networking via libp2p. Share a token to form a cluster with automatic discovery — no central server required. Supports federated load balancing and worker-mode weight sharding.
+
+**Best for:** ad-hoc clusters, community sharing, quick experimentation.
+
+[Read more]({{% relref "features/distributed_inferencing" %}})
+
+## Quick Comparison
+
+| | P2P / Federation | Distributed Mode |
+|---|---|---|
+| **Discovery** | Automatic via libp2p token | Self-registration to frontend URL |
+| **State storage** | In-memory / ledger | PostgreSQL |
+| **Coordination** | Gossip protocol | NATS messaging |
+| **Node management** | Automatic | REST API + WebUI |
+| **Setup complexity** | Minimal (share a token) | Requires PostgreSQL + NATS |
--- a/docs/content/features/embeddings.md
+++ b/docs/content/features/embeddings.md
@@ -1,6 +1,6 @@
 +++
 disableToc = false
-title = "🧠 Embeddings"
+title = "Embeddings"
 weight = 13
 url = "/features/embeddings/"
 +++
--- a/docs/content/features/gpt-vision.md
+++ b/docs/content/features/gpt-vision.md
@@ -1,7 +1,7 @@

 +++
 disableToc = false
-title = "🥽 GPT Vision"
+title = "GPT Vision"
 weight = 14
 url = "/features/gpt-vision/"
 +++
--- a/docs/content/features/image-generation.md
+++ b/docs/content/features/image-generation.md
@@ -1,7 +1,7 @@

 +++
 disableToc = false
-title = "🎨 Image generation"
+title = "Image Generation"
 weight = 12
 url = "/features/image-generation/"
 +++
--- a/docs/content/features/mcp.md
+++ b/docs/content/features/mcp.md
@@ -1,5 +1,5 @@
 +++
-title = "🔗 Model Context Protocol (MCP)"
+title = "Model Context Protocol (MCP)"
 weight = 20
 toc = true
 description = "Agentic capabilities with Model Context Protocol integration"
--- a/docs/content/features/model-gallery.md
+++ b/docs/content/features/model-gallery.md
@@ -1,7 +1,7 @@

 +++
 disableToc = false
-title = "🖼️ Model gallery"
+title = "Model Gallery"
 weight = 18
 url = '/models'
 +++
--- a/docs/content/features/object-detection.md
+++ b/docs/content/features/object-detection.md
@@ -1,6 +1,6 @@
 +++
 disableToc = false
-title = "🔍 Object detection"
+title = "Object Detection"
 weight = 13
 url = "/features/object-detection/"
 +++
--- a/docs/content/features/openai-functions.md
+++ b/docs/content/features/openai-functions.md
@@ -1,7 +1,7 @@

 +++
 disableToc = false
-title = "🔥 OpenAI functions and tools"
+title = "OpenAI Functions and Tools"
 weight = 17
 url = "/features/openai-functions/"
 +++
--- a/docs/content/features/reranker.md
+++ b/docs/content/features/reranker.md
@@ -1,7 +1,7 @@

 +++
 disableToc = false
-title = "📈 Reranker"
+title = "Reranker"
 weight = 11
 url = "/features/reranker/"
 +++
--- a/docs/content/features/runtime-settings.md
+++ b/docs/content/features/runtime-settings.md
@@ -1,6 +1,6 @@
 +++
 disableToc = false
-title = "⚙️ Runtime Settings"
+title = "Runtime Settings"
 weight = 25
 url = '/features/runtime-settings'
 +++
--- a/docs/content/features/stores.md
+++ b/docs/content/features/stores.md
@@ -1,7 +1,7 @@

 +++
 disableToc = false
-title = "💾 Stores"
+title = "Stores"
 weight = 18
 url = '/stores'
 +++
--- a/docs/content/features/text-generation.md
+++ b/docs/content/features/text-generation.md
@@ -1,7 +1,7 @@

 +++
 disableToc = false
-title = "📖 Text generation (GPT)"
+title = "Text Generation (GPT)"
 weight = 10
 url = "/features/text-generation/"
 +++
--- a/docs/content/features/text-to-audio.md
+++ b/docs/content/features/text-to-audio.md
@@ -1,7 +1,7 @@

 +++
 disableToc = false
-title = "🗣 Text to audio (TTS)"
+title = "Text to Audio (TTS)"
 weight = 11
 url = "/features/text-to-audio/"
 +++
--- a/docs/content/getting-started/quickstart.md
+++ b/docs/content/getting-started/quickstart.md
@@ -119,7 +119,7 @@ For production deployments or when you need more compute, LocalAI supports distr
 - **P2P federation**: Connect multiple LocalAI instances for load-balanced inference
 - **Model sharding**: Split large models across multiple machines

-See the **Nodes** page in the web interface or the [Distribution docs]({{% relref "features/distribute" %}}) for setup instructions.
+See the **Nodes** page in the web interface or the [Distribution docs]({{% relref "features/distribution" %}}) for setup instructions.

 ## What's Next?

--- a/docs/content/integrations.md
+++ b/docs/content/integrations.md
@@ -72,9 +72,10 @@ Feel free to open up a Pull request (by clicking at the "Edit page" below) to ge

 ### Home Automation

- [hass-openai-custom-conversation](https://github.com/drndos/hass-openai-custom-conversation) — Home Assistant integration
- [ha-llmvision](https://github.com/valentinfrlch/ha-llmvision) — Home Assistant LLM Vision
- [HA-LocalAI-Monitor](https://github.com/loryanstrant/HA-LocalAI-Monitor) — Home Assistant monitoring
+- [Extended OpenAI Conversation](https://github.com/jekalmin/extended_openai_conversation) — Conversation agent for Home Assistant that supports a custom OpenAI endpoint
+- [LLM Vision](https://github.com/valentinfrlch/ha-llmvision) — Image & video feed analysis for Home Assistant
+- [OpenAI TTS Speech Service](https://github.com/sfortis/openai_tts) — OpenAI TTS custom component for Home Assistant
+- [LocalAI Monitor](https://github.com/loryanstrant/HA-LocalAI-Monitor) — Monitor & control of LocalAI from Home Assistant
 - Nextcloud [integration plugin](https://apps.nextcloud.com/apps/integration_openai) and [AI assistant](https://apps.nextcloud.com/apps/assistant)

 ### Automation & DevOps
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,4 +1,38 @@
 ---
+- name: "qwen3.5-35b-a3b-apex"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/mudler/Qwen3.5-35B-A3B-APEX-GGUF
+  description: |
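+    GGUF build of Qwen3.5-35B-A3B-APEX (Quality variant) from mudler/Qwen3.5-35B-A3B-APEX-GGUF, packaged with an F16 multimodal projector (mmproj) and configured for chat with the llama-cpp backend.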
+  overrides:
+    backend: llama-cpp
+    function:
+      automatic_tool_parsing_fallback: true
+      grammar:
+        disable: true
+    known_usecases:
+      - chat
+    mmproj: llama-cpp/mmproj/Qwen3.5-35B-A3B-APEX-GGUF/mmproj-F16.gguf
+    options:
+      - use_jinja:true
+    parameters:
+      min_p: 0
+      model: llama-cpp/models/Qwen3.5-35B-A3B-APEX-GGUF/Qwen3.5-35B-A3B-APEX-Quality.gguf
+      presence_penalty: 1.5
+      repeat_penalty: 1
+      temperature: 0.7
+      top_k: 20
+      top_p: 0.8
+    template:
+      use_tokenizer_template: true
+  files:
+    - filename: llama-cpp/mmproj/Qwen3.5-35B-A3B-APEX-GGUF/mmproj-F16.gguf
+      sha256: a516ab92e8240da4734d68352bdfba84c16e830ee40010b8fac80d69c77272ff
+      uri: https://huggingface.co/mudler/Qwen3.5-35B-A3B-APEX-GGUF/resolve/main/mmproj-F16.gguf
+    - filename: llama-cpp/models/Qwen3.5-35B-A3B-APEX-GGUF/Qwen3.5-35B-A3B-APEX-Quality.gguf
+      sha256: 50887b60c77ee5c95bc3657814ae993abcab7b2d71868b9af1e84d6badd09a57
+      uri: https://huggingface.co/mudler/Qwen3.5-35B-A3B-APEX-GGUF/resolve/main/Qwen3.5-35B-A3B-APEX-Quality.gguf
 - name: "qwen_qwen3.5-35b-a3b"
  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
  urls:
--- a/scripts/patch_utils/apply_patches.sh
+++ b/scripts/patch_utils/apply_patches.sh
@@ -1,151 +0,0 @@
-#!/bin/bash
-# apply_patches.sh — Generic patch fetcher and applier for any backend.
-#
-# Usage: ./apply_patches.sh <source-dir> <target-dir>
-#
-#   <source-dir>  Directory containing a patches/ folder (with optional sources.yaml)
-#   <target-dir>  The cloned upstream repo to patch (e.g., llama.cpp/)
-#
-# Behavior (idempotent):
-#   1. If patches/sources.yaml exists and yq is available, for each source:
-#      - If patches/<name>/ already has .patch files: skip fetching (vendored)
-#      - Otherwise: clone the fork at a pinned SHA, diff against the pinned
-#        upstream SHA, and generate patches
-#   2. Apply all patches (skips already-applied ones)
-#   3. Fails fast on any patch application error
-#
-# sources.yaml fields:
-#   name         — subdirectory name for this source's patches
-#   repo         — fork git URL
-#   version_var  — Makefile variable holding the pinned fork commit SHA
-#   base_var     — Makefile variable holding the pinned upstream commit SHA
-#   version_file — Makefile path (relative to backend dir)
-
-set -e
-
-# Use /tmp for patch temp files to avoid macOS long-path issues
-export TMPDIR="${TMPDIR_OVERRIDE:-/tmp}"
-
-read_makefile_var() {
-    grep -m1 "^${1}?=" "$2" | cut -d'=' -f2
-}
-
-apply_one_patch() {
-    local target_dir="$1"
-    local patch_file="$2"
-    local label="$3"
-
-    if patch -d "$target_dir" -p1 --reverse --dry-run < "$patch_file" >/dev/null 2>&1; then
-        echo "  Already applied, skipping: $label"
-        return 0
-    fi
-
-    echo "  Applying: $label"
-    patch -d "$target_dir" -p1 --forward < "$patch_file" || { echo "FAILED: $patch_file"; exit 1; }
-}
-
-apply_patches() {
-    local SOURCE_DIR="$(cd "$1" && pwd)"
-    local TARGET_DIR="$2"
-    local PATCHES_DIR="$SOURCE_DIR/patches"
-
-    if [ ! -d "$PATCHES_DIR" ]; then
-        return 0
-    fi
-
-    # Phase 1: Generate missing patches from fork sources
-    if [ -f "$PATCHES_DIR/sources.yaml" ] && command -v yq &>/dev/null; then
-        local SOURCE_COUNT
-        SOURCE_COUNT=$(yq '.sources | length' "$PATCHES_DIR/sources.yaml")
-
-        for i in $(seq 0 $((SOURCE_COUNT - 1))); do
-            local NAME REPO VERSION_VAR BASE_VAR VERSION_FILE
-            NAME=$(yq ".sources[$i].name" "$PATCHES_DIR/sources.yaml")
-            REPO=$(yq ".sources[$i].repo" "$PATCHES_DIR/sources.yaml")
-            VERSION_VAR=$(yq ".sources[$i].version_var" "$PATCHES_DIR/sources.yaml")
-            BASE_VAR=$(yq ".sources[$i].base_var" "$PATCHES_DIR/sources.yaml")
-            VERSION_FILE=$(yq ".sources[$i].version_file" "$PATCHES_DIR/sources.yaml")
-
-            local MAKEFILE="$SOURCE_DIR/$VERSION_FILE"
-            local FORK_SHA BASE_SHA
-            FORK_SHA=$(read_makefile_var "$VERSION_VAR" "$MAKEFILE")
-            BASE_SHA=$(read_makefile_var "$BASE_VAR" "$MAKEFILE")
-
-            if [ -z "$FORK_SHA" ] || [ -z "$BASE_SHA" ]; then
-                echo "WARNING: Could not read $VERSION_VAR or $BASE_VAR from $MAKEFILE — skipping '$NAME'"
-                continue
-            fi
-
-            local SOURCE_PATCH_DIR="$PATCHES_DIR/$NAME"
-            local EXISTING
-            EXISTING=$(ls "$SOURCE_PATCH_DIR"/*.patch 2>/dev/null | wc -l)
-
-            if [ "$EXISTING" -gt 0 ]; then
-                echo "Patches [$NAME]: $EXISTING patches already present — skipping fetch."
-            else
-                echo "Patches [$NAME]: generating from $REPO"
-                echo "  base (upstream): ${BASE_SHA:0:12}"
-                echo "  head (fork):     ${FORK_SHA:0:12}"
-
-                local TMPDIR_CLONE
-                TMPDIR_CLONE=$(mktemp -d)
-
-                if git clone "$REPO" "$TMPDIR_CLONE/fork" 2>&1; then
-                    cd "$TMPDIR_CLONE/fork"
-
-                    # Fetch the pinned fork commit explicitly (best effort) in case the clone did not bring it in
-                    git fetch origin "$FORK_SHA" 2>&1 || true
-                    git checkout "$FORK_SHA" 2>&1
-
-                    # We need the base commit in the history to compute the diff.
-                    # If the fork is a real GitHub fork, it shares history with upstream.
-                    # Otherwise, fetch it explicitly.
-                    if ! git cat-file -e "$BASE_SHA" 2>/dev/null; then
-                        echo "  Base commit not in fork history — fetching from upstream"
-                        local UPSTREAM_URL
-                        # Read the optional upstream_repo field for this source; no default URL is assumed
-                        UPSTREAM_URL=$(yq ".sources[$i].upstream_repo // \"\"" "$PATCHES_DIR/sources.yaml")
-                        if [ -n "$UPSTREAM_URL" ] && [ "$UPSTREAM_URL" != "null" ]; then
-                            git remote add upstream "$UPSTREAM_URL" 2>/dev/null || true
-                            git fetch upstream 2>&1
-                        fi
-                    fi
-
-                    local PATCH_COUNT
-                    PATCH_COUNT=$(git rev-list --count "$BASE_SHA".."$FORK_SHA" 2>/dev/null || echo "0")
-                    echo "  $PATCH_COUNT commits in diff"
-
-                    if [ "$PATCH_COUNT" -gt 0 ]; then
-                        mkdir -p "$SOURCE_PATCH_DIR"
-                        git format-patch "$BASE_SHA".."$FORK_SHA" -o "$SOURCE_PATCH_DIR/" >/dev/null 2>&1
-                        echo "  Generated $PATCH_COUNT patches in patches/$NAME/"
-                    fi
-                    cd "$SOURCE_DIR"
-                else
-                    echo "WARNING: Failed to clone $REPO — skipping source '$NAME'"
-                fi
-
-                rm -rf "$TMPDIR_CLONE"
-            fi
-        done
-    elif [ -f "$PATCHES_DIR/sources.yaml" ]; then
-        echo "WARNING: yq not found — skipping source-based patch generation."
-    fi
-
-    # Phase 2: Apply patches (subdirectories first, then top-level)
-    for source_dir in $(find "$PATCHES_DIR" -mindepth 1 -maxdepth 1 -type d | sort); do
-        for p in $(ls "$source_dir"/*.patch 2>/dev/null | sort); do
-            apply_one_patch "$TARGET_DIR" "$p" "$(basename "$source_dir")/$(basename "$p")"
-        done
-    done
-    for p in $(ls "$PATCHES_DIR"/*.patch 2>/dev/null | sort); do
-        apply_one_patch "$TARGET_DIR" "$p" "$(basename "$p")"
-    done
-}
-
-# Run with arguments
-if [ $# -lt 2 ]; then
-    echo "Usage: $0 <source-dir> <target-dir>"
-    exit 1
-fi
-apply_patches "$1" "$2"
Author	SHA1	Message	Date
Ettore Di Giacinto	6e11f882f7	feat(turboquant.cpp): add new backend Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2026-04-03 20:57:15 +00:00
Ettore Di Giacinto	8577bdcebc	Update asset links in README.md Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>	2026-04-03 10:24:08 +02:00
Ettore Di Giacinto	0d489c7a0d	Add guided tour and update screenshots section Updated README to include a guided tour section with links to various assets and details about agents and usage metrics. Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>	2026-04-03 10:23:03 +02:00
Ettore Di Giacinto	11dc54bda9	fix(docs): commit distribution.md Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2026-04-03 10:14:13 +02:00
Ettore Di Giacinto	7e0b73deaa	fix(docs): fix broken references to distributed mode Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2026-04-03 09:46:06 +02:00
LocalAI [bot]	c0a023d13d	chore: ⬆️ Update ggml-org/llama.cpp to `a1cfb645307edc61a89e41557f290f441043d3c2` (#9203 ) ⬆️ Update ggml-org/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-04-03 08:30:15 +02:00
Loryan Strant	0d3ae1c295	docs: Update Home Assistant integrations list (#9206 ) Update Home Assistant integrations list Signed-off-by: Loryan Strant <51473494+loryanstrant@users.noreply.github.com>	2026-04-03 08:30:00 +02:00
LocalAI [bot]	e9f10f2f50	chore(model gallery): 🤖 add 1 new models via gallery agent (#9202 ) chore(model gallery): 🤖 add new models via gallery agent Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-04-02 21:22:19 +02:00
Ettore Di Giacinto	b95b0b72ff	chore(ci): fix gallery agent Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2026-04-02 18:02:18 +00:00
LocalAI [bot]	26f1b94f4d	chore: ⬆️ Update ggml-org/llama.cpp to `95a6ebabb277c4cc18247e7bc2a5502133caca63` (#9199 ) ⬆️ Update ggml-org/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-04-02 08:53:16 +02:00
LocalAI [bot]	2d40725ca2	chore: ⬆️ Update leejet/stable-diffusion.cpp to `87ecb95cbc65dc8e58e3d88f4f4a59a0939796f5` (#9200 ) ⬆️ Update leejet/stable-diffusion.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-04-02 08:53:04 +02:00