deterministic builds

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
refactor, macOS fixes
2026-05-24 08:38:02 -04:00 · 2026-04-01 19:45:31 +00:00 · 2026-04-01 19:42:16 +00:00 · 2026-04-01 17:57:03 +00:00
45 changed files with 270 additions and 436 deletions
--- a/.github/gallery-agent/agent.go
+++ b/.github/gallery-agent/agent.go
@@ -133,7 +133,6 @@ func getRealReadme(ctx context.Context, repository string) (string, error) {
 	result, err := cogito.ExecuteTools(llm, fragment,
 		cogito.WithIterations(3),
 		cogito.WithMaxAttempts(3),
-		cogito.DisableSinkState,
 		cogito.WithTools(&HFReadmeTool{client: hfapi.NewClient()}))
 	if err != nil {
 		return "", err
--- a/.github/gallery-agent/gallery.go
+++ b/.github/gallery-agent/gallery.go
@@ -79,20 +79,7 @@ func generateYAMLEntry(model ProcessedModel, quantization string) string {
 	description = cleanTextContent(description)
 	formattedDescription := formatTextContent(description)

-	// Strip name and description from config file since they are
-	// already present at the gallery entry level and should not
-	// appear under overrides.
-	configFileContent := modelConfig.ConfigFile
-	var cfgMap map[string]any
-	if err := yaml.Unmarshal([]byte(configFileContent), &cfgMap); err == nil {
-		delete(cfgMap, "name")
-		delete(cfgMap, "description")
-		if cleaned, err := yaml.Marshal(cfgMap); err == nil {
-			configFileContent = string(cleaned)
-		}
-	}
-
-	configFile := formatTextContent(configFileContent)
+	configFile := formatTextContent(modelConfig.ConfigFile)

 	filesYAML, _ := yaml.Marshal(modelConfig.Files)

--- a/.github/gallery-agent/testing.go
+++ b/.github/gallery-agent/testing.go
@@ -17,7 +17,7 @@ func runSyntheticMode() error {
 	fmt.Printf("Generating %d synthetic models for testing...\n", numModels)

 	var models []ProcessedModel
-	for range numModels {
+	for i := 0; i < numModels; i++ {
 		model := generator.GenerateProcessedModel()
 		models = append(models, model)
 		fmt.Printf("Generated synthetic model: %s\n", model.ModelID)
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
@@ -1828,98 +1828,6 @@ jobs:
            dockerfile: "./backend/Dockerfile.llama-cpp"
            context: "./"
            ubuntu-version: '2404'
-          # llama-cpp-tq (TurboQuant fork)
-          - build-type: ''
-            cuda-major-version: ""
-            cuda-minor-version: ""
-            platforms: 'linux/amd64,linux/arm64'
-            tag-latest: 'auto'
-            tag-suffix: '-cpu-llama-cpp-tq'
-            runs-on: 'bigger-runner'
-            base-image: "ubuntu:24.04"
-            skip-drivers: 'false'
-            backend: "llama-cpp-tq"
-            dockerfile: "./backend/Dockerfile.llama-cpp"
-            context: "./"
-            ubuntu-version: '2404'
-          - build-type: 'cublas'
-            cuda-major-version: "12"
-            cuda-minor-version: "8"
-            platforms: 'linux/amd64'
-            tag-latest: 'auto'
-            tag-suffix: '-gpu-nvidia-cuda-12-llama-cpp-tq'
-            runs-on: 'bigger-runner'
-            base-image: "ubuntu:24.04"
-            skip-drivers: 'false'
-            backend: "llama-cpp-tq"
-            dockerfile: "./backend/Dockerfile.llama-cpp"
-            context: "./"
-            ubuntu-version: '2404'
-          - build-type: 'cublas'
-            cuda-major-version: "13"
-            cuda-minor-version: "0"
-            platforms: 'linux/amd64'
-            tag-latest: 'auto'
-            tag-suffix: '-gpu-nvidia-cuda-13-llama-cpp-tq'
-            runs-on: 'ubuntu-latest'
-            base-image: "ubuntu:24.04"
-            skip-drivers: 'false'
-            backend: "llama-cpp-tq"
-            dockerfile: "./backend/Dockerfile.llama-cpp"
-            context: "./"
-            ubuntu-version: '2404'
-          - build-type: 'cublas'
-            cuda-major-version: "13"
-            cuda-minor-version: "0"
-            platforms: 'linux/arm64'
-            skip-drivers: 'false'
-            tag-latest: 'auto'
-            tag-suffix: '-nvidia-l4t-cuda-13-arm64-llama-cpp-tq'
-            base-image: "ubuntu:24.04"
-            runs-on: 'ubuntu-24.04-arm'
-            ubuntu-version: '2404'
-            backend: "llama-cpp-tq"
-            dockerfile: "./backend/Dockerfile.llama-cpp"
-            context: "./"
-          - build-type: 'hipblas'
-            cuda-major-version: ""
-            cuda-minor-version: ""
-            platforms: 'linux/amd64'
-            tag-latest: 'auto'
-            tag-suffix: '-gpu-rocm-hipblas-llama-cpp-tq'
-            runs-on: 'ubuntu-latest'
-            base-image: "rocm/dev-ubuntu-24.04:6.4.4"
-            skip-drivers: 'false'
-            backend: "llama-cpp-tq"
-            dockerfile: "./backend/Dockerfile.llama-cpp"
-            context: "./"
-            ubuntu-version: '2404'
-          - build-type: 'cublas'
-            cuda-major-version: "12"
-            cuda-minor-version: "0"
-            platforms: 'linux/arm64'
-            skip-drivers: 'false'
-            tag-latest: 'auto'
-            tag-suffix: '-nvidia-l4t-arm64-llama-cpp-tq'
-            base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
-            runs-on: 'ubuntu-24.04-arm'
-            backend: "llama-cpp-tq"
-            dockerfile: "./backend/Dockerfile.llama-cpp"
-            context: "./"
-            ubuntu-version: '2204'
-          - build-type: 'vulkan'
-            cuda-major-version: ""
-            cuda-minor-version: ""
-            platforms: 'linux/amd64,linux/arm64'
-            tag-latest: 'auto'
-            tag-suffix: '-gpu-vulkan-llama-cpp-tq'
-            runs-on: 'bigger-runner'
-            base-image: "ubuntu:24.04"
-            skip-drivers: 'false'
-            backend: "llama-cpp-tq"
-            dockerfile: "./backend/Dockerfile.llama-cpp"
-            context: "./"
-            ubuntu-version: '2404'
          # Stablediffusion-ggml
          - build-type: ''
            cuda-major-version: ""
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -15,10 +15,9 @@ jobs:
            branch: "master"
            file: "backend/cpp/llama-cpp/Makefile"
          - repository: "TheTom/llama-cpp-turboquant"
-            variable: "LLAMA_VERSION"
-            branch: "master"
-            file: "backend/cpp/llama-cpp-tq/Makefile"
-            branch_suffix: "-tq"
+            variable: "TURBOQUANT_VERSION"
+            branch: "feature/turboquant-kv-cache"
+            file: "backend/cpp/llama-cpp/Makefile"
          - repository: "ggml-org/whisper.cpp"
            variable: "WHISPER_CPP_VERSION"
            branch: "master"
@@ -65,9 +64,6 @@ jobs:
          push-to-fork: ci-forks/LocalAI
          commit-message: ':arrow_up: Update ${{ matrix.repository }}'
          title: 'chore: :arrow_up: Update ${{ matrix.repository }} to `${{ steps.bump.outputs.commit }}`'
-          branch: "update/${{ matrix.variable }}${{ matrix.branch_suffix }}"
+          branch: "update/${{ matrix.variable }}"
          body: ${{ steps.bump.outputs.message }}
          signoff: true
-
-
-
--- a/.github/workflows/gallery-agent.yaml
+++ b/.github/workflows/gallery-agent.yaml
@@ -55,7 +55,7 @@ jobs:
      - name: Run gallery agent
        env:
          #OPENAI_MODEL: ${{ secrets.OPENAI_MODEL }}
-          OPENAI_MODEL: Qwen3.5-2B-GGUF
+          OPENAI_MODEL: Qwen3.5-2B-GGUF
          OPENAI_BASE_URL: "http://localhost:8080"
          OPENAI_KEY: ${{ secrets.OPENAI_KEY }}
          #OPENAI_BASE_URL: ${{ secrets.OPENAI_BASE_URL }}
--- a/.gitignore
+++ b/.gitignore
@@ -9,7 +9,6 @@ prepare-sources
 /backend/cpp/llama-cpp/llama.cpp
 /backend/cpp/llama-*
 !backend/cpp/llama-cpp
-!backend/cpp/llama-cpp-tq
 /backends
 /backend-images
 /result.yaml
--- a/Makefile
+++ b/Makefile
@@ -544,9 +544,8 @@ backend-images:
 	mkdir -p backend-images

 # Backend metadata: BACKEND_NAME | DOCKERFILE_TYPE | BUILD_CONTEXT | PROGRESS_FLAG | NEEDS_BACKEND_ARG
-# llama-cpp and forks - use llama-cpp Dockerfile
+# llama-cpp is special - uses llama-cpp Dockerfile and doesn't need BACKEND arg
 BACKEND_LLAMA_CPP = llama-cpp|llama-cpp|.|false|false
-BACKEND_LLAMA_CPP_TQ = llama-cpp-tq|llama-cpp|.|false|true

 # Golang backends
 BACKEND_PIPER = piper|golang|.|false|true
@@ -610,7 +609,6 @@ endef

 # Generate all docker-build targets
 $(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP)))
-$(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP_TQ)))
 $(eval $(call generate-docker-build-target,$(BACKEND_PIPER)))
 $(eval $(call generate-docker-build-target,$(BACKEND_LOCAL_STORE)))
 $(eval $(call generate-docker-build-target,$(BACKEND_HUGGINGFACE)))
--- a/README.md
+++ b/README.md
@@ -42,38 +42,16 @@ Created and maintained by [Ettore Di Giacinto](https://github.com/mudler).

 > [:book: Documentation](https://localai.io/) | [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) | [💻 Quickstart](https://localai.io/basics/getting_started/) | [🖼️ Models](https://models.localai.io/) | [❓FAQ](https://localai.io/faq/)

-## Guided tour
+## Screenshots
+
+### Chat, Model gallery

 https://github.com/user-attachments/assets/08cbb692-57da-48f7-963d-2e7b43883c18

-<details>
-
-<summary>
-Click to see more!
-</summary>
-
-#### User and auth
-
-https://github.com/user-attachments/assets/228fa9ad-81a3-4d43-bfb9-31557e14a36c
-
-#### Agents
+### Agents

 https://github.com/user-attachments/assets/6270b331-e21d-4087-a540-6290006b381a

-#### Usage metrics per user
-
-https://github.com/user-attachments/assets/cbb03379-23b4-4e3d-bd26-d152f057007f
-
-#### Fine-tuning and Quantization
-
-https://github.com/user-attachments/assets/5ba4ace9-d3df-4795-b7d4-b0b404ea71ee
-
-#### WebRTC
-
-https://github.com/user-attachments/assets/ed88e34c-fed3-4b83-8a67-4716a9feeb7b
-
-</details>
-
 ## Quickstart

 ### macOS
--- a/backend/Dockerfile.llama-cpp
+++ b/backend/Dockerfile.llama-cpp
@@ -58,9 +58,7 @@ ARG CUDA_DOCKER_ARCH
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 ARG CMAKE_ARGS
 ENV CMAKE_ARGS=${CMAKE_ARGS}
-ARG BACKEND=llama-cpp
-ARG LLAMA_BACKEND_DIR=${BACKEND}
-ENV LLAMA_BACKEND_DIR=${LLAMA_BACKEND_DIR}
+ARG BACKEND=rerankers
 ARG BUILD_TYPE
 ENV BUILD_TYPE=${BUILD_TYPE}
 ARG CUDA_MAJOR_VERSION
@@ -257,27 +255,32 @@ if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
  CUDA_ARCH_ESC="${CUDA_DOCKER_ARCH//;/\\;}"
  export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
  echo "CMAKE_ARGS(env) = ${CMAKE_ARGS}"
-  rm -rf /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR}-*-build
+  rm -rf /LocalAI/backend/cpp/llama-cpp-*-build
 fi

-cd /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR}
-
 if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
-  make ARCH=aarch64 build-variants
+  cd /LocalAI/backend/cpp/llama-cpp
+  make llama-cpp-fallback
+  make llama-cpp-grpc
+  make llama-cpp-rpc-server
 else
-  make build-variants
+  cd /LocalAI/backend/cpp/llama-cpp
+  make llama-cpp-avx
+  make llama-cpp-avx2
+  make llama-cpp-avx512
+  make llama-cpp-fallback
+  make llama-cpp-grpc
+  make llama-cpp-rpc-server
 fi
 EOT


 # Copy libraries using a script to handle architecture differences
-RUN make -BC /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR} package
+RUN make -BC /LocalAI/backend/cpp/llama-cpp package


 FROM scratch

-ARG BACKEND=llama-cpp
-ARG LLAMA_BACKEND_DIR=${BACKEND}

 # Copy all available binaries (the build process only creates the appropriate ones for the target architecture)
-COPY --from=builder /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR}/package/. ./
+COPY --from=builder /LocalAI/backend/cpp/llama-cpp/package/. ./
--- a/backend/cpp/llama-cpp-tq/Makefile
+++ b/backend/cpp/llama-cpp-tq/Makefile
@@ -1,6 +0,0 @@
-LLAMA_VERSION?=master
-LLAMA_REPO?=https://github.com/TheTom/llama-cpp-turboquant
-BACKEND_NAME?=llama-cpp-tq
-SHARED_DIR?=$(CURDIR)/../llama-cpp
-
-include ../llama-cpp/Makefile
--- a/backend/cpp/llama-cpp/CMakeLists.txt
+++ b/backend/cpp/llama-cpp/CMakeLists.txt
@@ -59,11 +59,6 @@ add_library(hw_grpc_proto

 add_executable(${TARGET} grpc-server.cpp json.hpp httplib.h)

-# Enable autoparser support if the header exists (not present in all llama.cpp forks)
-if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/chat-auto-parser.h")
-    target_compile_definitions(${TARGET} PRIVATE HAS_AUTOPARSER)
-endif()
-
 target_include_directories(${TARGET} PRIVATE ../llava)
 target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})

--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,10 +1,8 @@

-LLAMA_VERSION?=a1cfb645307edc61a89e41557f290f441043d3c2
+LLAMA_VERSION?=0fcb3760b2b9a3a496ef14621a7e4dad7a8df90f
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
-BACKEND_NAME?=llama-cpp
-SHARED_DIR?=$(CURDIR)
-GRPC_SERVER_DIR?=tools/grpc-server
-SERVER_SOURCE_DIR?=tools/server
+
+TURBOQUANT_VERSION?=8ad0f00e9a38df6c29fc10363341dde300f92ae4

 CMAKE_ARGS?=
 BUILD_TYPE?=
@@ -71,17 +69,6 @@ ifeq ($(BUILD_TYPE),sycl_f32)
 		-DCMAKE_CXX_FLAGS="-fsycl"
 endif

-# Variants to build for each architecture (can be overridden by forks)
-X86_64_VARIANTS ?= llama-cpp-avx llama-cpp-avx2 llama-cpp-avx512 llama-cpp-fallback llama-cpp-grpc llama-cpp-rpc-server
-ARM64_VARIANTS ?= llama-cpp-fallback llama-cpp-grpc llama-cpp-rpc-server
-
-build-variants:
-ifeq ($(ARCH),aarch64)
-	@for v in $(ARM64_VARIANTS); do $(MAKE) $$v || exit 1; done
-else
-	@for v in $(X86_64_VARIANTS); do $(MAKE) $$v || exit 1; done
-endif
-
 INSTALLED_PACKAGES=$(CURDIR)/../grpc/installed_packages
 INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
 ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
@@ -105,42 +92,42 @@ else
 endif

 llama-cpp-avx2: llama.cpp
-	cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build
-	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build purge
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build purge
 	$(info ${GREEN}I llama-cpp build info:avx2${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="$(BACKEND_NAME)-avx2-build" build-llama-cpp-grpc-server
-	cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build/grpc-server llama-cpp-avx2
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx2-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build/grpc-server llama-cpp-avx2

 llama-cpp-avx512: llama.cpp
-	cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build
-	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build purge
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build purge
 	$(info ${GREEN}I llama-cpp build info:avx512${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="$(BACKEND_NAME)-avx512-build" build-llama-cpp-grpc-server
-	cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build/grpc-server llama-cpp-avx512
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx512-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build/grpc-server llama-cpp-avx512

 llama-cpp-avx: llama.cpp
-	cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build
-	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build purge
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build purge
 	$(info ${GREEN}I llama-cpp build info:avx${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="$(BACKEND_NAME)-avx-build" build-llama-cpp-grpc-server
-	cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build/grpc-server llama-cpp-avx
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-avx-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build/grpc-server llama-cpp-avx

 llama-cpp-fallback: llama.cpp
-	cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build
-	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build purge
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build purge
 	$(info ${GREEN}I llama-cpp build info:fallback${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="$(BACKEND_NAME)-fallback-build" build-llama-cpp-grpc-server
-	cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build/grpc-server llama-cpp-fallback
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-fallback-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build/grpc-server llama-cpp-fallback

 llama-cpp-grpc: llama.cpp
-	cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build
-	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build purge
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge
 	$(info ${GREEN}I llama-cpp build info:grpc${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="$(BACKEND_NAME)-grpc-build" build-llama-cpp-grpc-server
-	cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build/grpc-server llama-cpp-grpc
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-cpp-grpc-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/grpc-server llama-cpp-grpc

 llama-cpp-rpc-server: llama-cpp-grpc
-	cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server

 llama.cpp:
 	mkdir -p llama.cpp
@@ -148,30 +135,30 @@ llama.cpp:
 	git init && \
 	git remote add origin $(LLAMA_REPO)  && \
 	git fetch origin && \
-	(git checkout -b build $(LLAMA_VERSION) || git checkout -b build origin/$(LLAMA_VERSION)) && \
+	git checkout -b build $(LLAMA_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch

-llama.cpp/$(GRPC_SERVER_DIR): llama.cpp
-	mkdir -p llama.cpp/$(GRPC_SERVER_DIR)
-	SHARED_DIR=$(SHARED_DIR) SERVER_SOURCE_DIR=$(SERVER_SOURCE_DIR) GRPC_SERVER_DIR=$(GRPC_SERVER_DIR) bash $(SHARED_DIR)/prepare.sh
+llama.cpp/tools/grpc-server: llama.cpp
+	mkdir -p llama.cpp/tools/grpc-server
+	bash prepare.sh

 rebuild:
-	SHARED_DIR=$(SHARED_DIR) SERVER_SOURCE_DIR=$(SERVER_SOURCE_DIR) GRPC_SERVER_DIR=$(GRPC_SERVER_DIR) bash $(SHARED_DIR)/prepare.sh
+	bash prepare.sh
 	rm -rf grpc-server
 	$(MAKE) grpc-server

 package:
-	bash $(SHARED_DIR)/package.sh
+	bash package.sh

 purge:
 	rm -rf llama.cpp/build
-	rm -rf llama.cpp/$(GRPC_SERVER_DIR)
+	rm -rf llama.cpp/tools/grpc-server
 	rm -rf grpc-server

 clean: purge
 	rm -rf llama.cpp

-grpc-server: llama.cpp llama.cpp/$(GRPC_SERVER_DIR)
+grpc-server: llama.cpp llama.cpp/tools/grpc-server
 	@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 	+bash -c "source $(ONEAPI_VARS); \
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -17,9 +17,7 @@
 #include "backend.pb.h"
 #include "backend.grpc.pb.h"
 #include "common.h"
-#ifdef HAS_AUTOPARSER
 #include "chat-auto-parser.h"
-#endif
 #include <getopt.h>
 #include <grpcpp/ext/proto_server_reflection_plugin.h>
 #include <grpcpp/grpcpp.h>
@@ -2667,7 +2665,6 @@ public:
        
        response->set_rendered_template(rendered_template);

-#ifdef HAS_AUTOPARSER
        // Run differential template analysis to detect tool format markers
        if (params_base.use_jinja) {
            try {
@@ -2773,7 +2770,6 @@ public:
                SRV_WRN("ModelMetadata: failed to run autoparser analysis: %s\n", e.what());
            }
        }
-#endif

        return grpc::Status::OK;
    }
--- a/backend/cpp/llama-cpp/package.sh
+++ b/backend/cpp/llama-cpp/package.sh
@@ -5,21 +5,14 @@

 set -e

-# Use working directory (not script location) so forks that share this script work correctly
-CURDIR=$(pwd)
-SCRIPT_DIR=$(dirname "$(realpath $0)")
-REPO_ROOT="${SCRIPT_DIR}/../../.."
+CURDIR=$(dirname "$(realpath $0)")
+REPO_ROOT="${CURDIR}/../../.."

 # Create lib directory
 mkdir -p $CURDIR/package/lib

 cp -avrf $CURDIR/llama-cpp-* $CURDIR/package/
-# Copy run.sh — prefer local copy, fall back to shared dir (script location)
-if [ -f "$CURDIR/run.sh" ]; then
-    cp -rfv $CURDIR/run.sh $CURDIR/package/
-else
-    cp -rfv $SCRIPT_DIR/run.sh $CURDIR/package/
-fi
+cp -rfv $CURDIR/run.sh $CURDIR/package/

 # Detect architecture and copy appropriate libraries
 if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
--- a/backend/cpp/llama-cpp/patches/sources.yaml
+++ b/backend/cpp/llama-cpp/patches/sources.yaml
@@ -0,0 +1,14 @@
+# Patch sources for the llama-cpp backend.
+# Each source declares a fork whose commits are extracted as patches
+# and applied on top of upstream llama.cpp during the build.
+# See scripts/patch_utils/apply_patches.sh for the generic patch engine.
+#
+# version_var: Makefile variable with the pinned fork commit SHA
+# base_var:    Makefile variable with the upstream base commit SHA
+# Both are read from version_file (relative to backend dir) to compute the diff.
+sources:
+  - name: turboquant
+    repo: https://github.com/TheTom/llama-cpp-turboquant.git
+    version_var: TURBOQUANT_VERSION
+    base_var: LLAMA_VERSION
+    version_file: Makefile
--- a/backend/cpp/llama-cpp/prepare.sh
+++ b/backend/cpp/llama-cpp/prepare.sh
@@ -1,43 +1,26 @@
 #!/bin/bash
-
-SHARED_DIR="${SHARED_DIR:-.}"
-SERVER_SOURCE_DIR="${SERVER_SOURCE_DIR:-tools/server}"
-GRPC_SERVER_DIR="${GRPC_SERVER_DIR:-tools/grpc-server}"
-
-## Apply patches from the `patches` directory
-if [ -d "patches" ]; then
-    for patch in $(ls patches); do
-        echo "Applying patch $patch"
-        patch -d llama.cpp/ -p1 < patches/$patch
-    done
-fi
-
 set -e

-# Copy server source files into grpc-server build directory
-for file in $(ls llama.cpp/${SERVER_SOURCE_DIR}/); do
-    cp -rfv llama.cpp/${SERVER_SOURCE_DIR}/$file llama.cpp/${GRPC_SERVER_DIR}/
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+REPO_ROOT="$SCRIPT_DIR/../../.."
+
+## Apply patches from sources and/or local .patch files
+"$REPO_ROOT/scripts/patch_utils/apply_patches.sh" "$SCRIPT_DIR" llama.cpp
+
+## Copy server files into grpc-server build directory
+for file in $(ls llama.cpp/tools/server/); do
+    cp -rfv llama.cpp/tools/server/$file llama.cpp/tools/grpc-server/
 done

-# Copy build files — prefer local overrides, fall back to SHARED_DIR
-for f in CMakeLists.txt grpc-server.cpp; do
-    if [ -f "$f" ]; then
-        cp -r "$f" llama.cpp/${GRPC_SERVER_DIR}/
-    else
-        cp -r "$SHARED_DIR/$f" llama.cpp/${GRPC_SERVER_DIR}/
-    fi
-done
-
-cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/${GRPC_SERVER_DIR}/
-cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/${GRPC_SERVER_DIR}/
-
-# Add grpc-server subdirectory to the parent CMakeLists.txt
-PARENT_CMAKELISTS="llama.cpp/$(dirname ${GRPC_SERVER_DIR})/CMakeLists.txt"
+cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
+cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
+cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/tools/grpc-server/
+cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/tools/grpc-server/

 set +e
-if grep -q "grpc-server" "$PARENT_CMAKELISTS"; then
+if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then
    echo "grpc-server already added"
 else
-    echo "add_subdirectory(grpc-server)" >> "$PARENT_CMAKELISTS"
+    echo "add_subdirectory(grpc-server)" >> llama.cpp/tools/CMakeLists.txt
 fi
 set -e
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=87ecb95cbc65dc8e58e3d88f4f4a59a0939796f5
+STABLEDIFFUSION_GGML_VERSION?=09b12d5f6d51d862749e8e0ee8baac8f012089e2

 CMAKE_ARGS+=-DGGML_MAX_NAME=128

--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -29,34 +29,6 @@
    nvidia-cuda-12: "cuda12-llama-cpp"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-llama-cpp"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp"
-- &llamacpp_tq
-  name: "llama-cpp-tq"
-  alias: "llama-cpp-tq"
-  license: mit
-  description: |
-    TurboQuant llama.cpp fork - quantization research
-  urls:
-    - https://github.com/TheTom/llama-cpp-turboquant
-  tags:
-    - text-to-text
-    - LLM
-    - CPU
-    - GPU
-    - Metal
-    - CUDA
-    - HIP
-  capabilities:
-    default: "cpu-llama-cpp-tq"
-    nvidia: "cuda12-llama-cpp-tq"
-    intel: "intel-sycl-f16-llama-cpp-tq"
-    amd: "rocm-llama-cpp-tq"
-    metal: "metal-llama-cpp-tq"
-    vulkan: "vulkan-llama-cpp-tq"
-    nvidia-l4t: "nvidia-l4t-arm64-llama-cpp-tq"
-    nvidia-cuda-13: "cuda13-llama-cpp-tq"
-    nvidia-cuda-12: "cuda12-llama-cpp-tq"
-    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-llama-cpp-tq"
-    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp-tq"
 - &whispercpp
  name: "whisper"
  alias: "whisper"
@@ -1280,57 +1252,6 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-llama-cpp"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-13-llama-cpp
-# llama-cpp-tq (TurboQuant) concrete backends
-- !!merge <<: *llamacpp_tq
-  name: "cpu-llama-cpp-tq"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-llama-cpp-tq"
-  mirrors:
-    - localai/localai-backends:latest-cpu-llama-cpp-tq
-- !!merge <<: *llamacpp_tq
-  name: "cuda12-llama-cpp-tq"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-llama-cpp-tq"
-  mirrors:
-    - localai/localai-backends:latest-gpu-nvidia-cuda-12-llama-cpp-tq
-- !!merge <<: *llamacpp_tq
-  name: "cuda13-llama-cpp-tq"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-llama-cpp-tq"
-  mirrors:
-    - localai/localai-backends:latest-gpu-nvidia-cuda-13-llama-cpp-tq
-- !!merge <<: *llamacpp_tq
-  name: "rocm-llama-cpp-tq"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-llama-cpp-tq"
-  mirrors:
-    - localai/localai-backends:latest-gpu-rocm-hipblas-llama-cpp-tq
-- !!merge <<: *llamacpp_tq
-  name: "intel-sycl-f16-llama-cpp-tq"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-llama-cpp-tq"
-  mirrors:
-    - localai/localai-backends:latest-gpu-intel-sycl-f16-llama-cpp-tq
-- !!merge <<: *llamacpp_tq
-  name: "intel-sycl-f32-llama-cpp-tq"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-llama-cpp-tq"
-  mirrors:
-    - localai/localai-backends:latest-gpu-intel-sycl-f32-llama-cpp-tq
-- !!merge <<: *llamacpp_tq
-  name: "vulkan-llama-cpp-tq"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-llama-cpp-tq"
-  mirrors:
-    - localai/localai-backends:latest-gpu-vulkan-llama-cpp-tq
-- !!merge <<: *llamacpp_tq
-  name: "metal-llama-cpp-tq"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-llama-cpp-tq"
-  mirrors:
-    - localai/localai-backends:latest-metal-darwin-arm64-llama-cpp-tq
-- !!merge <<: *llamacpp_tq
-  name: "nvidia-l4t-arm64-llama-cpp-tq"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-llama-cpp-tq"
-  mirrors:
-    - localai/localai-backends:latest-nvidia-l4t-arm64-llama-cpp-tq
-- !!merge <<: *llamacpp_tq
-  name: "cuda13-nvidia-l4t-arm64-llama-cpp-tq"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-llama-cpp-tq"
-  mirrors:
-    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-llama-cpp-tq
 ## whisper
 - !!merge <<: *whispercpp
  name: "nvidia-l4t-arm64-whisper"
--- a/docs/content/features/GPU-acceleration.md
+++ b/docs/content/features/GPU-acceleration.md
@@ -1,6 +1,6 @@
 +++
 disableToc = false
-title = "GPU Acceleration"
+title = "⚡ GPU acceleration"
 weight = 9
 url = "/features/gpu-acceleration/"
 +++
--- a/docs/content/features/_index.en.md
+++ b/docs/content/features/_index.en.md
@@ -27,7 +27,8 @@ LocalAI provides a comprehensive set of features for running AI models locally.
 - **[Realtime API](openai-realtime/)** - Low-latency multi-modal conversations (voice+text) over WebSocket
 - **[Constrained Grammars](constrained_grammars/)** - Control model output format with BNF grammars
 - **[GPU Acceleration](GPU-acceleration/)** - Optimize performance with GPU support
-- **[Distribution](distribution/)** - Scale inference across multiple nodes (P2P federation or production distributed mode)
+- **[Distributed Inference](distributed_inferencing/)** - Scale inference across multiple nodes
+- **[Distributed Mode](distributed-mode/)** - Horizontal scaling with PostgreSQL, NATS, and remote backend nodes
 - **[P2P API](p2p/)** - Monitor and manage P2P worker and federated nodes
 - **[Model Context Protocol (MCP)](mcp/)** - Enable agentic capabilities with MCP integration
 - **[Agents](agents/)** - Autonomous AI agents with tools, knowledge base, and skills
--- a/docs/content/features/agents.md
+++ b/docs/content/features/agents.md
@@ -1,6 +1,6 @@
 +++
 disableToc = false
-title = "Agents"
+title = "🤖 Agents"
 weight = 21
 url = '/features/agents'
 +++
--- a/docs/content/features/audio-to-text.md
+++ b/docs/content/features/audio-to-text.md
@@ -1,6 +1,6 @@
 +++
 disableToc = false
-title = "Audio to Text"
+title = "🔈 Audio to text"
 weight = 16
 url = "/features/audio-to-text/"
 +++
--- a/docs/content/features/authentication.md
+++ b/docs/content/features/authentication.md
@@ -1,6 +1,6 @@
 +++
 disableToc = false
-title = "Authentication & Authorization"
+title = "🔐 Authentication & Authorization"
 weight = 26
 url = '/features/authentication'
 +++
--- a/docs/content/features/backends.md
+++ b/docs/content/features/backends.md
@@ -1,5 +1,5 @@
 ---
-title: "Backends"
+title: "⚙️ Backends"
 description: "Learn how to use, manage, and develop backends in LocalAI"
 weight: 4
 url: "/backends/"
--- a/docs/content/features/constrained_grammars.md
+++ b/docs/content/features/constrained_grammars.md
@@ -1,6 +1,6 @@
 +++
 disableToc = false
-title = "Constrained Grammars"
+title = "✍️ Constrained Grammars"
 weight = 15
 url = "/features/constrained_grammars/"
 +++
--- a/docs/content/features/distributed-mode.md
+++ b/docs/content/features/distributed-mode.md
@@ -5,7 +5,7 @@ weight = 14
 url = "/features/distributed-mode/"
 +++

-Distributed mode enables horizontal scaling of LocalAI across multiple machines using **PostgreSQL** for state and node registry, and **NATS** for real-time coordination. Unlike the [P2P/federation approach]({{% relref "features/distributed_inferencing" %}}), distributed mode is designed for production deployments and Kubernetes environments where you need centralized management, health monitoring, and deterministic routing.
+Distributed mode enables horizontal scaling of LocalAI across multiple machines using **PostgreSQL** for state and node registry, and **NATS** for real-time coordination. Unlike the [P2P/federation approach](/features/distribute/), distributed mode is designed for production deployments and Kubernetes environments where you need centralized management, health monitoring, and deterministic routing.

 {{% notice note %}}
 Distributed mode requires authentication enabled with a **PostgreSQL** database — SQLite is not supported. This is because the node registry, job store, and other distributed state are stored in PostgreSQL tables.
--- a/docs/content/features/distributed_inferencing.md
+++ b/docs/content/features/distributed_inferencing.md
@@ -1,12 +1,12 @@
 +++
 disableToc = false
-title = "P2P / Federated Inference"
+title = "🆕🖧 Distributed Inference"
 weight = 15
 url = "/features/distribute/"
 +++

 {{% notice tip %}}
-Looking for production-grade horizontal scaling with PostgreSQL and NATS? See [Distributed Mode]({{% relref "features/distributed-mode" %}}).
+Looking for production-grade horizontal scaling with PostgreSQL and NATS? See [Distributed Mode](/features/distributed-mode/).
 {{% /notice %}}

 This functionality enables LocalAI to distribute inference requests across multiple worker nodes, improving efficiency and performance. Nodes are automatically discovered and connect via p2p by using a shared token which makes sure the communication is secure and private between the nodes of the network.
--- a/docs/content/features/distribution.md
+++ b/docs/content/features/distribution.md
@@ -1,34 +0,0 @@
-+++
-disableToc = false
-title = "Distribution"
-weight = 13
-url = "/features/distribution/"
-+++
-
-LocalAI supports distributing inference workloads across multiple machines. There are two approaches, each suited to different use cases:
-
-## Distributed Mode (PostgreSQL + NATS)
-
-Production-grade horizontal scaling with centralized management. Frontends are stateless LocalAI instances behind a load balancer; workers self-register and receive backends dynamically via NATS. State lives in PostgreSQL.
-
-**Best for:** production deployments, Kubernetes, managed infrastructure.
-
-[Read more]({{% relref "features/distributed-mode" %}})
-
-## P2P / Federated Inference
-
-Peer-to-peer networking via libp2p. Share a token to form a cluster with automatic discovery — no central server required. Supports federated load balancing and worker-mode weight sharding.
-
-**Best for:** ad-hoc clusters, community sharing, quick experimentation.
-
-[Read more]({{% relref "features/distributed_inferencing" %}})
-
-## Quick Comparison
-
-| | P2P / Federation | Distributed Mode |
-|---|---|---|
-| **Discovery** | Automatic via libp2p token | Self-registration to frontend URL |
-| **State storage** | In-memory / ledger | PostgreSQL |
-| **Coordination** | Gossip protocol | NATS messaging |
-| **Node management** | Automatic | REST API + WebUI |
-| **Setup complexity** | Minimal (share a token) | Requires PostgreSQL + NATS |
--- a/docs/content/features/embeddings.md
+++ b/docs/content/features/embeddings.md
@@ -1,6 +1,6 @@
 +++
 disableToc = false
-title = "Embeddings"
+title = "🧠 Embeddings"
 weight = 13
 url = "/features/embeddings/"
 +++
--- a/docs/content/features/gpt-vision.md
+++ b/docs/content/features/gpt-vision.md
@@ -1,7 +1,7 @@

 +++
 disableToc = false
-title = "GPT Vision"
+title = "🥽 GPT Vision"
 weight = 14
 url = "/features/gpt-vision/"
 +++
--- a/docs/content/features/image-generation.md
+++ b/docs/content/features/image-generation.md
@@ -1,7 +1,7 @@

 +++
 disableToc = false
-title = "Image Generation"
+title = "🎨 Image generation"
 weight = 12
 url = "/features/image-generation/"
 +++
--- a/docs/content/features/mcp.md
+++ b/docs/content/features/mcp.md
@@ -1,5 +1,5 @@
 +++
-title = "Model Context Protocol (MCP)"
+title = "🔗 Model Context Protocol (MCP)"
 weight = 20
 toc = true
 description = "Agentic capabilities with Model Context Protocol integration"
--- a/docs/content/features/model-gallery.md
+++ b/docs/content/features/model-gallery.md
@@ -1,7 +1,7 @@

 +++
 disableToc = false
-title = "Model Gallery"
+title = "🖼️ Model gallery"
 weight = 18
 url = '/models'
 +++
--- a/docs/content/features/object-detection.md
+++ b/docs/content/features/object-detection.md
@@ -1,6 +1,6 @@
 +++
 disableToc = false
-title = "Object Detection"
+title = "🔍 Object detection"
 weight = 13
 url = "/features/object-detection/"
 +++
--- a/docs/content/features/openai-functions.md
+++ b/docs/content/features/openai-functions.md
@@ -1,7 +1,7 @@

 +++
 disableToc = false
-title = "OpenAI Functions and Tools"
+title = "🔥 OpenAI functions and tools"
 weight = 17
 url = "/features/openai-functions/"
 +++
--- a/docs/content/features/reranker.md
+++ b/docs/content/features/reranker.md
@@ -1,7 +1,7 @@

 +++
 disableToc = false
-title = "Reranker"
+title = "📈 Reranker"
 weight = 11
 url = "/features/reranker/"
 +++
--- a/docs/content/features/runtime-settings.md
+++ b/docs/content/features/runtime-settings.md
@@ -1,6 +1,6 @@
 +++
 disableToc = false
-title = "Runtime Settings"
+title = "⚙️ Runtime Settings"
 weight = 25
 url = '/features/runtime-settings'
 +++
--- a/docs/content/features/stores.md
+++ b/docs/content/features/stores.md
@@ -1,7 +1,7 @@

 +++
 disableToc = false
-title = "Stores"
+title = "💾 Stores"
 weight = 18
 url = '/stores'
 +++
--- a/docs/content/features/text-generation.md
+++ b/docs/content/features/text-generation.md
@@ -1,7 +1,7 @@

 +++
 disableToc = false
-title = "Text Generation (GPT)"
+title = "📖 Text generation (GPT)"
 weight = 10
 url = "/features/text-generation/"
 +++
--- a/docs/content/features/text-to-audio.md
+++ b/docs/content/features/text-to-audio.md
@@ -1,7 +1,7 @@

 +++
 disableToc = false
-title = "Text to Audio (TTS)"
+title = "🗣 Text to audio (TTS)"
 weight = 11
 url = "/features/text-to-audio/"
 +++
--- a/docs/content/getting-started/quickstart.md
+++ b/docs/content/getting-started/quickstart.md
@@ -119,7 +119,7 @@ For production deployments or when you need more compute, LocalAI supports distr
 - **P2P federation**: Connect multiple LocalAI instances for load-balanced inference
 - **Model sharding**: Split large models across multiple machines

-See the **Nodes** page in the web interface or the [Distribution docs]({{% relref "features/distribution" %}}) for setup instructions.
+See the **Nodes** page in the web interface or the [Distribution docs]({{% relref "features/distributed_inferencing" %}}) for setup instructions.

 ## What's Next?

--- a/docs/content/integrations.md
+++ b/docs/content/integrations.md
@@ -72,10 +72,9 @@ Feel free to open up a Pull request (by clicking at the "Edit page" below) to ge

 ### Home Automation

- [Extended OpenAI Conversation](https://github.com/jekalmin/extended_openai_conversation) — Conversation agent for Home Assistant that supports a custom OpenAI endpoint
- [LLM Vision](https://github.com/valentinfrlch/ha-llmvision) — Image & video feed analysis for Home Assistant
- [OpenAI TTS Speech Service](https://github.com/sfortis/openai_tts) - OpenAI TTS custom component for Home Assistant
- [LocalAI Monitor](https://github.com/loryanstrant/HA-LocalAI-Monitor) — Monitor & control of LocalAI from Home Assistant
+- [hass-openai-custom-conversation](https://github.com/drndos/hass-openai-custom-conversation) — Home Assistant integration
+- [ha-llmvision](https://github.com/valentinfrlch/ha-llmvision) — Home Assistant LLM Vision
+- [HA-LocalAI-Monitor](https://github.com/loryanstrant/HA-LocalAI-Monitor) — Home Assistant monitoring
 - Nextcloud [integration plugin](https://apps.nextcloud.com/apps/integration_openai) and [AI assistant](https://apps.nextcloud.com/apps/assistant)

 ### Automation & DevOps
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,38 +1,4 @@
 ---
- name: "qwen3.5-35b-a3b-apex"
-  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
-  urls:
-    - https://huggingface.co/mudler/Qwen3.5-35B-A3B-APEX-GGUF
-  description: |
-    Describe the model in a clear and concise way that can be shared in a model gallery.
-  overrides:
-    backend: llama-cpp
-    function:
-      automatic_tool_parsing_fallback: true
-      grammar:
-        disable: true
-    known_usecases:
-      - chat
-    mmproj: llama-cpp/mmproj/Qwen3.5-35B-A3B-APEX-GGUF/mmproj-F16.gguf
-    options:
-      - use_jinja:true
-    parameters:
-      min_p: 0
-      model: llama-cpp/models/Qwen3.5-35B-A3B-APEX-GGUF/Qwen3.5-35B-A3B-APEX-Quality.gguf
-      presence_penalty: 1.5
-      repeat_penalty: 1
-      temperature: 0.7
-      top_k: 20
-      top_p: 0.8
-    template:
-      use_tokenizer_template: true
-  files:
-    - filename: llama-cpp/mmproj/Qwen3.5-35B-A3B-APEX-GGUF/mmproj-F16.gguf
-      sha256: a516ab92e8240da4734d68352bdfba84c16e830ee40010b8fac80d69c77272ff
-      uri: https://huggingface.co/mudler/Qwen3.5-35B-A3B-APEX-GGUF/resolve/main/mmproj-F16.gguf
-    - filename: llama-cpp/models/Qwen3.5-35B-A3B-APEX-GGUF/Qwen3.5-35B-A3B-APEX-Quality.gguf
-      sha256: 50887b60c77ee5c95bc3657814ae993abcab7b2d71868b9af1e84d6badd09a57
-      uri: https://huggingface.co/mudler/Qwen3.5-35B-A3B-APEX-GGUF/resolve/main/Qwen3.5-35B-A3B-APEX-Quality.gguf
 - name: "qwen_qwen3.5-35b-a3b"
  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
  urls:
--- a/scripts/patch_utils/apply_patches.sh
+++ b/scripts/patch_utils/apply_patches.sh
@@ -0,0 +1,151 @@
+#!/bin/bash
+# apply_patches.sh — Generic patch fetcher and applier for any backend.
+#
+# Usage: ./apply_patches.sh <source-dir> <target-dir>
+#
+#   <source-dir>  Directory containing a patches/ folder (with optional sources.yaml)
+#   <target-dir>  The cloned upstream repo to patch (e.g., llama.cpp/)
+#
+# Behavior (idempotent):
+#   1. If patches/sources.yaml exists and yq is available, for each source:
+#      - If patches/<name>/ already has .patch files: skip fetching (vendored)
+#      - Otherwise: clone the fork at a pinned SHA, diff against the pinned
+#        upstream SHA, and generate patches
+#   2. Apply all patches (skips already-applied ones)
+#   3. Fails fast on any patch application error
+#
+# sources.yaml fields:
+#   name         — subdirectory name for this source's patches
+#   repo         — fork git URL
+#   version_var  — Makefile variable holding the pinned fork commit SHA
+#   base_var     — Makefile variable holding the pinned upstream commit SHA
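+#   version_file — Makefile path (relative to <source-dir>)
+#   upstream_repo — optional upstream git URL; fetched when the base SHA is absent from the fork clone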
+
+set -e
+
+# Use /tmp for patch temp files to avoid macOS long-path issues
+export TMPDIR="${TMPDIR_OVERRIDE:-/tmp}"
+
+read_makefile_var() {
+    grep -m1 "^${1}?=" "$2" | cut -d'=' -f2
+}
+
+apply_one_patch() {
+    local target_dir="$1"
+    local patch_file="$2"
+    local label="$3"
+
+    if patch -d "$target_dir" -p1 --reverse --dry-run < "$patch_file" >/dev/null 2>&1; then
+        echo "  Already applied, skipping: $label"
+        return 0
+    fi
+
+    echo "  Applying: $label"
+    patch -d "$target_dir" -p1 --forward < "$patch_file" || { echo "FAILED: $patch_file"; exit 1; }
+}
+
+apply_patches() {
+    local SOURCE_DIR="$(cd "$1" && pwd)"
+    local TARGET_DIR="$2"
+    local PATCHES_DIR="$SOURCE_DIR/patches"
+
+    if [ ! -d "$PATCHES_DIR" ]; then
+        return 0
+    fi
+
+    # Phase 1: Generate missing patches from fork sources
+    if [ -f "$PATCHES_DIR/sources.yaml" ] && command -v yq &>/dev/null; then
+        local SOURCE_COUNT
+        SOURCE_COUNT=$(yq '.sources | length' "$PATCHES_DIR/sources.yaml")
+
+        for i in $(seq 0 $((SOURCE_COUNT - 1))); do
+            local NAME REPO VERSION_VAR BASE_VAR VERSION_FILE
+            NAME=$(yq ".sources[$i].name" "$PATCHES_DIR/sources.yaml")
+            REPO=$(yq ".sources[$i].repo" "$PATCHES_DIR/sources.yaml")
+            VERSION_VAR=$(yq ".sources[$i].version_var" "$PATCHES_DIR/sources.yaml")
+            BASE_VAR=$(yq ".sources[$i].base_var" "$PATCHES_DIR/sources.yaml")
+            VERSION_FILE=$(yq ".sources[$i].version_file" "$PATCHES_DIR/sources.yaml")
+
+            local MAKEFILE="$SOURCE_DIR/$VERSION_FILE"
+            local FORK_SHA BASE_SHA
+            FORK_SHA=$(read_makefile_var "$VERSION_VAR" "$MAKEFILE")
+            BASE_SHA=$(read_makefile_var "$BASE_VAR" "$MAKEFILE")
+
+            if [ -z "$FORK_SHA" ] || [ -z "$BASE_SHA" ]; then
+                echo "WARNING: Could not read $VERSION_VAR or $BASE_VAR from $MAKEFILE — skipping '$NAME'"
+                continue
+            fi
+
+            local SOURCE_PATCH_DIR="$PATCHES_DIR/$NAME"
+            local EXISTING
+            EXISTING=$(ls "$SOURCE_PATCH_DIR"/*.patch 2>/dev/null | wc -l)
+
+            if [ "$EXISTING" -gt 0 ]; then
+                echo "Patches [$NAME]: $EXISTING patches already present — skipping fetch."
+            else
+                echo "Patches [$NAME]: generating from $REPO"
+                echo "  base (upstream): ${BASE_SHA:0:12}"
+                echo "  head (fork):     ${FORK_SHA:0:12}"
+
+                local TMPDIR_CLONE
+                TMPDIR_CLONE=$(mktemp -d)
+
+                if git clone "$REPO" "$TMPDIR_CLONE/fork" 2>&1; then
+                    cd "$TMPDIR_CLONE/fork"
+
+                    # Fetch the upstream base commit (may not be in the fork's history)
+                    git fetch origin "$FORK_SHA" 2>&1 || true
+                    git checkout "$FORK_SHA" 2>&1
+
+                    # We need the base commit in the history to compute the diff.
+                    # If the fork is a real GitHub fork, it shares history with upstream.
+                    # Otherwise, fetch it explicitly.
+                    if ! git cat-file -e "$BASE_SHA" 2>/dev/null; then
+                        echo "  Base commit not in fork history — fetching from upstream"
+                        local UPSTREAM_URL
+                        # Derive upstream URL from base_var context or use llama.cpp default
+                        UPSTREAM_URL=$(yq ".sources[$i].upstream_repo // \"\"" "$PATCHES_DIR/sources.yaml")
+                        if [ -n "$UPSTREAM_URL" ] && [ "$UPSTREAM_URL" != "null" ]; then
+                            git remote add upstream "$UPSTREAM_URL" 2>/dev/null || true
+                            git fetch upstream 2>&1
+                        fi
+                    fi
+
+                    local PATCH_COUNT
+                    PATCH_COUNT=$(git rev-list --count "$BASE_SHA".."$FORK_SHA" 2>/dev/null || echo "0")
+                    echo "  $PATCH_COUNT commits in diff"
+
+                    if [ "$PATCH_COUNT" -gt 0 ]; then
+                        mkdir -p "$SOURCE_PATCH_DIR"
+                        git format-patch "$BASE_SHA".."$FORK_SHA" -o "$SOURCE_PATCH_DIR/" >/dev/null 2>&1
+                        echo "  Generated $PATCH_COUNT patches in patches/$NAME/"
+                    fi
+                    cd "$SOURCE_DIR"
+                else
+                    echo "WARNING: Failed to clone $REPO — skipping source '$NAME'"
+                fi
+
+                rm -rf "$TMPDIR_CLONE"
+            fi
+        done
+    elif [ -f "$PATCHES_DIR/sources.yaml" ]; then
+        echo "WARNING: yq not found — skipping source-based patch generation."
+    fi
+
+    # Phase 2: Apply patches (subdirectories first, then top-level)
+    for source_dir in $(find "$PATCHES_DIR" -mindepth 1 -maxdepth 1 -type d | sort); do
+        for p in $(ls "$source_dir"/*.patch 2>/dev/null | sort); do
+            apply_one_patch "$TARGET_DIR" "$p" "$(basename "$source_dir")/$(basename "$p")"
+        done
+    done
+    for p in $(ls "$PATCHES_DIR"/*.patch 2>/dev/null | sort); do
+        apply_one_patch "$TARGET_DIR" "$p" "$(basename "$p")"
+    done
+}
+
+# Entry point: both positional arguments are mandatory; otherwise print the
+# usage line and exit with status 1.
+if [ $# -ge 2 ]; then
+    apply_patches "$1" "$2"
+else
+    echo "Usage: $0 <source-dir> <target-dir>"
+    exit 1
+fi