Compare commits

...

81 Commits

Author SHA1 Message Date
Ettore Di Giacinto
8fb95686af chore(model gallery): add ibm-granite_granite-4.0-micro (#6376)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-10-03 10:03:34 +02:00
Ettore Di Giacinto
4132085c01 chore(model gallery): add ibm-granite_granite-4.0-h-micro (#6375)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-10-03 09:32:20 +02:00
Ettore Di Giacinto
c14f1ffcfd chore(model gallery): add ibm-granite_granite-4.0-h-tiny (#6374)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-10-03 09:31:00 +02:00
Ettore Di Giacinto
07cca4b69a chore(model gallery): add ibm-granite_granite-4.0-h-small (#6373)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-10-03 09:28:57 +02:00
LocalAI [bot]
dd927c36f6 chore: ⬆️ Update ggml-org/llama.cpp to d64c8104f090b27b1f99e8da5995ffcfa6b726e2 (#6371)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-10-02 21:09:00 +00:00
LocalAI [bot]
052f42e926 chore: ⬆️ Update ggml-org/llama.cpp to 1fe4e38cc20af058ed320bd46cac934991190056 (#6368)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-10-02 16:29:57 +02:00
LocalAI [bot]
30d43588ab chore: ⬆️ Update ggml-org/whisper.cpp to 7849aff7a2e1f4234aa31b01a1870906d5431959 (#6367)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-10-01 21:15:28 +00:00
LocalAI [bot]
d21ec22f74 chore: ⬆️ Update ggml-org/whisper.cpp to 8c0855fd6bb115e113c0dca6255ea05f774d35f7 (#6365)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-10-01 12:12:27 +02:00
LocalAI [bot]
04fecd634a chore: ⬆️ Update ggml-org/llama.cpp to b2ba81dbe07b6dbea9c96b13346c66973dede32c (#6366)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-30 21:13:23 +00:00
LocalAI [bot]
33c14198db chore: ⬆️ Update ggml-org/llama.cpp to 5f7e166cbf7b9ca928c7fad990098ef32358ac75 (#6355)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-30 14:41:16 +02:00
LocalAI [bot]
967c2727e3 chore: ⬆️ Update ggml-org/whisper.cpp to 32be14f8ebfc0498c2c619182f0d7f4c822d52c4 (#6354)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-30 14:40:59 +02:00
dependabot[bot]
f41f30ad92 chore(deps): bump grpcio from 1.74.0 to 1.75.1 in /backend/python/exllama2 (#6356)
chore(deps): bump grpcio in /backend/python/exllama2

Bumps [grpcio](https://github.com/grpc/grpc) from 1.74.0 to 1.75.1.
- [Release notes](https://github.com/grpc/grpc/releases)
- [Changelog](https://github.com/grpc/grpc/blob/master/doc/grpc_release_schedule.md)
- [Commits](https://github.com/grpc/grpc/compare/v1.74.0...v1.75.1)

---
updated-dependencies:
- dependency-name: grpcio
  dependency-version: 1.75.1
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-09-30 14:40:41 +02:00
dependabot[bot]
e77340e8a5 chore(deps): bump grpcio from 1.75.0 to 1.75.1 in /backend/python/transformers (#6362)
chore(deps): bump grpcio in /backend/python/transformers

Bumps [grpcio](https://github.com/grpc/grpc) from 1.75.0 to 1.75.1.
- [Release notes](https://github.com/grpc/grpc/releases)
- [Changelog](https://github.com/grpc/grpc/blob/master/doc/grpc_release_schedule.md)
- [Commits](https://github.com/grpc/grpc/compare/v1.75.0...v1.75.1)

---
updated-dependencies:
- dependency-name: grpcio
  dependency-version: 1.75.1
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-09-30 14:40:29 +02:00
dependabot[bot]
d51a3090f7 chore(deps): bump grpcio from 1.74.0 to 1.75.1 in /backend/python/bark (#6359)
Bumps [grpcio](https://github.com/grpc/grpc) from 1.74.0 to 1.75.1.
- [Release notes](https://github.com/grpc/grpc/releases)
- [Changelog](https://github.com/grpc/grpc/blob/master/doc/grpc_release_schedule.md)
- [Commits](https://github.com/grpc/grpc/compare/v1.74.0...v1.75.1)

---
updated-dependencies:
- dependency-name: grpcio
  dependency-version: 1.75.1
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-09-30 14:40:16 +02:00
dependabot[bot]
1bf3bc932c chore(deps): bump grpcio from 1.74.0 to 1.75.1 in /backend/python/vllm (#6357)
Bumps [grpcio](https://github.com/grpc/grpc) from 1.74.0 to 1.75.1.
- [Release notes](https://github.com/grpc/grpc/releases)
- [Changelog](https://github.com/grpc/grpc/blob/master/doc/grpc_release_schedule.md)
- [Commits](https://github.com/grpc/grpc/compare/v1.74.0...v1.75.1)

---
updated-dependencies:
- dependency-name: grpcio
  dependency-version: 1.75.1
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-09-30 14:40:02 +02:00
dependabot[bot]
564a47da4e chore(deps): bump grpcio from 1.74.0 to 1.75.1 in /backend/python/common/template (#6358)
chore(deps): bump grpcio in /backend/python/common/template

Bumps [grpcio](https://github.com/grpc/grpc) from 1.74.0 to 1.75.1.
- [Release notes](https://github.com/grpc/grpc/releases)
- [Changelog](https://github.com/grpc/grpc/blob/master/doc/grpc_release_schedule.md)
- [Commits](https://github.com/grpc/grpc/compare/v1.74.0...v1.75.1)

---
updated-dependencies:
- dependency-name: grpcio
  dependency-version: 1.75.1
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-09-30 08:52:36 +02:00
dependabot[bot]
c37ee93ff2 chore(deps): bump grpcio from 1.74.0 to 1.75.1 in /backend/python/rerankers (#6360)
chore(deps): bump grpcio in /backend/python/rerankers

Bumps [grpcio](https://github.com/grpc/grpc) from 1.74.0 to 1.75.1.
- [Release notes](https://github.com/grpc/grpc/releases)
- [Changelog](https://github.com/grpc/grpc/blob/master/doc/grpc_release_schedule.md)
- [Commits](https://github.com/grpc/grpc/compare/v1.74.0...v1.75.1)

---
updated-dependencies:
- dependency-name: grpcio
  dependency-version: 1.75.1
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-09-30 08:52:25 +02:00
dependabot[bot]
f4b65db4e7 chore(deps): bump grpcio from 1.74.0 to 1.75.1 in /backend/python/diffusers (#6361)
chore(deps): bump grpcio in /backend/python/diffusers

Bumps [grpcio](https://github.com/grpc/grpc) from 1.74.0 to 1.75.1.
- [Release notes](https://github.com/grpc/grpc/releases)
- [Changelog](https://github.com/grpc/grpc/blob/master/doc/grpc_release_schedule.md)
- [Commits](https://github.com/grpc/grpc/compare/v1.74.0...v1.75.1)

---
updated-dependencies:
- dependency-name: grpcio
  dependency-version: 1.75.1
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-09-30 08:52:11 +02:00
Ettore Di Giacinto
f5fa8e6649 Revert "chore(deps): bump transformers from 4.48.3 to 4.56.2 in /backend/python/coqui" (#6363)
Revert "chore(deps): bump transformers from 4.48.3 to 4.56.2 in /backend/pyth…"

This reverts commit 570e39bdcf.
2025-09-30 08:51:49 +02:00
dependabot[bot]
570e39bdcf chore(deps): bump transformers from 4.48.3 to 4.56.2 in /backend/python/coqui (#6330)
chore(deps): bump transformers in /backend/python/coqui

Bumps [transformers](https://github.com/huggingface/transformers) from 4.48.3 to 4.56.2.
- [Release notes](https://github.com/huggingface/transformers/releases)
- [Commits](https://github.com/huggingface/transformers/compare/v4.48.3...v4.56.2)

---
updated-dependencies:
- dependency-name: transformers
  dependency-version: 4.56.2
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-09-29 21:30:29 +00:00
dependabot[bot]
2ebe37b671 chore(deps): bump grpcio from 1.74.0 to 1.75.1 in /backend/python/coqui (#6353)
Bumps [grpcio](https://github.com/grpc/grpc) from 1.74.0 to 1.75.1.
- [Release notes](https://github.com/grpc/grpc/releases)
- [Changelog](https://github.com/grpc/grpc/blob/master/doc/grpc_release_schedule.md)
- [Commits](https://github.com/grpc/grpc/compare/v1.74.0...v1.75.1)

---
updated-dependencies:
- dependency-name: grpcio
  dependency-version: 1.75.1
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-09-29 20:11:55 +00:00
LocalAI [bot]
dca685f784 chore: ⬆️ Update ggml-org/llama.cpp to bd0af02fc96c2057726f33c0f0daf7bb8f3e462a (#6352)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-28 21:08:50 +00:00
LocalAI [bot]
84ebf2a2c9 chore: ⬆️ Update ggml-org/llama.cpp to 4807e8f96a61b2adccebd5e57444c94d18de7264 (#6350)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-28 00:33:46 +02:00
Ettore Di Giacinto
ce5662ba90 chore(deps): bump llama.cpp to '72b24d96c6888c609d562779a23787304ae4609c' (#6349)
* chore(deps): bump llama.cpp to '72b24d96c6888c609d562779a23787304ae4609c'

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Disable OPENSSL (just introduced upstream)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-27 13:55:51 +02:00
Ettore Di Giacinto
9878f27813 chore(deps): bump llama.cpp to '835b2b915c52bcabcd688d025eacff9a07b65f52' (#6347)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-26 23:26:14 +02:00
jongames
f2b9452ec4 fix: reranking models limited to 512 tokens in llama.cpp backend (#6344)
Fix reranking models being limited to 512 tokens input in llama.cpp backend

Signed-off-by: JonGames <18472148+jongames@users.noreply.github.com>
2025-09-25 23:32:07 +00:00
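
Note (illustrative, not part of the commit): llama.cpp distinguishes the logical batch size (`n_batch`) from the physical batch size (`n_ubatch`, which defaults to 512). Non-causal models such as rerankers must fit the whole input into a single physical batch, so the fix mirrors the requested batch size into `n_ubatch` (see the diff to the llama.cpp backend below). A minimal Go sketch of the invariant, with hypothetical names:

```go
package main

import "fmt"

// canProcess reports whether a non-causal (embedding/rerank) prompt fits the
// physical batch. Before the fix, nUBatch stayed at llama.cpp's 512 default,
// so prompts longer than 512 tokens failed with "input is too large to
// process. increase the physical batch size".
func canProcess(promptTokens, nUBatch int) bool {
	return promptTokens <= nUBatch
}

func main() {
	nBatch := 2048
	fmt.Println(canProcess(1024, 512))    // false: the pre-fix behaviour
	fmt.Println(canProcess(1024, nBatch)) // true: n_ubatch now mirrors n_batch
}
```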
Ettore Di Giacinto
585da99c52 chore(models): add whisper-turbo via whisper.cpp (#6340)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-25 09:15:06 +02:00
Ettore Di Giacinto
fd4f432079 CI: disable build-testing on PRs against arm64 (#6341)
CI: disable testing on PRs against arm64

Removed configuration for cublas and arm64 platform.

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-09-25 09:14:50 +02:00
LocalAI [bot]
238c68c57b chore: ⬆️ Update ggml-org/llama.cpp to 4ae88d07d026e66b41e85afece74e88af54f4e66 (#6339)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-25 08:47:02 +02:00
Ettore Di Giacinto
04fbf5cb82 Change build type and update tag suffix in backend.yml
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-09-24 22:08:29 +02:00
Ettore Di Giacinto
c85d559919 feat(chatterbox): support multilingual (#6240)
* feat(chatterbox): support multilingual

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Add l4t support

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Fixups

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: switch to fork

Until https://github.com/resemble-ai/chatterbox/pull/295 is merged

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-24 18:37:37 +02:00
Ettore Di Giacinto
b5efc4f89e chore(cudss): add cudss to l4t images (#6338)
* chore(cudss): add cudss to l4t images

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* add arm64 to CI tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-24 16:46:24 +02:00
Ettore Di Giacinto
3f9c09a4c5 chore(model gallery): add qwen-image-edit-2509 (#6336)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-24 10:05:03 +02:00
dependabot[bot]
4a84660475 chore(deps): bump securego/gosec from 2.22.8 to 2.22.9 (#6324)
Bumps [securego/gosec](https://github.com/securego/gosec) from 2.22.8 to 2.22.9.
- [Release notes](https://github.com/securego/gosec/releases)
- [Changelog](https://github.com/securego/gosec/blob/master/.goreleaser.yml)
- [Commits](https://github.com/securego/gosec/compare/v2.22.8...v2.22.9)

---
updated-dependencies:
- dependency-name: securego/gosec
  dependency-version: 2.22.9
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-09-23 08:26:50 +02:00
LocalAI [bot]
737248256e chore: ⬆️ Update ggml-org/llama.cpp to 1d0125bcf1cbd7195ad0faf826a20bc7cec7d3f4 (#6335)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-22 21:13:34 +00:00
dependabot[bot]
0ae334fc62 chore(deps): bump grpcio from 1.74.0 to 1.75.0 in /backend/python/transformers (#6332)
chore(deps): bump grpcio in /backend/python/transformers

Bumps [grpcio](https://github.com/grpc/grpc) from 1.74.0 to 1.75.0.
- [Release notes](https://github.com/grpc/grpc/releases)
- [Changelog](https://github.com/grpc/grpc/blob/master/doc/grpc_release_schedule.md)
- [Commits](https://github.com/grpc/grpc/compare/v1.74.0...v1.75.0)

---
updated-dependencies:
- dependency-name: grpcio
  dependency-version: 1.75.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-09-22 19:53:35 +00:00
Ettore Di Giacinto
36c373b7c9 feat(kokoro): add support for l4t devices (#6322)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-22 10:33:26 +02:00
LocalAI [bot]
6afcb932b7 chore: ⬆️ Update ggml-org/llama.cpp to da30ab5f8696cabb2d4620cdc0aa41a298c54fd6 (#6321)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-21 21:28:27 +00:00
LocalAI [bot]
357bf571a3 docs: ⬆️ update docs version mudler/LocalAI (#6318)
⬆️ Update docs version mudler/LocalAI

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-21 08:40:00 +02:00
LocalAI [bot]
e74ade9ebb chore: ⬆️ Update ggml-org/llama.cpp to 7f766929ca8e8e01dcceb1c526ee584f7e5e1408 (#6319)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-20 21:05:25 +00:00
LocalAI [bot]
f7f26b8efa docs: ⬆️ update docs version mudler/LocalAI (#6315)
⬆️ Update docs version mudler/LocalAI

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-20 09:41:58 +02:00
LocalAI [bot]
75eb98f8bd chore: ⬆️ Update ggml-org/llama.cpp to f432d8d83e7407073634c5e4fd81a3d23a10827f (#6316)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-20 09:41:45 +02:00
LocalAI [bot]
c337e7baf7 chore: ⬆️ Update ggml-org/whisper.cpp to 44fa2f647cf2a6953493b21ab83b50d5f5dbc483 (#6317)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-19 21:14:10 +00:00
Ettore Di Giacinto
660bd45be8 fix(python): make option check uniform across backends (#6314)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-19 19:56:08 +02:00
Ettore Di Giacinto
c27da0a0f6 fix(diffusers): fix float detection (#6313)
There was apparently an oversight, this fixes the float/int detection

Fixes: https://github.com/mudler/LocalAI/issues/6312

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-09-19 19:09:04 +02:00
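
For context, a conceptual sketch of the kind of float/int detection this fixes (illustrative only; the diffusers backend itself is Python): option values arrive as strings, and a uniform numeric check should try integer parsing before falling back to float, so "2" stays an int while "0.5" becomes a float.

```go
package main

import (
	"fmt"
	"strconv"
)

// parseNumeric mimics the float/int detection the fix addresses: try int
// first so values like "2" are not silently widened to floats, then fall
// back to float for values like "0.5".
func parseNumeric(s string) (any, error) {
	if i, err := strconv.ParseInt(s, 10, 64); err == nil {
		return i, nil
	}
	if f, err := strconv.ParseFloat(s, 64); err == nil {
		return f, nil
	}
	return nil, fmt.Errorf("not a number: %q", s)
}

func main() {
	for _, s := range []string{"2", "0.5", "abc"} {
		v, err := parseNumeric(s)
		fmt.Println(v, err)
	}
}
```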
Ettore Di Giacinto
ac043ed9ba chore(model gallery): add aquif-3.5-a4b-think (#6311)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-19 11:16:50 +02:00
Ettore Di Giacinto
2e0d66a1c8 chore(model gallery): add impish_qwen_14b-1m (#6310)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-19 10:57:33 +02:00
Ettore Di Giacinto
41a0f361eb chore(model gallery): add mistralai_magistral-small-2509 (#6309)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-19 10:48:13 +02:00
LocalAI [bot]
d3c5c02837 docs: ⬆️ update docs version mudler/LocalAI (#6307)
⬆️ Update docs version mudler/LocalAI

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-18 23:48:02 +02:00
LocalAI [bot]
ae3d8fb0c4 chore: ⬆️ Update ggml-org/llama.cpp to 3edd87cd055a45d885fa914d879d36d33ecfc3e1 (#6308)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-18 21:09:14 +00:00
LocalAI [bot]
902e47f0b0 chore: ⬆️ Update ggml-org/llama.cpp to 0320ac5264279d74f8ee91bafa6c90e9ab9bbb91 (#6306)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-18 09:27:18 +02:00
Ettore Di Giacinto
50bb78fd24 Add permissions for issues and actions
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-09-18 09:26:10 +02:00
LocalAI [bot]
542f07ab2d docs: ⬆️ update docs version mudler/LocalAI (#6305)
⬆️ Update docs version mudler/LocalAI

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-17 21:06:50 +00:00
Ettore Di Giacinto
77c5acb9db Revert "feat(nvidia-gpu): bump images to cuda 12.8" (#6303)
Revert "feat(nvidia-gpu): bump images to cuda 12.8 (#6239)"

This reverts commit d9e25af7b5.
2025-09-17 19:31:43 +02:00
Ettore Di Giacinto
44bbf4d778 chore(model gallery): add websailor-7b (#6300)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-17 09:49:58 +02:00
Ettore Di Giacinto
633c12f93d chore(model gallery): add websailor-32b (#6299)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-17 09:48:16 +02:00
Ettore Di Giacinto
6f24135f1d chore(model gallery): add webwatcher-32b (#6298)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-17 09:42:54 +02:00
Ettore Di Giacinto
b72aa7b4fa chore(model gallery): add webwatcher-7b (#6297)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-17 09:36:25 +02:00
Ettore Di Giacinto
e94e725479 chore(model gallery): add alibaba-nlp_tongyi-deepresearch-30b-a3b (#6295)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-17 09:22:19 +02:00
LocalAI [bot]
e4ac7b14a3 chore: ⬆️ Update ggml-org/llama.cpp to 8ff206097c2bf3ca1c7aa95f9d6db779fc7bdd68 (#6292)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-16 21:09:47 +00:00
Ettore Di Giacinto
ddb39c73f2 chore(model gallery): add holo1.5-3b (#6291)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-16 18:13:11 +02:00
Ettore Di Giacinto
264b09fb1e chore(model gallery): add holo1.5-7b (#6290)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-16 18:10:27 +02:00
Ettore Di Giacinto
36dd45df51 chore(model gallery): add holo1.5-72b (#6289)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-16 18:07:50 +02:00
Ettore Di Giacinto
e5599f87b8 chore(model gallery): add k2-think-i1 (#6288)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-16 18:05:01 +02:00
LocalAI [bot]
e89b5cc0e3 chore: ⬆️ Update ggml-org/llama.cpp to b907255f4bd169b0dc7dca9553b4c54af5170865 (#6287)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-16 08:10:37 +02:00
Richard Palethorpe
10bf1084cc chore: ⬆️ Update leejet/stable-diffusion.cpp to 0ebe6fe118f125665939b27c89f34ed38716bff8 (#6271)
* ⬆️ Update leejet/stable-diffusion.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* fix(stablediffusion-ggml): Move parameters and start refactor of passing params

Signed-off-by: Richard Palethorpe <io@richiejp.com>

* fix(stablediffusion-ggml): Add default sampler option

Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Signed-off-by: Richard Palethorpe <io@richiejp.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-15 21:42:46 +02:00
Ettore Di Giacinto
b08ae559b3 chore(model gallery): add qwen3-stargate-sg1-uncensored-abliterated-8b-i1 (#6270)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-15 11:03:26 +02:00
Ettore Di Giacinto
aa7cb7e18c chore(model gallery): add aquif-ai_aquif-3.5-8b-think (#6269)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-15 10:42:42 +02:00
Ettore Di Giacinto
eadd3d4e46 chore(model gallery): add baidu_ernie-4.5-21b-a3b-thinking (#6267)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-15 10:27:02 +02:00
LocalAI [bot]
2a18206033 chore: ⬆️ Update ggml-org/llama.cpp to 6c019cb04e86e2dacfe62ce7666c64e9717dde1f (#6265)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-14 21:19:41 +00:00
LocalAI [bot]
39798d734e chore: ⬆️ Update ggml-org/llama.cpp to 0fa154e3502e940df914f03b41475a2b80b985b0 (#6263)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-14 19:59:58 +00:00
Gianluca Boiano
d0e99562af chore(aio): upgrade minicpm-v model to latest 4.5 (#6262)
chore(aio): upgrade vision model to MiniCPM-V 4.5

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>
2025-09-14 15:04:58 +02:00
Ettore Di Giacinto
6410c99bf2 fix(llama-cpp): correctly calculate embeddings (#6259)
* chore(tests): check embeddings differs in llama.cpp

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(llama.cpp): use the correct field for embedding

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(llama.cpp): use embedding type none

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore(tests): add test-cases in aio-e2e suite

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-09-13 23:11:54 +02:00
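
The test idea is easy to reproduce against a running instance. A minimal sketch (assuming a LocalAI server on localhost:8080 and a placeholder embedding-capable model name): two different inputs should yield different vectors, which is exactly what the broken field mapping violated.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

type embResp struct {
	Data []struct {
		Embedding []float64 `json:"embedding"`
	} `json:"data"`
}

// embed calls the OpenAI-compatible embeddings endpoint.
func embed(input string) ([]float64, error) {
	body, _ := json.Marshal(map[string]any{
		"model": "bert-embeddings", // placeholder: any embedding-capable model
		"input": input,
	})
	resp, err := http.Post("http://localhost:8080/v1/embeddings",
		"application/json", bytes.NewReader(body))
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	var r embResp
	if err := json.NewDecoder(resp.Body).Decode(&r); err != nil {
		return nil, err
	}
	if len(r.Data) == 0 {
		return nil, fmt.Errorf("no embedding returned")
	}
	return r.Data[0].Embedding, nil
}

func main() {
	a, _ := embed("a cat sat on the mat")
	b, _ := embed("quantum chromodynamics")
	fmt.Println("vectors differ:", fmt.Sprint(a) != fmt.Sprint(b))
}
```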
LocalAI [bot]
55766d269b chore: ⬆️ Update ggml-org/llama.cpp to aa0c461efe3603639af1a1defed2438d9c16ca0f (#6261)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-13 21:11:18 +00:00
Ettore Di Giacinto
ffa0ad1eac Fix formatting issues in README.md links
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-09-13 09:16:17 +02:00
LocalAI [bot]
623789a29e chore: ⬆️ Update ggml-org/llama.cpp to 40be51152d4dc2d47444a4ed378285139859895b (#6260)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-12 21:10:39 +00:00
Richard Palethorpe
2b9a3d32c9 chore: ⬆️ Update leejet/stable-diffusion.cpp to fce6afcc6a3250a8e17923608922d2a99b339b47 (#6256)
* ⬆️ Update leejet/stable-diffusion.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* fix(stablediffusion-ggml): Add SMOOTHSTEP scheduler and assert sampler and scheduler counts

Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Signed-off-by: Richard Palethorpe <io@richiejp.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-12 12:28:20 +02:00
LocalAI [bot]
f8b71dc5d0 chore: ⬆️ Update ggml-org/llama.cpp to 0e6ff0046f4a2983b2c77950aa75960fe4b4f0e2 (#6235)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-09-11 21:21:49 +00:00
KingJ
1d3331b5cb fix(rocm): Rename tag suffix for hipblas whisper build to match backend config (#6247)
Rename tag suffix for hipblas whisper to match backend config

hipblas images generally have the suffix `-gpu-rocm-hipblas-X`. One current exception is the hipblas build of Whisper, which has the suffix `-gpu-hipblas-whisper`.

However, since `backend/index.yaml` references the Whisper image tag in the more consistent form (i.e. `latest-gpu-rocm-hipblas-whisper`), it is not possible to add the backend, as raised in #6114.

Therefore, rename the suffix of the hipblas Whisper images to the more consistent form, aligning with the other hipblas builds as well as with the expected image name in `backend/index.yaml`.

Signed-off-by: Kingsley Jarrett <kj@kingj.net>
2025-09-11 21:19:09 +02:00
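
In other words, the installable image reference is assembled from the repository, a base tag, and the per-backend suffix, so a one-off suffix breaks the lookup. A hedged sketch of the convention (assembly is illustrative; repository and suffixes are from the diff below):

```go
package main

import "fmt"

func main() {
	const repo = "quay.io/go-skynet/local-ai-backends"
	// backend/index.yaml expects the consistent rocm form:
	want := fmt.Sprintf("%s:latest%s", repo, "-gpu-rocm-hipblas-whisper")
	// but the whisper build was published with the odd one out:
	got := fmt.Sprintf("%s:latest%s", repo, "-gpu-hipblas-whisper")
	fmt.Println(want == got) // false: the mismatch behind #6114
}
```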
Mário Freitas
2c0b9c6349 fix(chat): use proper finish_reason for tool/function calling (#6243)
Signed-off-by: Mário Freitas <imkira@gmail.com>
2025-09-11 21:13:23 +02:00
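
For reference, the OpenAI chat-completions contract this aligns with: when a response carries tool/function calls, the choice's finish_reason should be "tool_calls" rather than "stop". A minimal sketch of the distinction (types simplified, not LocalAI's actual structs):

```go
package main

import (
	"encoding/json"
	"fmt"
)

type choice struct {
	FinishReason string `json:"finish_reason"`
}

// pickFinishReason mirrors the OpenAI-compatible rule the fix enforces:
// report "tool_calls" when the response contains tool/function calls.
func pickFinishReason(hasToolCalls bool) string {
	if hasToolCalls {
		return "tool_calls"
	}
	return "stop"
}

func main() {
	c := choice{FinishReason: pickFinishReason(true)}
	out, _ := json.Marshal(c)
	fmt.Println(string(out)) // {"finish_reason":"tool_calls"}
}
```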
qxo
3c6c976755 feat: support HF_ENDPOINT env for the HuggingFace endpoint (#6220)
ie: `HF_ENDPOINT=https://hf-mirror.com`
2025-09-11 21:04:57 +02:00
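
A sketch of the usual pattern for such an override (illustrative, not LocalAI's actual code): read HF_ENDPOINT and fall back to the official hub when it is unset, so e.g. `HF_ENDPOINT=https://hf-mirror.com` redirects downloads to a mirror.

```go
package main

import (
	"fmt"
	"os"
)

// hfEndpoint returns the HuggingFace base URL, honouring the HF_ENDPOINT
// override (e.g. HF_ENDPOINT=https://hf-mirror.com for a mirror).
func hfEndpoint() string {
	if ep := os.Getenv("HF_ENDPOINT"); ep != "" {
		return ep
	}
	return "https://huggingface.co"
}

func main() {
	fmt.Println(hfEndpoint())
}
```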
54 changed files with 1051 additions and 226 deletions

View File

@@ -89,7 +89,7 @@ jobs:
context: "./backend"
- build-type: 'l4t'
cuda-major-version: "12"
cuda-minor-version: "8"
cuda-minor-version: "0"
platforms: 'linux/arm64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-l4t-diffusers'
@@ -187,7 +187,7 @@ jobs:
# CUDA 12 builds
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-rerankers'
@@ -199,7 +199,7 @@ jobs:
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-llama-cpp'
@@ -211,7 +211,7 @@ jobs:
context: "./"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-vllm'
@@ -223,7 +223,7 @@ jobs:
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-transformers'
@@ -235,7 +235,7 @@ jobs:
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-diffusers'
@@ -248,7 +248,7 @@ jobs:
# CUDA 12 additional backends
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-kokoro'
@@ -260,7 +260,7 @@ jobs:
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-faster-whisper'
@@ -272,7 +272,7 @@ jobs:
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-coqui'
@@ -284,7 +284,7 @@ jobs:
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-bark'
@@ -296,7 +296,7 @@ jobs:
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-chatterbox'
@@ -489,6 +489,18 @@ jobs:
backend: "diffusers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'l4t'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/arm64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-l4t-kokoro'
runs-on: 'ubuntu-24.04-arm'
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
skip-drivers: 'true'
backend: "kokoro"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
# SYCL additional backends
- build-type: 'intel'
cuda-major-version: ""
@@ -578,7 +590,7 @@ jobs:
context: "./"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'true'
tag-latest: 'auto'
@@ -615,7 +627,7 @@ jobs:
context: "./"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-stablediffusion-ggml'
@@ -675,7 +687,7 @@ jobs:
context: "./"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'true'
tag-latest: 'auto'
@@ -700,7 +712,7 @@ jobs:
context: "./"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-whisper'
@@ -760,7 +772,7 @@ jobs:
context: "./"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'true'
tag-latest: 'auto'
@@ -775,7 +787,7 @@ jobs:
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-hipblas-whisper'
tag-suffix: '-gpu-rocm-hipblas-whisper'
base-image: "rocm/dev-ubuntu-22.04:6.4.3"
runs-on: 'ubuntu-latest'
skip-drivers: 'false'
@@ -836,7 +848,7 @@ jobs:
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-rfdetr'
@@ -870,9 +882,9 @@ jobs:
backend: "rfdetr"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'cublas'
- build-type: 'l4t'
cuda-major-version: "12"
cuda-minor-version: "8"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'true'
tag-latest: 'auto'
@@ -897,7 +909,7 @@ jobs:
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-exllama2'
@@ -943,6 +955,18 @@ jobs:
backend: "exllama2"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'l4t'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'true'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-l4t-arm64-chatterbox'
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
runs-on: 'ubuntu-24.04-arm'
backend: "chatterbox"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
# runs out of space on the runner
# - build-type: 'hipblas'
# cuda-major-version: ""

View File

@@ -36,7 +36,7 @@ jobs:
include:
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-gpu-nvidia-cuda-12'

View File

@@ -91,7 +91,7 @@ jobs:
aio: "-aio-gpu-nvidia-cuda-11"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12'
@@ -144,7 +144,7 @@ jobs:
include:
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
cuda-minor-version: "0"
platforms: 'linux/arm64'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-arm64'

View File

@@ -6,7 +6,8 @@ permissions:
contents: write
pull-requests: write
packages: read
issues: write # for Homebrew/actions/post-comment
actions: write # to dispatch publish workflow
jobs:
dependabot:
runs-on: ubuntu-latest

View File

@@ -18,7 +18,7 @@ jobs:
if: ${{ github.actor != 'dependabot[bot]' }}
- name: Run Gosec Security Scanner
if: ${{ github.actor != 'dependabot[bot]' }}
uses: securego/gosec@v2.22.8
uses: securego/gosec@v2.22.9
with:
# we let the report content trigger a failure using the GitHub Security features.
args: '-no-fail -fmt sarif -out results.sarif ./...'

View File

@@ -18,7 +18,7 @@ FROM requirements AS requirements-drivers
ARG BUILD_TYPE
ARG CUDA_MAJOR_VERSION=12
ARG CUDA_MINOR_VERSION=8
ARG CUDA_MINOR_VERSION=0
ARG SKIP_DRIVERS=false
ARG TARGETARCH
ARG TARGETVARIANT
@@ -78,6 +78,16 @@ RUN <<EOT bash
fi
EOT
# https://github.com/NVIDIA/Isaac-GR00T/issues/343
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "arm64" ]; then
wget https://developer.download.nvidia.com/compute/cudss/0.6.0/local_installers/cudss-local-tegra-repo-ubuntu2204-0.6.0_0.6.0-1_arm64.deb && \
dpkg -i cudss-local-tegra-repo-ubuntu2204-0.6.0_0.6.0-1_arm64.deb && \
cp /var/cudss-local-tegra-repo-ubuntu2204-0.6.0/cudss-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && apt-get -y install cudss
fi
EOT
# If we are building with clblas support, we need the libraries for the builds
RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
apt-get update && \

View File

@@ -117,8 +117,8 @@ run: ## run local-ai
CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) run ./
test-models/testmodel.ggml:
mkdir test-models
mkdir test-dir
mkdir -p test-models
mkdir -p test-dir
wget -q https://huggingface.co/mradermacher/gpt2-alpaca-gpt4-GGUF/resolve/main/gpt2-alpaca-gpt4.Q4_K_M.gguf -O test-models/testmodel.ggml
wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
@@ -170,7 +170,7 @@ prepare-e2e:
mkdir -p $(TEST_DIR)
cp -rfv $(abspath ./tests/e2e-fixtures)/gpu.yaml $(TEST_DIR)/gpu.yaml
test -e $(TEST_DIR)/ggllm-test-model.bin || wget -q https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q2_K.gguf -O $(TEST_DIR)/ggllm-test-model.bin
docker build --build-arg IMAGE_TYPE=core --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg CUDA_MAJOR_VERSION=12 --build-arg CUDA_MINOR_VERSION=8 -t localai-tests .
docker build --build-arg IMAGE_TYPE=core --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg CUDA_MAJOR_VERSION=12 --build-arg CUDA_MINOR_VERSION=0 -t localai-tests .
run-e2e-image:
ls -liah $(abspath ./tests/e2e-fixtures)
@@ -429,6 +429,9 @@ docker-build-kitten-tts:
docker-save-kitten-tts: backend-images
docker save local-ai-backend:kitten-tts -o backend-images/kitten-tts.tar
docker-save-chatterbox: backend-images
docker save local-ai-backend:chatterbox -o backend-images/chatterbox.tar
docker-build-kokoro:
docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:kokoro -f backend/Dockerfile.python --build-arg BACKEND=kokoro ./backend

View File

@@ -43,7 +43,7 @@
> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
>
> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples) Try on
> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples) Try on
[![Telegram](https://img.shields.io/badge/Telegram-2CA5E0?style=for-the-badge&logo=telegram&logoColor=white)](https://t.me/localaiofficial_bot)
[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)

View File

@@ -2,10 +2,10 @@ context_size: 4096
f16: true
backend: llama-cpp
mmap: true
mmproj: minicpm-v-2_6-mmproj-f16.gguf
mmproj: minicpm-v-4_5-mmproj-f16.gguf
name: gpt-4o
parameters:
model: minicpm-v-2_6-Q4_K_M.gguf
model: minicpm-v-4_5-Q4_K_M.gguf
stopwords:
- <|im_end|>
- <dummy32000>
@@ -42,9 +42,9 @@ template:
<|im_start|>assistant
download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
- filename: minicpm-v-4_5-Q4_K_M.gguf
sha256: c1c3c33100b15b4caf7319acce4e23c0eb0ce1cbd12f70e8d24f05aa67b7512f
uri: huggingface://openbmb/MiniCPM-V-4_5-gguf/ggml-model-Q4_K_M.gguf
- filename: minicpm-v-4_5-mmproj-f16.gguf
uri: huggingface://openbmb/MiniCPM-V-4_5-gguf/mmproj-model-f16.gguf
sha256: 7a7225a32e8d453aaa3d22d8c579b5bf833c253f784cdb05c99c9a76fd616df8

View File

@@ -2,10 +2,10 @@ context_size: 4096
backend: llama-cpp
f16: true
mmap: true
mmproj: minicpm-v-2_6-mmproj-f16.gguf
mmproj: minicpm-v-4_5-mmproj-f16.gguf
name: gpt-4o
parameters:
model: minicpm-v-2_6-Q4_K_M.gguf
model: minicpm-v-4_5-Q4_K_M.gguf
stopwords:
- <|im_end|>
- <dummy32000>
@@ -42,9 +42,9 @@ template:
<|im_start|>assistant
download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
- filename: minicpm-v-4_5-Q4_K_M.gguf
sha256: c1c3c33100b15b4caf7319acce4e23c0eb0ce1cbd12f70e8d24f05aa67b7512f
uri: huggingface://openbmb/MiniCPM-V-4_5-gguf/ggml-model-Q4_K_M.gguf
- filename: minicpm-v-4_5-mmproj-f16.gguf
uri: huggingface://openbmb/MiniCPM-V-4_5-gguf/mmproj-model-f16.gguf
sha256: 7a7225a32e8d453aaa3d22d8c579b5bf833c253f784cdb05c99c9a76fd616df8

View File

@@ -2,10 +2,10 @@ context_size: 4096
backend: llama-cpp
f16: true
mmap: true
mmproj: minicpm-v-2_6-mmproj-f16.gguf
mmproj: minicpm-v-4_5-mmproj-f16.gguf
name: gpt-4o
parameters:
model: minicpm-v-2_6-Q4_K_M.gguf
model: minicpm-v-4_5-Q4_K_M.gguf
stopwords:
- <|im_end|>
- <dummy32000>
@@ -43,9 +43,9 @@ template:
download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
- filename: minicpm-v-4_5-Q4_K_M.gguf
sha256: c1c3c33100b15b4caf7319acce4e23c0eb0ce1cbd12f70e8d24f05aa67b7512f
uri: huggingface://openbmb/MiniCPM-V-4_5-gguf/ggml-model-Q4_K_M.gguf
- filename: minicpm-v-4_5-mmproj-f16.gguf
uri: huggingface://openbmb/MiniCPM-V-4_5-gguf/mmproj-model-f16.gguf
sha256: 7a7225a32e8d453aaa3d22d8c579b5bf833c253f784cdb05c99c9a76fd616df8

View File

@@ -111,7 +111,7 @@ docker build -f backend/Dockerfile.python \
--build-arg BACKEND=transformers \
--build-arg BUILD_TYPE=cublas12 \
--build-arg CUDA_MAJOR_VERSION=12 \
--build-arg CUDA_MINOR_VERSION=8 \
--build-arg CUDA_MINOR_VERSION=0 \
-t localai-backend-transformers .
# Build Go backend

View File

@@ -1,5 +1,5 @@
LLAMA_VERSION?=3976dfbe00f02a62c0deca32c46138e4f0ca81d8
LLAMA_VERSION?=d64c8104f090b27b1f99e8da5995ffcfa6b726e2
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
CMAKE_ARGS?=
@@ -14,7 +14,7 @@ CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
ifeq ($(NATIVE),false)
CMAKE_ARGS+=-DGGML_NATIVE=OFF
CMAKE_ARGS+=-DGGML_NATIVE=OFF -DLLAMA_OPENSSL=OFF
endif
# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
ifeq ($(BUILD_TYPE),cublas)

View File

@@ -231,6 +231,7 @@ static void params_parse(const backend::ModelOptions* request,
params.cpuparams.n_threads = request->threads();
params.n_gpu_layers = request->ngpulayers();
params.n_batch = request->nbatch();
params.n_ubatch = request->nbatch(); // fixes issue with reranking models being limited to 512 tokens (the default n_ubatch size); allows for setting the maximum input amount of tokens thereby avoiding this error "input is too large to process. increase the physical batch size"
// Set params.n_parallel by environment variable (LLAMA_PARALLEL), defaults to 1
//params.n_parallel = 1;
const char *env_parallel = std::getenv("LLAMACPP_PARALLEL");
@@ -701,7 +702,7 @@ public:
*/
// for the shape of input/content, see tokenize_input_prompts()
json prompt = body.at("prompt");
json prompt = body.at("embeddings");
auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
@@ -712,6 +713,7 @@ public:
}
}
int embd_normalize = 2; // default to Euclidean/L2 norm
// create and queue the task
json responses = json::array();
bool error = false;
@@ -725,9 +727,8 @@ public:
task.index = i;
task.prompt_tokens = std::move(tokenized_prompts[i]);
// OAI-compat
task.params.oaicompat = OAICOMPAT_TYPE_EMBEDDING;
task.params.oaicompat = OAICOMPAT_TYPE_NONE;
task.params.embd_normalize = embd_normalize;
tasks.push_back(std::move(task));
}
@@ -743,9 +744,8 @@ public:
responses.push_back(res->to_json());
}
}, [&](const json & error_data) {
return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, error_data.value("content", ""));
error = true;
}, [&]() {
// NOTE: we should try to check when the writer is closed here
return false;
});
@@ -755,12 +755,36 @@ public:
return grpc::Status(grpc::StatusCode::INTERNAL, "Error in receiving results");
}
std::vector<float> embeddings = responses[0].value("embedding", std::vector<float>());
// loop the vector and set the embeddings results
for (int i = 0; i < embeddings.size(); i++) {
embeddingResult->add_embeddings(embeddings[i]);
std::cout << "[DEBUG] Responses size: " << responses.size() << std::endl;
// Process the responses and extract embeddings
for (const auto & response_elem : responses) {
// Check if the response has an "embedding" field
if (response_elem.contains("embedding")) {
json embedding_data = json_value(response_elem, "embedding", json::array());
if (embedding_data.is_array() && !embedding_data.empty()) {
for (const auto & embedding_vector : embedding_data) {
if (embedding_vector.is_array()) {
for (const auto & embedding_value : embedding_vector) {
embeddingResult->add_embeddings(embedding_value.get<float>());
}
}
}
}
} else {
// Check if the response itself contains the embedding data directly
if (response_elem.is_array()) {
for (const auto & embedding_value : response_elem) {
embeddingResult->add_embeddings(embedding_value.get<float>());
}
}
}
}
return grpc::Status::OK;
}
@@ -778,11 +802,6 @@ public:
return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "\"documents\" must be a non-empty string array");
}
// Tokenize the query
auto tokenized_query = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, request->query(), /* add_special */ false, true);
if (tokenized_query.size() != 1) {
return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "\"query\" must contain only a single prompt");
}
// Create and queue the task
json responses = json::array();
bool error = false;
@@ -794,10 +813,9 @@ public:
documents.push_back(request->documents(i));
}
auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, documents, /* add_special */ false, true);
tasks.reserve(tokenized_docs.size());
for (size_t i = 0; i < tokenized_docs.size(); i++) {
auto tmp = format_rerank(ctx_server.vocab, tokenized_query[0], tokenized_docs[i]);
tasks.reserve(documents.size());
for (size_t i = 0; i < documents.size(); i++) {
auto tmp = format_rerank(ctx_server.model, ctx_server.vocab, ctx_server.mctx, request->query(), documents[i]);
server_task task = server_task(SERVER_TASK_TYPE_RERANK);
task.id = ctx_server.queue_tasks.get_new_id();
task.index = i;

View File

@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
# stablediffusion.cpp (ggml)
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=b0179181069254389ccad604e44f17a2c25b4094
STABLEDIFFUSION_GGML_VERSION?=0ebe6fe118f125665939b27c89f34ed38716bff8
CMAKE_ARGS+=-DGGML_MAX_NAME=128

View File

@@ -4,17 +4,11 @@
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <iostream>
#include <random>
#include <string>
#include <vector>
#include <filesystem>
#include "gosd.h"
// #include "preprocessing.hpp"
#include "flux.hpp"
#include "stable-diffusion.h"
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_STATIC
#include "stb_image.h"
@@ -29,7 +23,7 @@
// Names of the sampler method, same order as enum sample_method in stable-diffusion.h
const char* sample_method_str[] = {
"euler_a",
"default",
"euler",
"heun",
"dpm2",
@@ -41,8 +35,11 @@ const char* sample_method_str[] = {
"lcm",
"ddim_trailing",
"tcd",
"euler_a",
};
static_assert(std::size(sample_method_str) == SAMPLE_METHOD_COUNT, "sample method mismatch");
// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
const char* schedulers[] = {
"default",
@@ -51,8 +48,11 @@ const char* schedulers[] = {
"exponential",
"ays",
"gits",
"smoothstep",
};
static_assert(std::size(schedulers) == SCHEDULE_COUNT, "schedulers mismatch");
sd_ctx_t* sd_c;
// Moved from the context (load time) to generation time params
scheduler_t scheduler = scheduler_t::DEFAULT;
@@ -168,7 +168,7 @@ int load_model(const char *model, char *model_path, char* options[], int threads
}
if (sample_method_found == -1) {
fprintf(stderr, "Invalid sample method, default to EULER_A!\n");
sample_method_found = EULER_A;
sample_method_found = sample_method_t::SAMPLE_METHOD_DEFAULT;
}
sample_method = (sample_method_t)sample_method_found;
@@ -192,9 +192,7 @@ int load_model(const char *model, char *model_path, char* options[], int threads
ctx_params.control_net_path = "";
ctx_params.lora_model_dir = lora_dir;
ctx_params.embedding_dir = "";
ctx_params.stacked_id_embed_dir = "";
ctx_params.vae_decode_only = false;
ctx_params.vae_tiling = false;
ctx_params.free_params_immediately = false;
ctx_params.n_threads = threads;
ctx_params.rng_type = STD_DEFAULT_RNG;
@@ -220,7 +218,49 @@ int load_model(const char *model, char *model_path, char* options[], int threads
return 0;
}
int gen_image(char *text, char *negativeText, int width, int height, int steps, int64_t seed, char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char **ref_images, int ref_images_count) {
void sd_tiling_params_set_enabled(sd_tiling_params_t *params, bool enabled) {
params->enabled = enabled;
}
void sd_tiling_params_set_tile_sizes(sd_tiling_params_t *params, int tile_size_x, int tile_size_y) {
params->tile_size_x = tile_size_x;
params->tile_size_y = tile_size_y;
}
void sd_tiling_params_set_rel_sizes(sd_tiling_params_t *params, float rel_size_x, float rel_size_y) {
params->rel_size_x = rel_size_x;
params->rel_size_y = rel_size_y;
}
void sd_tiling_params_set_target_overlap(sd_tiling_params_t *params, float target_overlap) {
params->target_overlap = target_overlap;
}
sd_tiling_params_t* sd_img_gen_params_get_vae_tiling_params(sd_img_gen_params_t *params) {
return &params->vae_tiling_params;
}
sd_img_gen_params_t* sd_img_gen_params_new(void) {
sd_img_gen_params_t *params = (sd_img_gen_params_t *)std::malloc(sizeof(sd_img_gen_params_t));
sd_img_gen_params_init(params);
return params;
}
void sd_img_gen_params_set_prompts(sd_img_gen_params_t *params, const char *prompt, const char *negative_prompt) {
params->prompt = prompt;
params->negative_prompt = negative_prompt;
}
void sd_img_gen_params_set_dimensions(sd_img_gen_params_t *params, int width, int height) {
params->width = width;
params->height = height;
}
void sd_img_gen_params_set_seed(sd_img_gen_params_t *params, int64_t seed) {
params->seed = seed;
}
int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char **ref_images, int ref_images_count) {
sd_image_t* results;
@@ -228,21 +268,15 @@ int gen_image(char *text, char *negativeText, int width, int height, int steps,
fprintf (stderr, "Generating image\n");
sd_img_gen_params_t p;
sd_img_gen_params_init(&p);
p->sample_params.guidance.txt_cfg = cfg_scale;
p->sample_params.guidance.slg.layers = skip_layers.data();
p->sample_params.guidance.slg.layer_count = skip_layers.size();
p->sample_params.sample_method = sample_method;
p->sample_params.sample_steps = steps;
p->sample_params.scheduler = scheduler;
p.prompt = text;
p.negative_prompt = negativeText;
p.sample_params.guidance.txt_cfg = cfg_scale;
p.sample_params.guidance.slg.layers = skip_layers.data();
p.sample_params.guidance.slg.layer_count = skip_layers.size();
p.width = width;
p.height = height;
p.sample_params.sample_method = sample_method;
p.sample_params.sample_steps = steps;
p.seed = seed;
p.input_id_images_path = "";
p.sample_params.scheduler = scheduler;
int width = p->width;
int height = p->height;
// Handle input image for img2img
bool has_input_image = (src_image != NULL && strlen(src_image) > 0);
@@ -291,13 +325,13 @@ int gen_image(char *text, char *negativeText, int width, int height, int steps,
input_image_buffer = resized_image_buffer;
}
p.init_image = {(uint32_t)width, (uint32_t)height, 3, input_image_buffer};
p.strength = strength;
p->init_image = {(uint32_t)width, (uint32_t)height, 3, input_image_buffer};
p->strength = strength;
fprintf(stderr, "Using img2img with strength: %.2f\n", strength);
} else {
// No input image, use empty image for text-to-image
p.init_image = {(uint32_t)width, (uint32_t)height, 3, NULL};
p.strength = 0.0f;
p->init_image = {(uint32_t)width, (uint32_t)height, 3, NULL};
p->strength = 0.0f;
}
// Handle mask image for inpainting
@@ -337,12 +371,12 @@ int gen_image(char *text, char *negativeText, int width, int height, int steps,
mask_image_buffer = resized_mask_buffer;
}
p.mask_image = {(uint32_t)width, (uint32_t)height, 1, mask_image_buffer};
p->mask_image = {(uint32_t)width, (uint32_t)height, 1, mask_image_buffer};
fprintf(stderr, "Using inpainting with mask\n");
} else {
// No mask image, create default full mask
default_mask_image_vec.resize(width * height, 255);
p.mask_image = {(uint32_t)width, (uint32_t)height, 1, default_mask_image_vec.data()};
p->mask_image = {(uint32_t)width, (uint32_t)height, 1, default_mask_image_vec.data()};
}
// Handle reference images
@@ -400,13 +434,15 @@ int gen_image(char *text, char *negativeText, int width, int height, int steps,
}
if (!ref_images_vec.empty()) {
p.ref_images = ref_images_vec.data();
p.ref_images_count = ref_images_vec.size();
p->ref_images = ref_images_vec.data();
p->ref_images_count = ref_images_vec.size();
fprintf(stderr, "Using %zu reference images\n", ref_images_vec.size());
}
}
results = generate_image(sd_c, &p);
results = generate_image(sd_c, p);
std::free(p);
if (results == NULL) {
fprintf (stderr, "NO results\n");

View File

@@ -22,7 +22,18 @@ type SDGGML struct {
var (
LoadModel func(model, model_apth string, options []uintptr, threads int32, diff int) int
GenImage func(text, negativeText string, width, height, steps int, seed int64, dst string, cfgScale float32, srcImage string, strength float32, maskImage string, refImages []string, refImagesCount int) int
GenImage func(params uintptr, steps int, dst string, cfgScale float32, srcImage string, strength float32, maskImage string, refImages []string, refImagesCount int) int
TilingParamsSetEnabled func(params uintptr, enabled bool)
TilingParamsSetTileSizes func(params uintptr, tileSizeX int, tileSizeY int)
TilingParamsSetRelSizes func(params uintptr, relSizeX float32, relSizeY float32)
TilingParamsSetTargetOverlap func(params uintptr, targetOverlap float32)
ImgGenParamsNew func() uintptr
ImgGenParamsSetPrompts func(params uintptr, prompt string, negativePrompt string)
ImgGenParamsSetDimensions func(params uintptr, width int, height int)
ImgGenParamsSetSeed func(params uintptr, seed int64)
ImgGenParamsGetVaeTilingParams func(params uintptr) uintptr
)
// Copied from Purego internal/strings
@@ -120,7 +131,15 @@ func (sd *SDGGML) GenerateImage(opts *pb.GenerateImageRequest) error {
// Default strength for img2img (0.75 is a good default)
strength := float32(0.75)
ret := GenImage(t, negative, int(opts.Width), int(opts.Height), int(opts.Step), int64(opts.Seed), dst, sd.cfgScale, srcImage, strength, maskImage, refImages, refImagesCount)
// free'd by GenImage
p := ImgGenParamsNew()
ImgGenParamsSetPrompts(p, t, negative)
ImgGenParamsSetDimensions(p, int(opts.Width), int(opts.Height))
ImgGenParamsSetSeed(p, int64(opts.Seed))
vaep := ImgGenParamsGetVaeTilingParams(p)
TilingParamsSetEnabled(vaep, false)
ret := GenImage(p, int(opts.Step), dst, sd.cfgScale, srcImage, strength, maskImage, refImages, refImagesCount)
if ret != 0 {
return fmt.Errorf("inference failed")
}

View File

@@ -1,8 +1,23 @@
#include <cstdint>
#include "stable-diffusion.h"
#ifdef __cplusplus
extern "C" {
#endif
void sd_tiling_params_set_enabled(sd_tiling_params_t *params, bool enabled);
void sd_tiling_params_set_tile_sizes(sd_tiling_params_t *params, int tile_size_x, int tile_size_y);
void sd_tiling_params_set_rel_sizes(sd_tiling_params_t *params, float rel_size_x, float rel_size_y);
void sd_tiling_params_set_target_overlap(sd_tiling_params_t *params, float target_overlap);
sd_tiling_params_t* sd_img_gen_params_get_vae_tiling_params(sd_img_gen_params_t *params);
sd_img_gen_params_t* sd_img_gen_params_new(void);
void sd_img_gen_params_set_prompts(sd_img_gen_params_t *params, const char *prompt, const char *negative_prompt);
void sd_img_gen_params_set_dimensions(sd_img_gen_params_t *params, int width, int height);
void sd_img_gen_params_set_seed(sd_img_gen_params_t *params, int64_t seed);
int load_model(const char *model, char *model_path, char* options[], int threads, int diffusionModel);
int gen_image(char *text, char *negativeText, int width, int height, int steps, int64_t seed, char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char **ref_images, int ref_images_count);
int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char **ref_images, int ref_images_count);
#ifdef __cplusplus
}
#endif


@@ -11,14 +11,35 @@ var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
type LibFuncs struct {
FuncPtr any
Name string
}
func main() {
gosd, err := purego.Dlopen("./libgosd.so", purego.RTLD_NOW|purego.RTLD_GLOBAL)
if err != nil {
panic(err)
}
purego.RegisterLibFunc(&LoadModel, gosd, "load_model")
purego.RegisterLibFunc(&GenImage, gosd, "gen_image")
libFuncs := []LibFuncs{
{&LoadModel, "load_model"},
{&GenImage, "gen_image"},
{&TilingParamsSetEnabled, "sd_tiling_params_set_enabled"},
{&TilingParamsSetTileSizes, "sd_tiling_params_set_tile_sizes"},
{&TilingParamsSetRelSizes, "sd_tiling_params_set_rel_sizes"},
{&TilingParamsSetTargetOverlap, "sd_tiling_params_set_target_overlap"},
{&ImgGenParamsNew, "sd_img_gen_params_new"},
{&ImgGenParamsSetPrompts, "sd_img_gen_params_set_prompts"},
{&ImgGenParamsSetDimensions, "sd_img_gen_params_set_dimensions"},
{&ImgGenParamsSetSeed, "sd_img_gen_params_set_seed"},
{&ImgGenParamsGetVaeTilingParams, "sd_img_gen_params_get_vae_tiling_params"},
}
for _, lf := range libFuncs {
purego.RegisterLibFunc(lf.FuncPtr, gosd, lf.Name)
}
flag.Parse()
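The table-driven registration above leans on the fact that purego.RegisterLibFunc takes a pointer to a function variable (here carried through an any-typed field) and wires it to the dlopen'd symbol. A self-contained sketch of the same pattern against libm, with cos as a purely hypothetical example binding:

```go
package main

import "github.com/ebitengine/purego"

// Hypothetical example binding; this PR does the same for libgosd.so symbols.
var cos func(x float64) float64

func main() {
	lib, err := purego.Dlopen("libm.so.6", purego.RTLD_NOW|purego.RTLD_GLOBAL)
	if err != nil {
		panic(err)
	}
	libFuncs := []struct {
		FuncPtr any // must be a pointer to a func variable
		Name    string
	}{
		{&cos, "cos"},
	}
	for _, lf := range libFuncs {
		purego.RegisterLibFunc(lf.FuncPtr, lib, lf.Name)
	}
	println(cos(0)) // cos(0) == 1
}
```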


@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
# whisper.cpp version
WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
WHISPER_CPP_VERSION?=edea8a9c3cf0eb7676dcdb604991eb2f95c3d984
WHISPER_CPP_VERSION?=7849aff7a2e1f4234aa31b01a1870906d5431959
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF


@@ -270,6 +270,7 @@
nvidia: "cuda12-kokoro"
intel: "intel-kokoro"
amd: "rocm-kokoro"
nvidia-l4t: "nvidia-l4t-kokoro"
- &coqui
urls:
- https://github.com/idiap/coqui-ai-TTS
@@ -352,6 +353,7 @@
nvidia: "cuda12-chatterbox"
metal: "metal-chatterbox"
default: "cpu-chatterbox"
nvidia-l4t: "nvidia-l4t-arm64-chatterbox"
- &piper
name: "piper"
uri: "quay.io/go-skynet/local-ai-backends:latest-piper"
@@ -1049,6 +1051,7 @@
nvidia: "cuda12-kokoro-development"
intel: "intel-kokoro-development"
amd: "rocm-kokoro-development"
nvidia-l4t: "nvidia-l4t-kokoro-development"
- !!merge <<: *kokoro
name: "cuda11-kokoro-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-11-kokoro"
@@ -1074,6 +1077,16 @@
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-kokoro"
mirrors:
- localai/localai-backends:master-gpu-intel-kokoro
- !!merge <<: *kokoro
name: "nvidia-l4t-kokoro"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-l4t-kokoro"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-l4t-kokoro
- !!merge <<: *kokoro
name: "nvidia-l4t-kokoro-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-l4t-kokoro"
mirrors:
- localai/localai-backends:master-gpu-nvidia-l4t-kokoro
- !!merge <<: *kokoro
name: "cuda11-kokoro"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-11-kokoro"
@@ -1227,6 +1240,7 @@
nvidia: "cuda12-chatterbox-development"
metal: "metal-chatterbox-development"
default: "cpu-chatterbox-development"
nvidia-l4t: "nvidia-l4t-arm64-chatterbox"
- !!merge <<: *chatterbox
name: "cpu-chatterbox"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-chatterbox"
@@ -1237,6 +1251,16 @@
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-chatterbox"
mirrors:
- localai/localai-backends:master-cpu-chatterbox
- !!merge <<: *chatterbox
name: "nvidia-l4t-arm64-chatterbox"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-l4t-arm64-chatterbox"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-l4t-arm64-chatterbox
- !!merge <<: *chatterbox
name: "nvidia-l4t-arm64-chatterbox-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-l4t-arm64-chatterbox"
mirrors:
- localai/localai-backends:master-gpu-nvidia-l4t-arm64-chatterbox
- !!merge <<: *chatterbox
name: "metal-chatterbox"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-chatterbox"


@@ -1,4 +1,4 @@
bark==0.1.5
grpcio==1.74.0
grpcio==1.75.1
protobuf
certifi


@@ -14,9 +14,23 @@ import backend_pb2_grpc
import torch
import torchaudio as ta
from chatterbox.tts import ChatterboxTTS
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
import grpc
def is_float(s):
"""Check if a string can be converted to float."""
try:
float(s)
return True
except ValueError:
return False
def is_int(s):
"""Check if a string can be converted to int."""
try:
int(s)
return True
except ValueError:
return False
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -47,6 +61,28 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if not torch.cuda.is_available() and request.CUDA:
return backend_pb2.Result(success=False, message="CUDA is not available")
options = request.Options
# empty dict
self.options = {}
# The options are a list of strings in the form optname:optvalue
# We store all the options in a dict so we can use them later when
# generating audio
for opt in options:
if ":" not in opt:
continue
key, value = opt.split(":", 1)  # split only on the first colon so values may contain colons
# if value is a number, convert it to the appropriate type
if is_float(value):
value = float(value)
elif is_int(value):
value = int(value)
elif value.lower() in ["true", "false"]:
value = value.lower() == "true"
self.options[key] = value
self.AudioPath = None
if os.path.isabs(request.AudioPath):
@@ -56,10 +92,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
modelFileBase = os.path.dirname(request.ModelFile)
# make AudioPath relative to modelFileBase
self.AudioPath = os.path.join(modelFileBase, request.AudioPath)
try:
print("Preparing models, please wait", file=sys.stderr)
self.model = ChatterboxTTS.from_pretrained(device=device)
if "multilingual" in self.options:
# remove key from options
del self.options["multilingual"]
self.model = ChatterboxMultilingualTTS.from_pretrained(device=device)
else:
self.model = ChatterboxTTS.from_pretrained(device=device)
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
# Implement your logic here for the LoadModel service
@@ -68,12 +108,18 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
def TTS(self, request, context):
try:
# Generate audio using ChatterboxTTS
kwargs = {}
if "language" in self.options:
kwargs["language_id"] = self.options["language"]
if self.AudioPath is not None:
wav = self.model.generate(request.text, audio_prompt_path=self.AudioPath)
else:
wav = self.model.generate(request.text)
kwargs["audio_prompt_path"] = self.AudioPath
# add options to kwargs
kwargs.update(self.options)
# Generate audio using ChatterboxTTS
wav = self.model.generate(request.text, **kwargs)
# Save the generated audio
ta.save(request.dst, wav, self.model.sr)
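Since several backends in this PR grow the same optname:optvalue parsing (chatterbox here, plus the diffusers and MLX backends below), the coercion precedence deserves a note: the float check runs before the int check, and any int-parseable string also parses as a float, so numeric options now arrive as floats (e.g. "3" becomes 3.0). A small sketch of the equivalent rules in Go, with the function name being illustrative only:

```go
package optsketch

import (
	"strconv"
	"strings"
)

// coerceOption mirrors the precedence the Python backends use in this PR:
// float first, then int, then bool, otherwise the raw string. The int branch
// is effectively unreachable for ordinary numerals, since those already
// parse as floats -- it is kept only to mirror the Python shape.
func coerceOption(raw string) any {
	if f, err := strconv.ParseFloat(raw, 64); err == nil {
		return f
	}
	if i, err := strconv.Atoi(raw); err == nil {
		return i
	}
	switch strings.ToLower(raw) {
	case "true":
		return true
	case "false":
		return false
	}
	return raw
}
```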


@@ -15,5 +15,6 @@ fi
if [ "x${BUILD_PROFILE}" == "xintel" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
fi
EXTRA_PIP_INSTALL_FLAGS+=" --no-build-isolation"
installRequirements


@@ -1,6 +1,8 @@
--extra-index-url https://download.pytorch.org/whl/cpu
accelerate
torch==2.6.0
torchaudio==2.6.0
transformers==4.46.3
chatterbox-tts==0.1.2
torch
torchaudio
transformers
# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
#chatterbox-tts==0.1.4


@@ -2,5 +2,6 @@
torch==2.6.0+cu118
torchaudio==2.6.0+cu118
transformers==4.46.3
chatterbox-tts==0.1.2
# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
accelerate


@@ -1,5 +1,6 @@
torch==2.6.0
torchaudio==2.6.0
transformers==4.46.3
chatterbox-tts==0.1.2
torch
torchaudio
transformers
# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
accelerate


@@ -1,6 +1,7 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch==2.6.0+rocm6.1
torchaudio==2.6.0+rocm6.1
transformers==4.46.3
chatterbox-tts==0.1.2
transformers
# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
accelerate


@@ -2,8 +2,9 @@
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
torchaudio==2.3.1+cxx11.abi
transformers==4.46.3
chatterbox-tts==0.1.2
transformers
# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
accelerate
oneccl_bind_pt==2.3.100+xpu
optimum[openvino]


@@ -0,0 +1,6 @@
--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu126/
torch
torchaudio
transformers
chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
accelerate


@@ -1,3 +1,3 @@
grpcio==1.74.0
grpcio==1.75.1
protobuf
grpcio-tools


@@ -1,4 +1,4 @@
grpcio==1.74.0
grpcio==1.75.1
protobuf
certifi
packaging==24.1


@@ -66,11 +66,20 @@ from diffusers.schedulers import (
)
def is_float(s):
"""Check if a string can be converted to float."""
try:
float(s)
return True
except ValueError:
return False
def is_int(s):
"""Check if a string can be converted to int."""
try:
int(s)
return True
except ValueError:
return False
# The scheduler list mapping was taken from here: https://github.com/neggles/animatediff-cli/blob/6f336f5f4b5e38e85d7f06f1744ef42d0a45f2a7/src/animatediff/schedulers.py#L39
# Credits to https://github.com/neggles
@@ -177,10 +186,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
key, value = opt.split(":")
# if value is a number, convert it to the appropriate type
if is_float(value):
if value.is_integer():
value = int(value)
else:
value = float(value)
value = float(value)
elif is_int(value):
value = int(value)
elif value.lower() in ["true", "false"]:
value = value.lower() == "true"
self.options[key] = value
# From options, extract if present "torch_dtype" and set it to the appropriate type


@@ -1,5 +1,5 @@
setuptools
grpcio==1.74.0
grpcio==1.75.1
pillow
protobuf
certifi


@@ -1,4 +1,4 @@
grpcio==1.74.0
grpcio==1.75.1
protobuf
certifi
wheel


@@ -0,0 +1,7 @@
--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu126/
torch
torchaudio
transformers
accelerate
kokoro
soundfile


@@ -20,6 +20,21 @@ import soundfile as sf
import numpy as np
import uuid
def is_float(s):
"""Check if a string can be converted to float."""
try:
float(s)
return True
except ValueError:
return False
def is_int(s):
"""Check if a string can be converted to int."""
try:
int(s)
return True
except ValueError:
return False
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
@@ -32,14 +47,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
This backend provides TTS (Text-to-Speech) functionality using MLX-Audio.
"""
def _is_float(self, s):
"""Check if a string can be converted to float."""
try:
float(s)
return True
except ValueError:
return False
def Health(self, request, context):
"""
Returns a health check message.
@@ -80,11 +87,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
key, value = opt.split(":", 1) # Split only on first colon to handle values with colons
# Convert numeric values to appropriate types
if self._is_float(value):
if float(value).is_integer():
value = int(value)
else:
value = float(value)
if is_float(value):
value = float(value)
elif is_int(value):
value = int(value)
elif value.lower() in ["true", "false"]:
value = value.lower() == "true"


@@ -21,6 +21,21 @@ import io
from PIL import Image
import tempfile
def is_float(s):
"""Check if a string can be converted to float."""
try:
float(s)
return True
except ValueError:
return False
def is_int(s):
"""Check if a string can be converted to int."""
try:
int(s)
return True
except ValueError:
return False
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
@@ -32,14 +47,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
A gRPC servicer that implements the Backend service defined in backend.proto.
"""
def _is_float(self, s):
"""Check if a string can be converted to float."""
try:
float(s)
return True
except ValueError:
return False
def Health(self, request, context):
"""
Returns a health check message.
@@ -79,12 +86,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
continue
key, value = opt.split(":", 1) # Split only on first colon to handle values with colons
# Convert numeric values to appropriate types
if self._is_float(value):
if float(value).is_integer():
value = int(value)
else:
value = float(value)
if is_float(value):
value = float(value)
elif is_int(value):
value = int(value)
elif value.lower() in ["true", "false"]:
value = value.lower() == "true"


@@ -24,20 +24,27 @@ _ONE_DAY_IN_SECONDS = 60 * 60 * 24
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
def is_float(s):
"""Check if a string can be converted to float."""
try:
float(s)
return True
except ValueError:
return False
def is_int(s):
"""Check if a string can be converted to int."""
try:
int(s)
return True
except ValueError:
return False
# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
"""
A gRPC servicer that implements the Backend service defined in backend.proto.
"""
def _is_float(self, s):
"""Check if a string can be converted to float."""
try:
float(s)
return True
except ValueError:
return False
def Health(self, request, context):
"""
Returns a health check message.
@@ -78,11 +85,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
key, value = opt.split(":", 1) # Split only on first colon to handle values with colons
# Convert numeric values to appropriate types
if self._is_float(value):
if float(value).is_integer():
value = int(value)
else:
value = float(value)
if is_float(value):
value = float(value)
elif is_int(value):
value = int(value)
elif value.lower() in ["true", "false"]:
value = value.lower() == "true"


@@ -1,3 +1,3 @@
grpcio==1.74.0
grpcio==1.75.1
protobuf
certifi


@@ -1,4 +1,4 @@
grpcio==1.74.0
grpcio==1.75.1
protobuf==6.32.0
certifi
setuptools


@@ -1,4 +1,4 @@
grpcio==1.74.0
grpcio==1.75.1
protobuf
certifi
setuptools


@@ -836,27 +836,40 @@ var _ = Describe("API test", func() {
if runtime.GOOS != "linux" {
Skip("test supported only on linux")
}
embeddingModel := openai.AdaEmbeddingV2
resp, err := client.CreateEmbeddings(
context.Background(),
openai.EmbeddingRequest{
Model: openai.AdaEmbeddingV2,
Model: embeddingModel,
Input: []string{"sun", "cat"},
},
)
Expect(err).ToNot(HaveOccurred(), err)
Expect(len(resp.Data[0].Embedding)).To(BeNumerically("==", 2048))
Expect(len(resp.Data[1].Embedding)).To(BeNumerically("==", 2048))
Expect(len(resp.Data[0].Embedding)).To(BeNumerically("==", 4096))
Expect(len(resp.Data[1].Embedding)).To(BeNumerically("==", 4096))
sunEmbedding := resp.Data[0].Embedding
resp2, err := client.CreateEmbeddings(
context.Background(),
openai.EmbeddingRequest{
Model: openai.AdaEmbeddingV2,
Model: embeddingModel,
Input: []string{"sun"},
},
)
Expect(err).ToNot(HaveOccurred())
Expect(resp2.Data[0].Embedding).To(Equal(sunEmbedding))
Expect(resp2.Data[0].Embedding).ToNot(Equal(resp.Data[1].Embedding))
resp3, err := client.CreateEmbeddings(
context.Background(),
openai.EmbeddingRequest{
Model: embeddingModel,
Input: []string{"cat"},
},
)
Expect(err).ToNot(HaveOccurred())
Expect(resp3.Data[0].Embedding).To(Equal(resp.Data[1].Embedding))
Expect(resp3.Data[0].Embedding).ToNot(Equal(sunEmbedding))
})
Context("External gRPC calls", func() {


@@ -398,9 +398,9 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
}
finishReason := "stop"
if toolsCalled {
if toolsCalled && len(input.Tools) > 0 {
finishReason = "tool_calls"
} else if toolsCalled && len(input.Tools) == 0 {
} else if toolsCalled {
finishReason = "function_call"
}
@@ -443,11 +443,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
log.Debug().Msgf("Text content to return: %s", textContentToReturn)
noActionsToRun := len(results) > 0 && results[0].Name == noActionName || len(results) == 0
finishReason := "stop"
if len(input.Tools) > 0 {
finishReason = "tool_calls"
}
switch {
case noActionsToRun:
result, err := handleQuestion(config, cl, input, ml, startupOptions, results, s, predInput)
@@ -457,11 +452,11 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
}
*c = append(*c, schema.Choice{
FinishReason: finishReason,
FinishReason: "stop",
Message: &schema.Message{Role: "assistant", Content: &result}})
default:
toolChoice := schema.Choice{
FinishReason: finishReason,
FinishReason: "tool_calls",
Message: &schema.Message{
Role: "assistant",
},
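The net effect of these hunks: finish_reason is "tool_calls" only when the request actually supplied tools, "function_call" when tool output came through the legacy functions API, and "stop" otherwise, with the non-streaming branches hard-coding the value per case. A compact sketch of that decision as a hypothetical helper (the endpoint inlines this logic rather than calling a function like this):

```go
package chatsketch

// finishReason summarizes the selection after this change; hypothetical
// helper, not a function in the codebase.
func finishReason(toolsCalled bool, numTools int) string {
	switch {
	case toolsCalled && numTools > 0:
		return "tool_calls" // OpenAI tools API in use
	case toolsCalled:
		return "function_call" // legacy functions API
	default:
		return "stop"
	}
}
```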


@@ -9,5 +9,5 @@ import (
func TestLocalAI(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "LocalAI test suite")
RunSpecs(t, "LocalAI HTTP test suite")
}


@@ -182,7 +182,7 @@ MODEL_NAME=gemma-3-12b-it docker compose up
# NVIDIA GPU setup with custom multimodal and image models
MODEL_NAME=gemma-3-12b-it \
MULTIMODAL_MODEL=minicpm-v-2_6 \
MULTIMODAL_MODEL=minicpm-v-4_5 \
IMAGE_MODEL=flux.1-dev-ggml \
docker compose -f docker-compose.nvidia.yaml up
```


@@ -1,3 +1,3 @@
{
"version": "v3.5.0"
"version": "v3.5.4"
}

gallery/granite4.yaml (new file, 48 lines)

@@ -0,0 +1,48 @@
---
name: "granite-3.2"
config_file: |
backend: "llama-cpp"
mmap: true
template:
chat_message: |
<|start_of_role|>{{ .RoleName }}<|end_of_role|>
{{ if .FunctionCall -}}
<tool_call>
{{ else if eq .RoleName "tool" -}}
<tool_response>
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if eq .RoleName "tool" -}}
</tool_response>
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
</tool_call>
{{ end -}}
<|end_of_text|>
function: |
<|start_of_role|>system<|end_of_role|>
You are a helpful AI assistant with access to the following tools. When a tool is required to answer the user's query, respond with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.
Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
{{.Input -}}
<|start_of_role|>assistant<|end_of_role|>
chat: |
{{.Input -}}
<|start_of_role|>assistant<|end_of_role|>
completion: |
{{.Input}}
context_size: 8192
f16: true
stopwords:
- '<|im_end|>'
- '<dummy32000>'
- '</s>'
- '<|end_of_text|>'


@@ -1,4 +1,94 @@
---
- &granite4
url: "github:mudler/LocalAI/gallery/granite4.yaml@master"
name: "ibm-granite_granite-4.0-h-small"
license: apache-2.0
icon: https://cdn-avatars.huggingface.co/v1/production/uploads/639bcaa2445b133a4e942436/CEW-OjXkRkDNmTxSu8Egh.png
tags:
- gguf
- GPU
- CPU
- text-to-text
urls:
- https://huggingface.co/ibm-granite/granite-4.0-h-small
- https://huggingface.co/bartowski/ibm-granite_granite-4.0-h-small-GGUF
description: |
Granite-4.0-H-Small is a 32B parameter long-context instruct model finetuned from Granite-4.0-H-Small-Base using a combination of open source instruction datasets with permissive license and internally collected synthetic datasets. This model is developed using a diverse set of techniques with a structured chat format, including supervised finetuning, model alignment using reinforcement learning, and model merging. Granite 4.0 instruct models feature improved instruction following (IF) and tool-calling capabilities, making them more effective in enterprise applications.
overrides:
parameters:
model: ibm-granite_granite-4.0-h-small-Q4_K_M.gguf
files:
- filename: ibm-granite_granite-4.0-h-small-Q4_K_M.gguf
sha256: c59ce76239bd5794acdbdf88616dfc296247f4e78792a9678d4b3e24966ead69
uri: huggingface://bartowski/ibm-granite_granite-4.0-h-small-GGUF/ibm-granite_granite-4.0-h-small-Q4_K_M.gguf
- !!merge <<: *granite4
name: "ibm-granite_granite-4.0-h-tiny"
urls:
- https://huggingface.co/ibm-granite/granite-4.0-h-tiny
- https://huggingface.co/bartowski/ibm-granite_granite-4.0-h-tiny-GGUF
description: |
Granite-4.0-H-Tiny is a 7B parameter long-context instruct model finetuned from Granite-4.0-H-Tiny-Base using a combination of open source instruction datasets with permissive license and internally collected synthetic datasets. This model is developed using a diverse set of techniques with a structured chat format, including supervised finetuning, model alignment using reinforcement learning, and model merging. Granite 4.0 instruct models feature improved instruction following (IF) and tool-calling capabilities, making them more effective in enterprise applications.
overrides:
parameters:
model: ibm-granite_granite-4.0-h-tiny-Q4_K_M.gguf
files:
- filename: ibm-granite_granite-4.0-h-tiny-Q4_K_M.gguf
sha256: 33a689fe7f35b14ebab3ae599b65aaa3ed8548c393373b1b0eebee36c653146f
uri: huggingface://bartowski/ibm-granite_granite-4.0-h-tiny-GGUF/ibm-granite_granite-4.0-h-tiny-Q4_K_M.gguf
- !!merge <<: *granite4
name: "ibm-granite_granite-4.0-h-micro"
urls:
- https://huggingface.co/ibm-granite/granite-4.0-h-micro
- https://huggingface.co/bartowski/ibm-granite_granite-4.0-h-micro-GGUF
description: |
Granite-4.0-H-Micro is a 3B parameter long-context instruct model finetuned from Granite-4.0-H-Micro-Base using a combination of open source instruction datasets with permissive license and internally collected synthetic datasets. This model is developed using a diverse set of techniques with a structured chat format, including supervised finetuning, model alignment using reinforcement learning, and model merging. Granite 4.0 instruct models feature improved instruction following (IF) and tool-calling capabilities, making them more effective in enterprise applications.
overrides:
parameters:
model: ibm-granite_granite-4.0-h-micro-Q4_K_M.gguf
files:
- filename: ibm-granite_granite-4.0-h-micro-Q4_K_M.gguf
sha256: 48376d61449687a56b3811a418d92cc0e8e77b4d96ec13eb6c9d9503968c9f20
uri: huggingface://bartowski/ibm-granite_granite-4.0-h-micro-GGUF/ibm-granite_granite-4.0-h-micro-Q4_K_M.gguf
- !!merge <<: *granite4
name: "ibm-granite_granite-4.0-micro"
urls:
- https://huggingface.co/ibm-granite/granite-4.0-micro
- https://huggingface.co/bartowski/ibm-granite_granite-4.0-micro-GGUF
description: |
Granite-4.0-Micro is a 3B parameter long-context instruct model finetuned from Granite-4.0-Micro-Base using a combination of open source instruction datasets with permissive license and internally collected synthetic datasets. This model is developed using a diverse set of techniques with a structured chat format, including supervised finetuning, model alignment using reinforcement learning, and model merging. Granite 4.0 instruct models feature improved instruction following (IF) and tool-calling capabilities, making them more effective in enterprise applications.
overrides:
parameters:
model: ibm-granite_granite-4.0-micro-Q4_K_M.gguf
files:
- filename: ibm-granite_granite-4.0-micro-Q4_K_M.gguf
sha256: bd9d7b4795b9dc44e3e81aeae93bb5d8e6b891b7e823be5bf9910ed3ac060baf
uri: huggingface://bartowski/ibm-granite_granite-4.0-micro-GGUF/ibm-granite_granite-4.0-micro-Q4_K_M.gguf
- &ernie
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
name: "baidu_ernie-4.5-21b-a3b-thinking"
license: apache-2.0
tags:
- gguf
- GPU
- CPU
- text-to-text
icon: https://cdn-avatars.huggingface.co/v1/production/uploads/64f187a2cc1c03340ac30498/TYYUxK8xD1AxExFMWqbZD.png
urls:
- https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking
- https://huggingface.co/bartowski/baidu_ERNIE-4.5-21B-A3B-Thinking-GGUF
description: |
Over the past three months, we have continued to scale the thinking capability of ERNIE-4.5-21B-A3B, improving both the quality and depth of reasoning, thereby advancing the competitiveness of ERNIE lightweight models in complex reasoning tasks. We are pleased to introduce ERNIE-4.5-21B-A3B-Thinking, featuring the following key enhancements:
Significantly improved performance on reasoning tasks, including logical reasoning, mathematics, science, coding, text generation, and academic benchmarks that typically require human expertise.
Efficient tool usage capabilities.
Enhanced 128K long-context understanding capabilities.
Note: This version has an increased thinking length. We strongly recommend its use in highly complex reasoning tasks. ERNIE-4.5-21B-A3B-Thinking is a text MoE post-trained model, with 21B total parameters and 3B activated parameters for each token.
overrides:
parameters:
model: baidu_ERNIE-4.5-21B-A3B-Thinking-Q4_K_M.gguf
files:
- filename: baidu_ERNIE-4.5-21B-A3B-Thinking-Q4_K_M.gguf
sha256: f309f225c413324c585e74ce28c55e76dec25340156374551d39707fc2966840
uri: huggingface://bartowski/baidu_ERNIE-4.5-21B-A3B-Thinking-GGUF/baidu_ERNIE-4.5-21B-A3B-Thinking-Q4_K_M.gguf
- &mimo
license: mit
tags:
@@ -309,7 +399,7 @@
url: "github:mudler/LocalAI/gallery/qwen-image.yaml@master"
urls:
- https://huggingface.co/Qwen/Qwen-Image-Edit
icon: https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/qwen_image_logo.png
icon: https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/qwen_image_edit_logo.png
license: apache-2.0
tags:
- qwen-image
@@ -324,6 +414,26 @@
cuda: true
pipeline_type: QwenImageEditPipeline
enable_parameters: num_inference_steps,image
- !!merge <<: *qwenimage
name: "qwen-image-edit-2509"
url: "github:mudler/LocalAI/gallery/qwen-image.yaml@master"
urls:
- https://huggingface.co/Qwen/Qwen-Image-Edit-2509
icon: https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/qwen_image_edit_logo.png
license: apache-2.0
tags:
- qwen-image
- gpu
- image-to-image
description: |
Qwen-Image-Edit is a model for image editing, which is based on Qwen-Image.
overrides:
parameters:
model: Qwen/Qwen-Image-Edit-2509
diffusers:
cuda: true
pipeline_type: QwenImageEditPipeline
enable_parameters: num_inference_steps,image
- &gptoss
name: "gpt-oss-20b"
url: "github:mudler/LocalAI/gallery/harmony.yaml@master"
@@ -2544,6 +2654,107 @@
- filename: minicpm-v-4_5-mmproj-f16.gguf
uri: huggingface://openbmb/MiniCPM-V-4_5-gguf/mmproj-model-f16.gguf
sha256: 7a7225a32e8d453aaa3d22d8c579b5bf833c253f784cdb05c99c9a76fd616df8
- !!merge <<: *qwen3
name: "aquif-ai_aquif-3.5-8b-think"
urls:
- https://huggingface.co/aquif-ai/aquif-3.5-8B-Think
- https://huggingface.co/bartowski/aquif-ai_aquif-3.5-8B-Think-GGUF
description: |
The aquif-3.5 series is the successor to aquif-3, featuring a simplified naming scheme, expanded Mixture of Experts (MoE) options, and across-the-board performance improvements. This release streamlines model selection while delivering enhanced capabilities across reasoning, multilingual support, and general intelligence tasks.
An experimental small-scale Mixture of Experts model designed for multilingual applications with minimal computational overhead. Despite its compact active parameter count, it demonstrates competitive performance against larger dense models.
overrides:
parameters:
model: aquif-ai_aquif-3.5-8B-Think-Q4_K_M.gguf
files:
- filename: aquif-ai_aquif-3.5-8B-Think-Q4_K_M.gguf
sha256: 9e49b9c840de23bb3eb181ba7a102706c120b3e3d006983c3f14ebae307ff02e
uri: huggingface://bartowski/aquif-ai_aquif-3.5-8B-Think-GGUF/aquif-ai_aquif-3.5-8B-Think-Q4_K_M.gguf
- !!merge <<: *qwen3
name: "qwen3-stargate-sg1-uncensored-abliterated-8b-i1"
icon: https://huggingface.co/DavidAU/Qwen3-Stargate-SG1-Uncensored-Abliterated-8B/resolve/main/sg1.jpg
urls:
- https://huggingface.co/DavidAU/Qwen3-Stargate-SG1-Uncensored-Abliterated-8B
- https://huggingface.co/mradermacher/Qwen3-Stargate-SG1-Uncensored-Abliterated-8B-i1-GGUF
description: |
This repo contains the full precision source code, in "safe tensors" format to generate GGUFs, GPTQ, EXL2, AWQ, HQQ and other formats. The source code can also be used directly.
This model is specifically for SG1 (Stargate Series), science fiction, story generation (all genres) but also does coding and general tasks too.
This model can also be used for Role play.
This model will produce uncensored content (see notes below).
Fine-tuned (6 epochs, using Unsloth on Windows 11) on an in-house generated dataset to simulate / explore the Stargate SG1 universe.
This version has the "canon" of all 10 seasons of SG1.
The model also contains, but was not trained on, content from Stargate Atlantis and Universe.
The fine-tune process adds knowledge to the model and alters all aspects of its operation.
Float32 (32 bit precision) was used to further increase the model's quality.
This model is based on "Goekdeniz-Guelmez/Josiefied-Qwen3-8B-abliterated-v1".
Example generations at the bottom of this page.
This is a Stargate (SG1) fine-tune (1,331,953,664 of 9,522,689,024 parameters, 13.99% trained), run for SIX epochs on this model.
As this is an instruct model, it will also benefit from a detailed system prompt.
overrides:
parameters:
model: Qwen3-Stargate-SG1-Uncensored-Abliterated-8B.i1-Q4_K_M.gguf
files:
- filename: Qwen3-Stargate-SG1-Uncensored-Abliterated-8B.i1-Q4_K_M.gguf
sha256: 31ec697ccebbd7928c49714b8a0ec8be747be0f7c1ad71627967d2f8fe376990
uri: huggingface://mradermacher/Qwen3-Stargate-SG1-Uncensored-Abliterated-8B-i1-GGUF/Qwen3-Stargate-SG1-Uncensored-Abliterated-8B.i1-Q4_K_M.gguf
- !!merge <<: *qwen3
url: "github:mudler/LocalAI/gallery/qwen3-deepresearch.yaml@master"
name: "alibaba-nlp_tongyi-deepresearch-30b-a3b"
urls:
- https://huggingface.co/Alibaba-NLP/Tongyi-DeepResearch-30B-A3B
- https://huggingface.co/bartowski/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-GGUF
description: |
We present Tongyi DeepResearch, an agentic large language model featuring 30 billion total parameters, with only 3 billion activated per token. Developed by Tongyi Lab, the model is specifically designed for long-horizon, deep information-seeking tasks. Tongyi-DeepResearch demonstrates state-of-the-art performance across a range of agentic search benchmarks, including Humanity's Last Exam, BrowserComp, BrowserComp-ZH, WebWalkerQA, GAIA, xbench-DeepSearch and FRAMES.
overrides:
parameters:
model: Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf
files:
- filename: Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf
sha256: 1afefb3b369ea2de191f24fe8ea22cbbb7b412357902f27bd81d693dde35c2d9
uri: huggingface://bartowski/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-GGUF/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf
- !!merge <<: *qwen3
name: "impish_qwen_14b-1m"
icon: https://huggingface.co/SicariusSicariiStuff/Impish_QWEN_14B-1M/resolve/main/Images/Impish_Qwen_14B.png
urls:
- https://huggingface.co/SicariusSicariiStuff/Impish_QWEN_14B-1M
- https://huggingface.co/mradermacher/Impish_QWEN_14B-1M-GGUF
description: |
Supreme context: one million tokens to play with.
Strong roleplay; internet RP format lovers will appreciate it, with medium-size paragraphs.
Qwen smarts built-in, but naughty and playful. Maybe it's even too naughty.
VERY compliant, with low censorship.
VERY high IFEval for a 14B RP model: 78.68.
overrides:
parameters:
model: Impish_QWEN_14B-1M.Q4_K_M.gguf
files:
- filename: Impish_QWEN_14B-1M.Q4_K_M.gguf
sha256: d326f2b8f05814ea3943c82498f0cd3cde64859cf03f532855c87fb94b0da79e
uri: huggingface://mradermacher/Impish_QWEN_14B-1M-GGUF/Impish_QWEN_14B-1M.Q4_K_M.gguf
- !!merge <<: *qwen3
name: "aquif-3.5-a4b-think"
urls:
- https://huggingface.co/aquif-ai/aquif-3.5-A4B-Think
- https://huggingface.co/QuantFactory/aquif-3.5-A4B-Think-GGUF
description: |
The aquif-3.5 series is the successor to aquif-3, featuring a simplified naming scheme, expanded Mixture of Experts (MoE) options, and across-the-board performance improvements. This release streamlines model selection while delivering enhanced capabilities across reasoning, multilingual support, and general intelligence tasks.
overrides:
parameters:
model: aquif-3.5-A4B-Think.Q4_K_M.gguf
files:
- filename: aquif-3.5-A4B-Think.Q4_K_M.gguf
sha256: 1650b72ae1acf12b45a702f2ff5f47205552e494f0d910e81cbe40dfba55a6b9
uri: huggingface://QuantFactory/aquif-3.5-A4B-Think-GGUF/aquif-3.5-A4B-Think.Q4_K_M.gguf
- &gemma3
url: "github:mudler/LocalAI/gallery/gemma.yaml@master"
name: "gemma-3-27b-it"
@@ -7485,6 +7696,40 @@
- filename: Qwentile2.5-32B-Instruct-Q4_K_M.gguf
sha256: e476d6e3c15c78fc3f986d7ae8fa35c16116843827f2e6243c05767cef2f3615
uri: huggingface://bartowski/Qwentile2.5-32B-Instruct-GGUF/Qwentile2.5-32B-Instruct-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "websailor-32b"
urls:
- https://huggingface.co/Alibaba-NLP/WebSailor-32B
- https://huggingface.co/mradermacher/WebSailor-32B-GGUF
description: |
WebSailor is a complete post-training methodology designed to teach LLM agents sophisticated reasoning for complex web navigation and information-seeking tasks. It addresses the challenge of extreme uncertainty in vast information landscapes, a capability where previous open-source models lagged behind proprietary systems.
We classify information-seeking tasks into three difficulty levels, where Level 3 represents problems with both high uncertainty and a complex, non-linear path to a solution. To generate these challenging tasks, we introduce SailorFog-QA, a novel data synthesis pipeline that constructs intricate knowledge graphs and then applies information obfuscation. This process creates questions with high initial uncertainty that demand creative exploration and transcend simple, structured reasoning patterns.
Our training process begins by generating expert trajectories and then reconstructing the reasoning to create concise, action-oriented supervision signals, avoiding the stylistic and verbosity issues of teacher models. The agent is first given a "cold start" using rejection sampling fine-tuning (RFT) on a small set of high-quality examples to establish a baseline capability. This is followed by an efficient agentic reinforcement learning stage using our Duplicating Sampling Policy Optimization (DUPO) algorithm, which refines the agent's exploratory strategies.
WebSailor establishes a new state-of-the-art for open-source agents, achieving outstanding results on difficult benchmarks like BrowseComp-en and BrowseComp-zh. Notably, our smaller models like WebSailor-7B outperform agents built on much larger backbones, highlighting the efficacy of our training paradigm. Ultimately, WebSailor closes the performance gap to proprietary systems, achieving results on par with agents like Doubao-Search.
overrides:
parameters:
model: WebSailor-32B.Q4_K_M.gguf
files:
- filename: WebSailor-32B.Q4_K_M.gguf
sha256: 60cea732b8314cedf1807530857b4ebd9f6c41431b3223384eb7f94fbff7b5bc
uri: huggingface://mradermacher/WebSailor-32B-GGUF/WebSailor-32B.Q4_K_M.gguf
- !!merge <<: *qwen25
name: "websailor-7b"
urls:
- https://huggingface.co/Alibaba-NLP/WebSailor-7B
- https://huggingface.co/mradermacher/WebSailor-7B-GGUF
description: |
WebSailor is a complete post-training methodology designed to teach LLM agents sophisticated reasoning for complex web navigation and information-seeking tasks. It addresses the challenge of extreme uncertainty in vast information landscapes, a capability where previous open-source models lagged behind proprietary systems.
We classify information-seeking tasks into three difficulty levels, where Level 3 represents problems with both high uncertainty and a complex, non-linear path to a solution. To generate these challenging tasks, we introduce SailorFog-QA, a novel data synthesis pipeline that constructs intricate knowledge graphs and then applies information obfuscation. This process creates questions with high initial uncertainty that demand creative exploration and transcend simple, structured reasoning patterns.
Our training process begins by generating expert trajectories and then reconstructing the reasoning to create concise, action-oriented supervision signals, avoiding the stylistic and verbosity issues of teacher models. The agent is first given a "cold start" using rejection sampling fine-tuning (RFT) on a small set of high-quality examples to establish a baseline capability. This is followed by an efficient agentic reinforcement learning stage using our Duplicating Sampling Policy Optimization (DUPO) algorithm, which refines the agent's exploratory strategies.
WebSailor establishes a new state-of-the-art for open-source agents, achieving outstanding results on difficult benchmarks like BrowseComp-en and BrowseComp-zh. Notably, our smaller models like WebSailor-7B outperform agents built on much larger backbones, highlighting the efficacy of our training paradigm. Ultimately, WebSailor closes the performance gap to proprietary systems, achieving results on par with agents like Doubao-Search.
overrides:
parameters:
model: WebSailor-7B.Q4_K_M.gguf
files:
- filename: WebSailor-7B.Q4_K_M.gguf
sha256: 6ede884af5d82176606c3af19a5cc90da6fdf81a520f54284084f5e012217a56
uri: huggingface://mradermacher/WebSailor-7B-GGUF/WebSailor-7B.Q4_K_M.gguf
- &archfunct
license: apache-2.0
tags:
@@ -9884,6 +10129,119 @@
- filename: baichuan-inc_Baichuan-M2-32B-Q4_K_M.gguf
sha256: 51907419518e6f79c28f75e4097518e54c2efecd85cb4c714334395fa2d591c2
uri: huggingface://bartowski/baichuan-inc_Baichuan-M2-32B-GGUF/baichuan-inc_Baichuan-M2-32B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "k2-think-i1"
icon: https://huggingface.co/LLM360/K2-Think/resolve/main/banner.png
urls:
- https://huggingface.co/LLM360/K2-Think
- https://huggingface.co/mradermacher/K2-Think-i1-GGUF
description: |
K2-Think is a 32 billion parameter open-weights general reasoning model with strong performance in competitive mathematical problem solving.
overrides:
parameters:
model: K2-Think.i1-Q4_K_M.gguf
files:
- filename: K2-Think.i1-Q4_K_M.gguf
sha256: 510fad18b0cf58059437338c1b5b982996ef89456a8d88da52eb3d50fe78b9fd
uri: huggingface://mradermacher/K2-Think-i1-GGUF/K2-Think.i1-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "holo1.5-72b"
icon: https://cdn-avatars.huggingface.co/v1/production/uploads/677d3f355f847864bb644112/OQyAJ33sssiTDIQEQ7oH_.png
urls:
- https://huggingface.co/Hcompany/Holo1.5-72B
- https://huggingface.co/mradermacher/Holo1.5-72B-GGUF
description: |
Computer Use (CU) agents are AI systems that can interact with real applications—web, desktop, and mobile—on behalf of a user. They can navigate interfaces, manipulate elements, and answer questions about content, enabling powerful automation and productivity tools. CU agents are becoming increasingly important as they allow humans to delegate complex digital tasks safely and efficiently.
The Holo1.5 series provides state-of-the-art foundational models for building such agents. Holo1.5 models excel at user interface (UI) localization and UI-based question answering (QA) across web, computer, and mobile environments, with strong performance on benchmarks including Screenspot-V2, Screenspot-Pro, GroundUI-Web, Showdown, and our newly introduced WebClick.
overrides:
mmproj: Holo1.5-72B.mmproj-Q8_0.gguf
parameters:
model: Holo1.5-72B.Q4_K_M.gguf
files:
- filename: Holo1.5-72B.Q4_K_M.gguf
sha256: 3404347c245fefa352a3dc16134b5870f594ab8bff11e50582205b5538201a23
uri: huggingface://mradermacher/Holo1.5-72B-GGUF/Holo1.5-72B.Q4_K_M.gguf
- filename: Holo1.5-72B.mmproj-Q8_0.gguf
sha256: f172cffc96a00d4f885eecffbc798912d37105f4191ba16a9947a5776b0f8a02
uri: huggingface://mradermacher/Holo1.5-72B-GGUF/Holo1.5-72B.mmproj-Q8_0.gguf
- !!merge <<: *qwen25
name: "holo1.5-7b"
icon: https://cdn-avatars.huggingface.co/v1/production/uploads/677d3f355f847864bb644112/OQyAJ33sssiTDIQEQ7oH_.png
urls:
- https://huggingface.co/Hcompany/Holo1.5-7B
- https://huggingface.co/mradermacher/Holo1.5-7B-GGUF
description: |
Computer Use (CU) agents are AI systems that can interact with real applications—web, desktop, and mobile—on behalf of a user. They can navigate interfaces, manipulate elements, and answer questions about content, enabling powerful automation and productivity tools. CU agents are becoming increasingly important as they allow humans to delegate complex digital tasks safely and efficiently.
The Holo1.5 series provides state-of-the-art foundational models for building such agents. Holo1.5 models excel at user interface (UI) localization and UI-based question answering (QA) across web, computer, and mobile environments, with strong performance on benchmarks including Screenspot-V2, Screenspot-Pro, GroundUI-Web, Showdown, and our newly introduced WebClick.
overrides:
mmproj: Holo1.5-7B.mmproj-Q8_0.gguf
parameters:
model: Holo1.5-7B.Q4_K_M.gguf
files:
- filename: Holo1.5-7B.Q4_K_M.gguf
sha256: 37d1c060b73b783ffdab8d70fa47a6cff46cd34b1cf44b5bfbf4f20ff99eacdd
uri: huggingface://mradermacher/Holo1.5-7B-GGUF/Holo1.5-7B.Q4_K_M.gguf
- filename: Holo1.5-7B.mmproj-Q8_0.gguf
sha256: a9bad2d3d9241251b8753d9be4ea737c03197077d96153c1365a62db709489f6
uri: huggingface://mradermacher/Holo1.5-7B-GGUF/Holo1.5-7B.mmproj-Q8_0.gguf
- !!merge <<: *qwen25
name: "holo1.5-3b"
icon: https://cdn-avatars.huggingface.co/v1/production/uploads/677d3f355f847864bb644112/OQyAJ33sssiTDIQEQ7oH_.png
urls:
- https://huggingface.co/Hcompany/Holo1.5-3B
- https://huggingface.co/mradermacher/Holo1.5-3B-GGUF
description: |
Computer Use (CU) agents are AI systems that can interact with real applications—web, desktop, and mobile—on behalf of a user. They can navigate interfaces, manipulate elements, and answer questions about content, enabling powerful automation and productivity tools. CU agents are becoming increasingly important as they allow humans to delegate complex digital tasks safely and efficiently.
The Holo1.5 series provides state-of-the-art foundational models for building such agents. Holo1.5 models excel at user interface (UI) localization and UI-based question answering (QA) across web, computer, and mobile environments, with strong performance on benchmarks including Screenspot-V2, Screenspot-Pro, GroundUI-Web, Showdown, and our newly introduced WebClick.
overrides:
mmproj: Holo1.5-3B.mmproj-Q8_0.gguf
parameters:
model: Holo1.5-3B.Q4_K_M.gguf
files:
- filename: Holo1.5-3B.Q4_K_M.gguf
sha256: 5efb1318d439fe1f71e38825a17203c48ced7de4a5d0796427c8c638e817622a
uri: huggingface://mradermacher/Holo1.5-3B-GGUF/Holo1.5-3B.Q4_K_M.gguf
- filename: Holo1.5-3B.mmproj-Q8_0.gguf
sha256: fb5cc798b386a4b680c306f061457cb16cc627c7d9ed401d660b8b940463142b
uri: huggingface://mradermacher/Holo1.5-3B-GGUF/Holo1.5-3B.mmproj-Q8_0.gguf
- !!merge <<: *qwen25
name: "webwatcher-7b"
icon: https://huggingface.co/Alibaba-NLP/WebWatcher-7B/resolve/main/assets/webwatcher_logo.png
urls:
- https://huggingface.co/Alibaba-NLP/WebWatcher-7B
- https://huggingface.co/mradermacher/WebWatcher-7B-GGUF
description: |
WebWatcher is a multimodal agent for deep research that possesses enhanced visual-language reasoning capabilities. Our work presents a unified framework that combines complex vision-language reasoning with multi-tool interaction.
overrides:
mmproj: WebWatcher-7B.mmproj-Q8_0.gguf
parameters:
model: WebWatcher-7B.Q4_K_M.gguf
files:
- filename: WebWatcher-7B.Q4_K_M.gguf
sha256: 300c76a51de59552f997ee7ee78ec519620931dea15c655111633b96de1a47f2
uri: huggingface://mradermacher/WebWatcher-7B-GGUF/WebWatcher-7B.Q4_K_M.gguf
- filename: WebWatcher-7B.mmproj-Q8_0.gguf
sha256: 841dc1bcc4f69ca864518d2c9a9a37b1815169d9bd061b054e091061124e4e62
uri: huggingface://mradermacher/WebWatcher-7B-GGUF/WebWatcher-7B.mmproj-Q8_0.gguf
- !!merge <<: *qwen25
name: "webwatcher-32b"
icon: https://huggingface.co/Alibaba-NLP/WebWatcher-32B/resolve/main/assets/webwatcher_logo.png
urls:
- https://huggingface.co/Alibaba-NLP/WebWatcher-32B
- https://huggingface.co/mradermacher/WebWatcher-32B-GGUF
description: |
WebWatcher is a multimodal agent for deep research that possesses enhanced visual-language reasoning capabilities. Our work presents a unified framework that combines complex vision-language reasoning with multi-tool interaction.
overrides:
mmproj: WebWatcher-32B.mmproj-Q8_0.gguf
parameters:
model: WebWatcher-32B.Q4_K_M.gguf
files:
- filename: WebWatcher-32B.Q4_K_M.gguf
sha256: 6cd51d97b9451759a4ce4ec0c2048b36ff99fd9f83bb32cd9f06af6c5438c69b
uri: huggingface://mradermacher/WebWatcher-32B-GGUF/WebWatcher-32B.Q4_K_M.gguf
- filename: WebWatcher-32B.mmproj-Q8_0.gguf
sha256: e8815515f71a959465cc62e08e0ef45d7d8592215139b34efece848552cb2327
uri: huggingface://mradermacher/WebWatcher-32B-GGUF/WebWatcher-32B.mmproj-Q8_0.gguf
- &llama31
url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1
icon: https://avatars.githubusercontent.com/u/153379578
@@ -14934,6 +15292,27 @@
- filename: Impish_Longtail_12B-Q4_K_M.gguf
sha256: 2cf0cacb65d71cfc5b4255f3273ad245bbcb11956a0f9e3aaa0e739df57c90df
uri: huggingface://SicariusSicariiStuff/Impish_Longtail_12B_GGUF/Impish_Longtail_12B-Q4_K_M.gguf
- !!merge <<: *mistral03
name: "mistralai_magistral-small-2509"
urls:
- https://huggingface.co/mistralai/Magistral-Small-2509
- https://huggingface.co/bartowski/mistralai_Magistral-Small-2509-GGUF
description: |
Magistral Small 1.2
Building upon Mistral Small 3.2 (2506), with added reasoning capabilities, undergoing SFT from Magistral Medium traces and RL on top, it's a small, efficient reasoning model with 24B parameters.
Magistral Small can be deployed locally, fitting within a single RTX 4090 or a 32GB RAM MacBook once quantized.
Learn more about Magistral in our blog post.
The model was presented in the paper Magistral.
overrides:
parameters:
model: mistralai_Magistral-Small-2509-Q4_K_M.gguf
files:
- filename: mistralai_Magistral-Small-2509-Q4_K_M.gguf
sha256: 1d638bc931de30d29fc73ad439206ff185f76666a096e7ad723866a20f78728d
uri: huggingface://bartowski/mistralai_Magistral-Small-2509-GGUF/mistralai_Magistral-Small-2509-Q4_K_M.gguf
- &mudler
url: "github:mudler/LocalAI/gallery/mudler.yaml@master" ### START mudler's LocalAI specific-models
name: "LocalAI-llama3-8b-function-call-v0.2"
@@ -20095,9 +20474,9 @@
- https://huggingface.co/ggerganov/whisper.cpp
overrides:
parameters:
model: ggml-whisper-base.bin
model: ggml-base.bin
files:
- filename: "ggml-whisper-base.bin"
- filename: "ggml-base.bin"
sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"
description: |
@@ -20142,11 +20521,20 @@
name: "whisper-large-q5_0"
overrides:
parameters:
model: ggml-large-q5_0.bin
model: ggml-large-v3-q5_0.bin
files:
- filename: "ggml-large-q5_0.bin"
uri: "huggingface://ggerganov/whisper.cpp/ggml-large-q5_0.bin"
sha256: 3a214837221e4530dbc1fe8d734f302af393eb30bd0ed046042ebf4baf70f6f2
- filename: "ggml-large-v3-q5_0.bin"
uri: "huggingface://ggerganov/whisper.cpp/ggml-large-v3-q5_0.bin"
sha256: d75795ecff3f83b5faa89d1900604ad8c780abd5739fae406de19f23ecd98ad1
- !!merge <<: *whisper
name: "whisper-medium"
overrides:
parameters:
model: ggml-medium.bin
files:
- filename: "ggml-medium.bin"
uri: "huggingface://ggerganov/whisper.cpp/ggml-medium.bin"
sha256: 6c14d5adee5f86394037b4e4e8b59f1673b6cee10e3cf0b11bbdbee79c156208
- !!merge <<: *whisper
name: "whisper-medium-q5_0"
overrides:
@@ -20174,15 +20562,6 @@
- filename: "ggml-small.bin"
uri: "huggingface://ggerganov/whisper.cpp/ggml-small.bin"
sha256: 1be3a9b2063867b937e64e2ec7483364a79917e157fa98c5d94b5c1fffea987b
- !!merge <<: *whisper
name: "whisper-small-en-tdrz"
overrides:
parameters:
model: ggml-small.en-tdrz.bin
files:
- filename: "ggml-small.bin"
uri: "huggingface://akashmjn/tinydiarize-whisper.cpp/ggml-small.en-tdrz.bin"
sha256: ceac3ec06d1d98ef71aec665283564631055fd6129b79d8e1be4f9cc33cc54b4
- !!merge <<: *whisper
name: "whisper-small-en-q5_1"
overrides:
@@ -20255,6 +20634,51 @@
- filename: "ggml-tiny.en-q8_0.bin"
uri: "huggingface://ggerganov/whisper.cpp/ggml-tiny.en-q8_0.bin"
sha256: 5bc2b3860aa151a4c6e7bb095e1fcce7cf12c7b020ca08dcec0c6d018bb7dd94
- !!merge <<: *whisper
name: "whisper-large"
overrides:
parameters:
model: ggml-large-v3.bin
files:
- filename: "ggml-large-v3.bin"
uri: "huggingface://ggerganov/whisper.cpp/ggml-large-v3.bin"
sha256: 64d182b440b98d5203c4f9bd541544d84c605196c4f7b845dfa11fb23594d1e2
- !!merge <<: *whisper
name: "whisper-large-q5_0"
overrides:
parameters:
model: ggml-large-v3-q5_0.bin
files:
- filename: "ggml-large-v3-q5_0.bin"
uri: "huggingface://ggerganov/whisper.cpp/ggml-large-v3-q5_0.bin"
sha256: d75795ecff3f83b5faa89d1900604ad8c780abd5739fae406de19f23ecd98ad1
- !!merge <<: *whisper
name: "whisper-large-turbo"
overrides:
parameters:
model: ggml-large-v3-turbo.bin
files:
- filename: "ggml-large-v3-turbo.bin"
uri: "huggingface://ggerganov/whisper.cpp/ggml-large-v3-turbo.bin"
sha256: 1fc70f774d38eb169993ac391eea357ef47c88757ef72ee5943879b7e8e2bc69
- !!merge <<: *whisper
name: "whisper-large-turbo-q5_0"
overrides:
parameters:
model: ggml-large-v3-turbo-q5_0.bin
files:
- filename: "ggml-large-v3-turbo-q5_0.bin"
uri: "huggingface://ggerganov/whisper.cpp/ggml-large-v3-turbo-q5_0.bin"
sha256: 394221709cd5ad1f40c46e6031ca61bce88931e6e088c188294c6d5a55ffa7e2
- !!merge <<: *whisper
name: "whisper-large-turbo-q8_0"
overrides:
parameters:
model: ggml-large-v3-turbo-q8_0.bin
files:
- filename: "ggml-large-v3-turbo-q8_0.bin"
uri: "huggingface://ggerganov/whisper.cpp/ggml-large-v3-turbo-q8_0.bin"
sha256: 317eb69c11673c9de1e1f0d459b253999804ec71ac4c23c17ecf5fbe24e259a1
## Bert embeddings (llama3.2 drop-in)
- !!merge <<: *llama32
name: "bert-embeddings"


@@ -0,0 +1,45 @@
---
name: "qwen3"
config_file: |
mmap: true
backend: "llama-cpp"
template:
chat_message: |
<|im_start|>{{if eq .RoleName "tool" }}user{{else}}{{ .RoleName }}{{end}}
{{ if eq .RoleName "tool" -}}
<tool_response>
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if eq .RoleName "tool" -}}
</tool_response>
{{ end -}}
{{ if .FunctionCall -}}
<tool_call>
{{toJson .FunctionCall}}
</tool_call>
{{ end -}}<|im_end|>
function: |
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant
chat: |
{{.Input -}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 8192
f16: true
stopwords:
- '<|im_end|>'
- '<dummy32000>'
- '</s>'
- '<|endoftext|>'


@@ -23,10 +23,10 @@ var ErrUnsafeFilesFound = errors.New("unsafe files found")
func HuggingFaceScan(uri URI) (*HuggingFaceScanResult, error) {
cleanParts := strings.Split(uri.ResolveURL(), "/")
if len(cleanParts) <= 4 || cleanParts[2] != "huggingface.co" {
if len(cleanParts) <= 4 || cleanParts[2] != "huggingface.co" && cleanParts[2] != HF_ENDPOINT {
return nil, ErrNonHuggingFaceFile
}
results, err := http.Get(fmt.Sprintf("https://huggingface.co/api/models/%s/%s/scan", cleanParts[3], cleanParts[4]))
results, err := http.Get(fmt.Sprintf("%s/api/models/%s/%s/scan", HF_ENDPOINT, cleanParts[3], cleanParts[4]))
if err != nil {
return nil, err
}


@@ -37,6 +37,17 @@ const (
type URI string
// HF_ENDPOINT is the HuggingFace endpoint, can be overridden by setting the HF_ENDPOINT environment variable.
var HF_ENDPOINT string = loadConfig()
func loadConfig() string {
endpoint := os.Getenv("HF_ENDPOINT")
if endpoint == "" {
endpoint = "https://huggingface.co"
}
return endpoint
}
func (uri URI) DownloadWithCallback(basePath string, f func(url string, i []byte) error) error {
return uri.DownloadWithAuthorizationAndCallback(basePath, "", f)
}
@@ -213,7 +224,7 @@ func (s URI) ResolveURL() string {
filepath = strings.Split(filepath, "@")[0]
}
return fmt.Sprintf("https://huggingface.co/%s/%s/resolve/%s/%s", owner, repo, branch, filepath)
return fmt.Sprintf("%s/%s/%s/resolve/%s/%s", HF_ENDPOINT, owner, repo, branch, filepath)
}
return string(s)
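Together with the scan-endpoint change above, this routes every resolved HuggingFace URL through the HF_ENDPOINT variable, which defaults to https://huggingface.co and can be overridden from the environment (useful behind mirrors or proxies). A small standalone sketch of the resolution behavior, with hf-mirror.com as a purely illustrative override:

```go
package main

import (
	"fmt"
	"os"
)

// endpoint sketches the HF_ENDPOINT resolution added in this PR.
func endpoint() string {
	if e := os.Getenv("HF_ENDPOINT"); e != "" {
		return e
	}
	return "https://huggingface.co"
}

func main() {
	os.Setenv("HF_ENDPOINT", "https://hf-mirror.com") // e.g. exported by the operator
	// A huggingface:// model URI now resolves against the configured endpoint:
	fmt.Printf("%s/%s/%s/resolve/%s/%s\n",
		endpoint(), "ggerganov", "whisper.cpp", "main", "ggml-base.bin")
	// -> https://hf-mirror.com/ggerganov/whisper.cpp/resolve/main/ggml-base.bin
}
```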


@@ -95,6 +95,7 @@ var knownModelsNameSuffixToSkip []string = []string{
".DS_Store",
".",
".safetensors",
".bin",
".partial",
".tar.gz",
}


@@ -169,6 +169,30 @@ var _ = Describe("E2E test", func() {
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Data)).To(Equal(1), fmt.Sprint(resp))
Expect(resp.Data[0].Embedding).ToNot(BeEmpty())
resp2, err := client.CreateEmbeddings(context.TODO(),
openai.EmbeddingRequestStrings{
Input: []string{"cat"},
Model: openai.AdaEmbeddingV2,
},
)
Expect(err).ToNot(HaveOccurred())
Expect(len(resp2.Data)).To(Equal(1), fmt.Sprint(resp2))
Expect(resp2.Data[0].Embedding).ToNot(BeEmpty())
Expect(resp2.Data[0].Embedding).ToNot(Equal(resp.Data[0].Embedding))
resp3, err := client.CreateEmbeddings(context.TODO(),
openai.EmbeddingRequestStrings{
Input: []string{"doc", "cat"},
Model: openai.AdaEmbeddingV2,
},
)
Expect(err).ToNot(HaveOccurred())
Expect(len(resp3.Data)).To(Equal(2), fmt.Sprint(resp3))
Expect(resp3.Data[0].Embedding).ToNot(BeEmpty())
Expect(resp3.Data[0].Embedding).To(Equal(resp.Data[0].Embedding))
Expect(resp3.Data[1].Embedding).To(Equal(resp2.Data[0].Embedding))
Expect(resp3.Data[0].Embedding).ToNot(Equal(resp3.Data[1].Embedding))
})
})
Context("vision", func() {