WIP

2026-02-04 03:32:40 -05:00 · 2025-09-17 21:52:53 +02:00
36 changed files with 294 additions and 488 deletions
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
@@ -197,6 +197,18 @@ jobs:
            backend: "rerankers"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-13-rerankers'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            skip-drivers: 'false'
+            backend: "rerankers"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./backend"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
@@ -209,6 +221,18 @@ jobs:
            backend: "llama-cpp"
            dockerfile: "./backend/Dockerfile.llama-cpp"
            context: "./"
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-13-llama-cpp'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            skip-drivers: 'false'
+            backend: "llama-cpp"
+            dockerfile: "./backend/Dockerfile.llama-cpp"
+            context: "./"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
@@ -221,6 +245,18 @@ jobs:
            backend: "vllm"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-13-vllm'
+            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
+            skip-drivers: 'false'
+            backend: "vllm"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./backend"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
@@ -233,6 +269,18 @@ jobs:
            backend: "transformers"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-13-transformers'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            skip-drivers: 'false'
+            backend: "transformers"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./backend"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
@@ -245,7 +293,19 @@ jobs:
            backend: "diffusers"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
-          # CUDA 12 additional backends
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-13-diffusers'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            skip-drivers: 'false'
+            backend: "diffusers"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./backend"
+          # CUDA additional backends
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
@@ -258,6 +318,18 @@ jobs:
            backend: "kokoro"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-13-kokoro'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            skip-drivers: 'false'
+            backend: "kokoro"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./backend"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
@@ -270,6 +342,18 @@ jobs:
            backend: "faster-whisper"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-13-faster-whisper'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            skip-drivers: 'false'
+            backend: "faster-whisper"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./backend"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
@@ -282,6 +366,18 @@ jobs:
            backend: "coqui"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-13-coqui'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            skip-drivers: 'false'
+            backend: "coqui"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./backend"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
@@ -294,6 +390,18 @@ jobs:
            backend: "bark"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-13-bark'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            skip-drivers: 'false'
+            backend: "bark"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./backend"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
@@ -306,6 +414,18 @@ jobs:
            backend: "chatterbox"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-13-chatterbox'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            skip-drivers: 'false'
+            backend: "chatterbox"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./backend"
          # hipblas builds
          - build-type: 'hipblas'
            cuda-major-version: ""
@@ -489,18 +609,6 @@ jobs:
            backend: "diffusers"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
-          - build-type: 'l4t'
-            cuda-major-version: "12"
-            cuda-minor-version: "0"
-            platforms: 'linux/arm64'
-            tag-latest: 'auto'
-            tag-suffix: '-gpu-nvidia-l4t-kokoro'
-            runs-on: 'ubuntu-24.04-arm'
-            base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
-            skip-drivers: 'true'
-            backend: "kokoro"
-            dockerfile: "./backend/Dockerfile.python"
-            context: "./backend"
          # SYCL additional backends
          - build-type: 'intel'
            cuda-major-version: ""
@@ -637,6 +745,18 @@ jobs:
            backend: "stablediffusion-ggml"
            dockerfile: "./backend/Dockerfile.golang"
            context: "./"
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-13-stablediffusion-ggml'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            skip-drivers: 'false'
+            backend: "stablediffusion-ggml"
+            dockerfile: "./backend/Dockerfile.golang"
+            context: "./"
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
@@ -722,6 +842,18 @@ jobs:
            backend: "whisper"
            dockerfile: "./backend/Dockerfile.golang"
            context: "./"
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-13-whisper'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            skip-drivers: 'false'
+            backend: "whisper"
+            dockerfile: "./backend/Dockerfile.golang"
+            context: "./"
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
@@ -858,6 +990,18 @@ jobs:
            backend: "rfdetr"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-13-rfdetr'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            skip-drivers: 'false'
+            backend: "rfdetr"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./backend"
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
@@ -882,7 +1026,7 @@ jobs:
            backend: "rfdetr"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
-          - build-type: 'l4t'
+          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
            platforms: 'linux/arm64'
@@ -919,6 +1063,18 @@ jobs:
            backend: "exllama2"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-13-exllama2'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            skip-drivers: 'false'
+            backend: "exllama2"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./backend"
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
@@ -955,18 +1111,6 @@ jobs:
            backend: "exllama2"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
-          - build-type: 'l4t'
-            cuda-major-version: "12"
-            cuda-minor-version: "0"
-            platforms: 'linux/arm64'
-            skip-drivers: 'true'
-            tag-latest: 'auto'
-            tag-suffix: '-gpu-nvidia-l4t-arm64-chatterbox'
-            base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
-            runs-on: 'ubuntu-24.04-arm'
-            backend: "chatterbox"
-            dockerfile: "./backend/Dockerfile.python"
-            context: "./backend"
          # runs out of space on the runner
          # - build-type: 'hipblas'
          #   cuda-major-version: ""
--- a/.github/workflows/image-pr.yml
+++ b/.github/workflows/image-pr.yml
@@ -34,6 +34,15 @@ jobs:
      fail-fast: false
      matrix:
        include:
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-gpu-nvidia-cuda-13'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -100,6 +100,17 @@ jobs:
            skip-drivers: 'false'
            makeflags: "--jobs=4 --output-sync=target"
            aio: "-aio-gpu-nvidia-cuda-12"
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-13'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            skip-drivers: 'false'
+            makeflags: "--jobs=4 --output-sync=target"
+            aio: "-aio-gpu-nvidia-cuda-13"
          - build-type: 'vulkan'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
--- a/.github/workflows/localaibot_automerge.yml
+++ b/.github/workflows/localaibot_automerge.yml
@@ -6,8 +6,7 @@ permissions:
  contents: write
  pull-requests: write
  packages: read
-  issues: write # for Homebrew/actions/post-comment
-  actions: write # to dispatch publish workflow
+
 jobs:
  dependabot:
    runs-on: ubuntu-latest
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -18,7 +18,7 @@ jobs:
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.22.9
+        uses: securego/gosec@v2.22.8
        with:
          # we let the report trigger content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
--- a/10
+++ b/10
@@ -78,16 +78,6 @@ RUN <<EOT bash
    fi
 EOT

-# https://github.com/NVIDIA/Isaac-GR00T/issues/343
-RUN <<EOT bash
-    if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "arm64" ]; then
-        wget https://developer.download.nvidia.com/compute/cudss/0.6.0/local_installers/cudss-local-tegra-repo-ubuntu2204-0.6.0_0.6.0-1_arm64.deb && \
-        dpkg -i cudss-local-tegra-repo-ubuntu2204-0.6.0_0.6.0-1_arm64.deb && \
-        cp /var/cudss-local-tegra-repo-ubuntu2204-0.6.0/cudss-*-keyring.gpg /usr/share/keyrings/ && \
-        apt-get update && apt-get -y install cudss
-    fi
-EOT
-
 # If we are building with clblas support, we need the libraries for the builds
 RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
        apt-get update && \
--- a/3
+++ b/3
@@ -429,9 +429,6 @@ docker-build-kitten-tts:
 docker-save-kitten-tts: backend-images
 	docker save local-ai-backend:kitten-tts -o backend-images/kitten-tts.tar

-docker-save-chatterbox: backend-images
-	docker save local-ai-backend:chatterbox -o backend-images/chatterbox.tar
-
 docker-build-kokoro:
 	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:kokoro -f backend/Dockerfile.python --build-arg BACKEND=kokoro ./backend

--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=d64c8104f090b27b1f99e8da5995ffcfa6b726e2
+LLAMA_VERSION?=8ff206097c2bf3ca1c7aa95f9d6db779fc7bdd68
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
@@ -14,7 +14,7 @@ CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF

 CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 ifeq ($(NATIVE),false)
-	CMAKE_ARGS+=-DGGML_NATIVE=OFF -DLLAMA_OPENSSL=OFF
+	CMAKE_ARGS+=-DGGML_NATIVE=OFF
 endif
 # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -231,7 +231,6 @@ static void params_parse(const backend::ModelOptions* request,
    params.cpuparams.n_threads = request->threads();
    params.n_gpu_layers = request->ngpulayers();
    params.n_batch = request->nbatch();
-    params.n_ubatch = request->nbatch(); // fixes issue with reranking models being limited to 512 tokens (the default n_ubatch size); allows for setting the maximum input amount of tokens thereby avoiding this error "input is too large to process. increase the physical batch size"
    // Set params.n_parallel by environment variable (LLAMA_PARALLEL), defaults to 1
    //params.n_parallel = 1;
    const char *env_parallel = std::getenv("LLAMACPP_PARALLEL");
@@ -802,6 +801,11 @@ public:
            return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "\"documents\" must be a non-empty string array");
        }

+        // Tokenize the query
+        auto tokenized_query = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, request->query(), /* add_special */ false, true);
+        if (tokenized_query.size() != 1) {
+            return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "\"query\" must contain only a single prompt");
+        }
        // Create and queue the task
        json responses = json::array();
        bool error = false;
@@ -813,9 +817,10 @@ public:
                documents.push_back(request->documents(i));
            }
            
-            tasks.reserve(documents.size());
-            for (size_t i = 0; i < documents.size(); i++) {
-                auto tmp = format_rerank(ctx_server.model, ctx_server.vocab, ctx_server.mctx, request->query(), documents[i]);
+            auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, documents, /* add_special */ false, true);
+            tasks.reserve(tokenized_docs.size());
+            for (size_t i = 0; i < tokenized_docs.size(); i++) {
+                auto tmp = format_rerank(ctx_server.vocab, tokenized_query[0], tokenized_docs[i]);
                server_task task = server_task(SERVER_TASK_TYPE_RERANK);
                task.id = ctx_server.queue_tasks.get_new_id();
                task.index = i;
--- a/backend/go/whisper/Makefile
+++ b/backend/go/whisper/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=7849aff7a2e1f4234aa31b01a1870906d5431959
+WHISPER_CPP_VERSION?=edea8a9c3cf0eb7676dcdb604991eb2f95c3d984

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -270,7 +270,6 @@
    nvidia: "cuda12-kokoro"
    intel: "intel-kokoro"
    amd: "rocm-kokoro"
-    nvidia-l4t: "nvidia-l4t-kokoro"
 - &coqui
  urls:
    - https://github.com/idiap/coqui-ai-TTS
@@ -353,7 +352,6 @@
    nvidia: "cuda12-chatterbox"
    metal: "metal-chatterbox"
    default: "cpu-chatterbox"
-    nvidia-l4t: "nvidia-l4t-arm64-chatterbox"
 - &piper
  name: "piper"
  uri: "quay.io/go-skynet/local-ai-backends:latest-piper"
@@ -1051,7 +1049,6 @@
    nvidia: "cuda12-kokoro-development"
    intel: "intel-kokoro-development"
    amd: "rocm-kokoro-development"
-    nvidia-l4t: "nvidia-l4t-kokoro-development"
 - !!merge <<: *kokoro
  name: "cuda11-kokoro-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-11-kokoro"
@@ -1077,16 +1074,6 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-kokoro"
  mirrors:
    - localai/localai-backends:master-gpu-intel-kokoro
- !!merge <<: *kokoro
-  name: "nvidia-l4t-kokoro"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-l4t-kokoro"
-  mirrors:
-    - localai/localai-backends:latest-gpu-nvidia-l4t-kokoro
- !!merge <<: *kokoro
-  name: "nvidia-l4t-kokoro-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-l4t-kokoro"
-  mirrors:
-    - localai/localai-backends:master-gpu-nvidia-l4t-kokoro
 - !!merge <<: *kokoro
  name: "cuda11-kokoro"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-11-kokoro"
@@ -1240,7 +1227,6 @@
    nvidia: "cuda12-chatterbox-development"
    metal: "metal-chatterbox-development"
    default: "cpu-chatterbox-development"
-    nvidia-l4t: "nvidia-l4t-arm64-chatterbox"
 - !!merge <<: *chatterbox
  name: "cpu-chatterbox"
  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-chatterbox"
@@ -1251,16 +1237,6 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-chatterbox"
  mirrors:
    - localai/localai-backends:master-cpu-chatterbox
- !!merge <<: *chatterbox
-  name: "nvidia-l4t-arm64-chatterbox"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-l4t-arm64-chatterbox"
-  mirrors:
-    - localai/localai-backends:latest-gpu-nvidia-l4t-arm64-chatterbox
- !!merge <<: *chatterbox
-  name: "nvidia-l4t-arm64-chatterbox-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-l4t-arm64-chatterbox"
-  mirrors:
-    - localai/localai-backends:master-gpu-nvidia-l4t-arm64-chatterbox
 - !!merge <<: *chatterbox
  name: "metal-chatterbox"
  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-chatterbox"
--- a/backend/python/bark/requirements.txt
+++ b/backend/python/bark/requirements.txt
@@ -1,4 +1,4 @@
 bark==0.1.5
-grpcio==1.75.1
+grpcio==1.74.0
 protobuf
 certifi
--- a/backend/python/chatterbox/backend.py
+++ b/backend/python/chatterbox/backend.py
@@ -14,23 +14,9 @@ import backend_pb2_grpc
 import torch
 import torchaudio as ta
 from chatterbox.tts import ChatterboxTTS
-from chatterbox.mtl_tts import ChatterboxMultilingualTTS
+
 import grpc

-def is_float(s):
-    """Check if a string can be converted to float."""
-    try:
-        float(s)
-        return True
-    except ValueError:
-        return False
-def is_int(s):
-    """Check if a string can be converted to int."""
-    try:
-        int(s)
-        return True
-    except ValueError:
-        return False

 _ONE_DAY_IN_SECONDS = 60 * 60 * 24

@@ -61,28 +47,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        if not torch.cuda.is_available() and request.CUDA:
            return backend_pb2.Result(success=False, message="CUDA is not available")

-
-        options = request.Options
-
-        # empty dict
-        self.options = {}
-
-        # The options are a list of strings in this form optname:optvalue
-        # We are storing all the options in a dict so we can use it later when
-        # generating the images
-        for opt in options:
-            if ":" not in opt:
-                continue
-            key, value = opt.split(":")
-            # if value is a number, convert it to the appropriate type
-            if is_float(value):
-                value = float(value)
-            elif is_int(value):
-                value = int(value)
-            elif value.lower() in ["true", "false"]:
-                value = value.lower() == "true"
-            self.options[key] = value
-
        self.AudioPath = None

        if os.path.isabs(request.AudioPath):
@@ -92,14 +56,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            modelFileBase = os.path.dirname(request.ModelFile)
            # modify LoraAdapter to be relative to modelFileBase
            self.AudioPath = os.path.join(modelFileBase, request.AudioPath)
+
        try:
            print("Preparing models, please wait", file=sys.stderr)
-            if "multilingual" in self.options:
-                # remove key from options
-                del self.options["multilingual"]
-                self.model = ChatterboxMultilingualTTS.from_pretrained(device=device)
-            else:
-                self.model = ChatterboxTTS.from_pretrained(device=device)
+            self.model = ChatterboxTTS.from_pretrained(device=device)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        # Implement your logic here for the LoadModel service
@@ -108,18 +68,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

    def TTS(self, request, context):
        try:
-            kwargs = {}
-
-            if "language" in self.options:
-                kwargs["language_id"] = self.options["language"]
-            if self.AudioPath is not None:
-                kwargs["audio_prompt_path"] = self.AudioPath
-
-            # add options to kwargs
-            kwargs.update(self.options)
-
            # Generate audio using ChatterboxTTS
-            wav = self.model.generate(request.text, **kwargs)
+            if self.AudioPath is not None:
+                wav = self.model.generate(request.text, audio_prompt_path=self.AudioPath)
+            else:
+                wav = self.model.generate(request.text)
+            
            # Save the generated audio
            ta.save(request.dst, wav, self.model.sr)
            
--- a/backend/python/chatterbox/install.sh
+++ b/backend/python/chatterbox/install.sh
@@ -15,6 +15,5 @@ fi
 if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi
-EXTRA_PIP_INSTALL_FLAGS+=" --no-build-isolation"

 installRequirements
--- a/backend/python/chatterbox/requirements-cpu.txt
+++ b/backend/python/chatterbox/requirements-cpu.txt
@@ -1,8 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 accelerate
-torch
-torchaudio
-transformers
-# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
-chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
-#chatterbox-tts==0.1.4
+torch==2.6.0
+torchaudio==2.6.0
+transformers==4.46.3
+chatterbox-tts==0.1.2
--- a/backend/python/chatterbox/requirements-cublas11.txt
+++ b/backend/python/chatterbox/requirements-cublas11.txt
@@ -2,6 +2,5 @@
 torch==2.6.0+cu118
 torchaudio==2.6.0+cu118
 transformers==4.46.3
-# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
-chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
+chatterbox-tts==0.1.2
 accelerate
--- a/backend/python/chatterbox/requirements-cublas12.txt
+++ b/backend/python/chatterbox/requirements-cublas12.txt
@@ -1,6 +1,5 @@
-torch
-torchaudio
-transformers
-# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
-chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
+torch==2.6.0
+torchaudio==2.6.0
+transformers==4.46.3
+chatterbox-tts==0.1.2
 accelerate
--- a/backend/python/chatterbox/requirements-hipblas.txt
+++ b/backend/python/chatterbox/requirements-hipblas.txt
@@ -1,7 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 torch==2.6.0+rocm6.1
 torchaudio==2.6.0+rocm6.1
-transformers
-# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
-chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
+transformers==4.46.3
+chatterbox-tts==0.1.2
 accelerate
--- a/backend/python/chatterbox/requirements-intel.txt
+++ b/backend/python/chatterbox/requirements-intel.txt
@@ -2,9 +2,8 @@
 intel-extension-for-pytorch==2.3.110+xpu
 torch==2.3.1+cxx11.abi
 torchaudio==2.3.1+cxx11.abi
-transformers
-# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
-chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
+transformers==4.46.3
+chatterbox-tts==0.1.2
 accelerate
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
--- a/backend/python/chatterbox/requirements-l4t.txt
+++ b/backend/python/chatterbox/requirements-l4t.txt
@@ -1,6 +0,0 @@
--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu126/
-torch
-torchaudio
-transformers
-chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
-accelerate
--- a/backend/python/common/template/requirements.txt
+++ b/backend/python/common/template/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.75.1
+grpcio==1.74.0
 protobuf
 grpcio-tools
--- a/backend/python/coqui/requirements.txt
+++ b/backend/python/coqui/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.75.1
+grpcio==1.74.0
 protobuf
 certifi
 packaging==24.1
--- a/backend/python/diffusers/backend.py
+++ b/backend/python/diffusers/backend.py
@@ -66,20 +66,11 @@ from diffusers.schedulers import (
 )

 def is_float(s):
-    """Check if a string can be converted to float."""
    try:
        float(s)
        return True
    except ValueError:
        return False
-def is_int(s):
-    """Check if a string can be converted to int."""
-    try:
-        int(s)
-        return True
-    except ValueError:
-        return False
-

 # The scheduler list mapping was taken from here: https://github.com/neggles/animatediff-cli/blob/6f336f5f4b5e38e85d7f06f1744ef42d0a45f2a7/src/animatediff/schedulers.py#L39
 # Credits to https://github.com/neggles
@@ -186,11 +177,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                key, value = opt.split(":")
                # if value is a number, convert it to the appropriate type
                if is_float(value):
-                    value = float(value)
-                elif is_int(value):
-                    value = int(value)
-                elif value.lower() in ["true", "false"]:
-                    value = value.lower() == "true"
+                    if float(value).is_integer():
+                        value = int(float(value))
+                    else:
+                        value = float(value)
                self.options[key] = value

            # From options, extract if present "torch_dtype" and set it to the appropriate type
--- a/backend/python/diffusers/requirements.txt
+++ b/backend/python/diffusers/requirements.txt
@@ -1,5 +1,5 @@
 setuptools
-grpcio==1.75.1
+grpcio==1.74.0
 pillow
 protobuf
 certifi
--- a/backend/python/exllama2/requirements.txt
+++ b/backend/python/exllama2/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.75.1
+grpcio==1.74.0
 protobuf
 certifi
 wheel
--- a/backend/python/kokoro/requirements-l4t.txt
+++ b/backend/python/kokoro/requirements-l4t.txt
@@ -1,7 +0,0 @@
--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu126/
-torch
-torchaudio
-transformers
-accelerate
-kokoro
-soundfile
--- a/backend/python/mlx-audio/backend.py
+++ b/backend/python/mlx-audio/backend.py
@@ -20,21 +20,6 @@ import soundfile as sf
 import numpy as np
 import uuid

-def is_float(s):
-    """Check if a string can be converted to float."""
-    try:
-        float(s)
-        return True
-    except ValueError:
-        return False
-def is_int(s):
-    """Check if a string can be converted to int."""
-    try:
-        int(s)
-        return True
-    except ValueError:
-        return False
-
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24

 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
@@ -47,6 +32,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
    This backend provides TTS (Text-to-Speech) functionality using MLX-Audio.
    """

+    def _is_float(self, s):
+        """Check if a string can be converted to float."""
+        try:
+            float(s)
+            return True
+        except ValueError:
+            return False
+
    def Health(self, request, context):
        """
        Returns a health check message.
@@ -87,10 +80,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                key, value = opt.split(":", 1)  # Split only on first colon to handle values with colons
                
                # Convert numeric values to appropriate types
-                if is_float(value):
-                    value = float(value)
-                elif is_int(value):
-                    value = int(value)
+                if self._is_float(value):
+                    if float(value).is_integer():
+                        value = int(float(value))
+                    else:
+                        value = float(value)
                elif value.lower() in ["true", "false"]:
                    value = value.lower() == "true"
                    
--- a/backend/python/mlx-vlm/backend.py
+++ b/backend/python/mlx-vlm/backend.py
@@ -21,21 +21,6 @@ import io
 from PIL import Image
 import tempfile

-def is_float(s):
-    """Check if a string can be converted to float."""
-    try:
-        float(s)
-        return True
-    except ValueError:
-        return False
-def is_int(s):
-    """Check if a string can be converted to int."""
-    try:
-        int(s)
-        return True
-    except ValueError:
-        return False
-
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24

 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
@@ -47,6 +32,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
    A gRPC servicer that implements the Backend service defined in backend.proto.
    """

+    def _is_float(self, s):
+        """Return True when ``s`` can be parsed by ``float()``, else False."""
+        try:
+            float(s)
+        except ValueError:
+            return False
+        return True
+
    def Health(self, request, context):
        """
        Returns a health check message.
@@ -86,10 +79,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                    continue
                key, value = opt.split(":", 1)  # Split only on first colon to handle values with colons
                
-                if is_float(value):
-                    value = float(value)
-                elif is_int(value):
-                    value = int(value)
+                # Convert numeric values to appropriate types
+                if self._is_float(value):
+                    # value is still a str here: int("1.0") / int("1e3") would raise
+                    # ValueError, so derive the int from the parsed float instead.
+                    number = float(value)
+                    value = int(number) if number.is_integer() else number
                elif value.lower() in ["true", "false"]:
                    value = value.lower() == "true"
                    
--- a/backend/python/mlx/backend.py
+++ b/backend/python/mlx/backend.py
@@ -24,27 +24,20 @@ _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))

-def is_float(s):
-    """Check if a string can be converted to float."""
-    try:
-        float(s)
-        return True
-    except ValueError:
-        return False
-def is_int(s):
-    """Check if a string can be converted to int."""
-    try:
-        int(s)
-        return True
-    except ValueError:
-        return False
-
 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
    """
    A gRPC servicer that implements the Backend service defined in backend.proto.
    """

+    def _is_float(self, s):
+        """Return True when ``s`` can be parsed by ``float()``, else False."""
+        try:
+            float(s)
+        except ValueError:
+            return False
+        return True
+
    def Health(self, request, context):
        """
        Returns a health check message.
@@ -85,10 +78,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                key, value = opt.split(":", 1)  # Split only on first colon to handle values with colons
                
                # Convert numeric values to appropriate types
-                if is_float(value):
-                    value = float(value)
-                elif is_int(value):
-                    value = int(value)
+                if self._is_float(value):
+                    # value is still a str here: int("1.0") / int("1e3") would raise
+                    # ValueError, so derive the int from the parsed float instead.
+                    number = float(value)
+                    value = int(number) if number.is_integer() else number
                elif value.lower() in ["true", "false"]:
                    value = value.lower() == "true"
                    
--- a/backend/python/rerankers/requirements.txt
+++ b/backend/python/rerankers/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.75.1
+grpcio==1.74.0
 protobuf
 certifi
--- a/backend/python/transformers/requirements.txt
+++ b/backend/python/transformers/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.75.1
+grpcio==1.74.0
 protobuf==6.32.0
 certifi
 setuptools
--- a/backend/python/vllm/requirements.txt
+++ b/backend/python/vllm/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.75.1
+grpcio==1.74.0
 protobuf
 certifi
 setuptools
--- a/docs/data/version.json
+++ b/docs/data/version.json
@@ -1,3 +1,3 @@
 {
-  "version": "v3.5.4"
+  "version": "v3.5.0"
 }
--- a/gallery/granite4.yaml
+++ b/gallery/granite4.yaml
@@ -1,48 +0,0 @@
---
-name: "granite-3.2"
-
-config_file: |
-  backend: "llama-cpp"
-  mmap: true
-  template:
-    chat_message: |
-      <|start_of_role|>{{ .RoleName }}<|end_of_role|>
-      {{ if .FunctionCall -}}
-      <tool_call>
-      {{ else if eq .RoleName "tool" -}}
-      <tool_response>
-      {{ end -}}
-      {{ if .Content -}}
-      {{.Content }}
-      {{ end -}}
-      {{ if eq .RoleName "tool" -}}
-      </tool_response>
-      {{ end -}}
-      {{ if .FunctionCall -}}
-      {{toJson .FunctionCall}}
-      </tool_call>
-      {{ end -}}
-      <|end_of_text|>
-    function: |
-      <|start_of_role|>system<|end_of_role|>
-      You are a helpful AI assistant with access to the following tools. When a tool is required to answer the user's query, respond with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.
-
-      Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.
-      {{range .Functions}}
-      {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
-      {{end}}
-      For each function call return a json object with function name and arguments
-      {{.Input -}}
-      <|start_of_role|>assistant<|end_of_role|>
-    chat: |
-      {{.Input -}}
-      <|start_of_role|>assistant<|end_of_role|>
-    completion: |
-      {{.Input}}
-  context_size: 8192
-  f16: true
-  stopwords:
-  - '<|im_end|>'
-  - '<dummy32000>'
-  - '</s>'
-  - '<|end_of_text|>'
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,68 +1,4 @@
 ---
- &granite4
-  url: "github:mudler/LocalAI/gallery/granite4.yaml@master"
-  name: "ibm-granite_granite-4.0-h-small"
-  license: apache-2.0
-  icon: https://cdn-avatars.huggingface.co/v1/production/uploads/639bcaa2445b133a4e942436/CEW-OjXkRkDNmTxSu8Egh.png
-  tags:
-    - gguf
-    - GPU
-    - CPU
-    - text-to-text
-  urls:
-    - https://huggingface.co/ibm-granite/granite-4.0-h-small
-    - https://huggingface.co/bartowski/ibm-granite_granite-4.0-h-small-GGUF
-  description: |
-      Granite-4.0-H-Small is a 32B parameter long-context instruct model finetuned from Granite-4.0-H-Small-Base using a combination of open source instruction datasets with permissive license and internally collected synthetic datasets. This model is developed using a diverse set of techniques with a structured chat format, including supervised finetuning, model alignment using reinforcement learning, and model merging. Granite 4.0 instruct models feature improved instruction following (IF) and tool-calling capabilities, making them more effective in enterprise applications.
-  overrides:
-    parameters:
-      model: ibm-granite_granite-4.0-h-small-Q4_K_M.gguf
-  files:
-    - filename: ibm-granite_granite-4.0-h-small-Q4_K_M.gguf
-      sha256: c59ce76239bd5794acdbdf88616dfc296247f4e78792a9678d4b3e24966ead69
-      uri: huggingface://bartowski/ibm-granite_granite-4.0-h-small-GGUF/ibm-granite_granite-4.0-h-small-Q4_K_M.gguf
- !!merge <<: *granite4
-  name: "ibm-granite_granite-4.0-h-tiny"
-  urls:
-    - https://huggingface.co/ibm-granite/granite-4.0-h-tiny
-    - https://huggingface.co/bartowski/ibm-granite_granite-4.0-h-tiny-GGUF
-  description: |
-     Granite-4.0-H-Tiny is a 7B parameter long-context instruct model finetuned from Granite-4.0-H-Tiny-Base using a combination of open source instruction datasets with permissive license and internally collected synthetic datasets. This model is developed using a diverse set of techniques with a structured chat format, including supervised finetuning, model alignment using reinforcement learning, and model merging. Granite 4.0 instruct models feature improved instruction following (IF) and tool-calling capabilities, making them more effective in enterprise applications.
-  overrides:
-    parameters:
-      model: ibm-granite_granite-4.0-h-tiny-Q4_K_M.gguf
-  files:
-    - filename: ibm-granite_granite-4.0-h-tiny-Q4_K_M.gguf
-      sha256: 33a689fe7f35b14ebab3ae599b65aaa3ed8548c393373b1b0eebee36c653146f
-      uri: huggingface://bartowski/ibm-granite_granite-4.0-h-tiny-GGUF/ibm-granite_granite-4.0-h-tiny-Q4_K_M.gguf
- !!merge <<: *granite4
-  name: "ibm-granite_granite-4.0-h-micro"
-  urls:
-    - https://huggingface.co/ibm-granite/granite-4.0-h-micro
-    - https://huggingface.co/bartowski/ibm-granite_granite-4.0-h-micro-GGUF
-  description: |
-    Granite-4.0-H-Micro is a 3B parameter long-context instruct model finetuned from Granite-4.0-H-Micro-Base using a combination of open source instruction datasets with permissive license and internally collected synthetic datasets. This model is developed using a diverse set of techniques with a structured chat format, including supervised finetuning, model alignment using reinforcement learning, and model merging. Granite 4.0 instruct models feature improved instruction following (IF) and tool-calling capabilities, making them more effective in enterprise applications.
-  overrides:
-    parameters:
-      model: ibm-granite_granite-4.0-h-micro-Q4_K_M.gguf
-  files:
-    - filename: ibm-granite_granite-4.0-h-micro-Q4_K_M.gguf
-      sha256: 48376d61449687a56b3811a418d92cc0e8e77b4d96ec13eb6c9d9503968c9f20
-      uri: huggingface://bartowski/ibm-granite_granite-4.0-h-micro-GGUF/ibm-granite_granite-4.0-h-micro-Q4_K_M.gguf
- !!merge <<: *granite4
-  name: "ibm-granite_granite-4.0-micro"
-  urls:
-    - https://huggingface.co/ibm-granite/granite-4.0-micro
-    - https://huggingface.co/bartowski/ibm-granite_granite-4.0-micro-GGUF
-  description: |
-    Granite-4.0-Micro is a 3B parameter long-context instruct model finetuned from Granite-4.0-Micro-Base using a combination of open source instruction datasets with permissive license and internally collected synthetic datasets. This model is developed using a diverse set of techniques with a structured chat format, including supervised finetuning, model alignment using reinforcement learning, and model merging. Granite 4.0 instruct models feature improved instruction following (IF) and tool-calling capabilities, making them more effective in enterprise applications.
-  overrides:
-    parameters:
-      model: ibm-granite_granite-4.0-micro-Q4_K_M.gguf
-  files:
-    - filename: ibm-granite_granite-4.0-micro-Q4_K_M.gguf
-      sha256: bd9d7b4795b9dc44e3e81aeae93bb5d8e6b891b7e823be5bf9910ed3ac060baf
-      uri: huggingface://bartowski/ibm-granite_granite-4.0-micro-GGUF/ibm-granite_granite-4.0-micro-Q4_K_M.gguf
 - &ernie
  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
  name: "baidu_ernie-4.5-21b-a3b-thinking"
@@ -399,7 +335,7 @@
  url: "github:mudler/LocalAI/gallery/qwen-image.yaml@master"
  urls:
    - https://huggingface.co/Qwen/Qwen-Image-Edit
-  icon: https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/qwen_image_edit_logo.png
+  icon: https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/qwen_image_logo.png
  license: apache-2.0
  tags:
    - qwen-image
@@ -414,26 +350,6 @@
      cuda: true
      pipeline_type: QwenImageEditPipeline
      enable_parameters: num_inference_steps,image
- !!merge <<: *qwenimage
-  name: "qwen-image-edit-2509"
-  url: "github:mudler/LocalAI/gallery/qwen-image.yaml@master"
-  urls:
-    - https://huggingface.co/Qwen/Qwen-Image-Edit-2509
-  icon: https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/qwen_image_edit_logo.png
-  license: apache-2.0
-  tags:
-    - qwen-image
-    - gpu
-    - image-to-image
-  description: |
-    Qwen-Image-Edit is a model for image editing, which is based on Qwen-Image.
-  overrides:
-    parameters:
-      model: Qwen/Qwen-Image-Edit-2509
-    diffusers:
-      cuda: true
-      pipeline_type: QwenImageEditPipeline
-      enable_parameters: num_inference_steps,image
 - &gptoss
  name: "gpt-oss-20b"
  url: "github:mudler/LocalAI/gallery/harmony.yaml@master"
@@ -2722,39 +2638,6 @@
    - filename: Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf
      sha256: 1afefb3b369ea2de191f24fe8ea22cbbb7b412357902f27bd81d693dde35c2d9
      uri: huggingface://bartowski/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-GGUF/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf
- !!merge <<: *qwen3
-  name: "impish_qwen_14b-1m"
-  icon: https://huggingface.co/SicariusSicariiStuff/Impish_QWEN_14B-1M/resolve/main/Images/Impish_Qwen_14B.png
-  urls:
-    - https://huggingface.co/SicariusSicariiStuff/Impish_QWEN_14B-1M
-    - https://huggingface.co/mradermacher/Impish_QWEN_14B-1M-GGUF
-  description: |
-    Supreme context One million tokens to play with.
-    Strong Roleplay internet RP format lovers will appriciate it, medium size paragraphs.
-    Qwen smarts built-in, but naughty and playful Maybe it's even too naughty.
-    VERY compliant with low censorship.
-    VERY high IFeval for a 14B RP model: 78.68.
-  overrides:
-    parameters:
-      model: Impish_QWEN_14B-1M.Q4_K_M.gguf
-  files:
-    - filename: Impish_QWEN_14B-1M.Q4_K_M.gguf
-      sha256: d326f2b8f05814ea3943c82498f0cd3cde64859cf03f532855c87fb94b0da79e
-      uri: huggingface://mradermacher/Impish_QWEN_14B-1M-GGUF/Impish_QWEN_14B-1M.Q4_K_M.gguf
- !!merge <<: *qwen3
-  name: "aquif-3.5-a4b-think"
-  urls:
-    - https://huggingface.co/aquif-ai/aquif-3.5-A4B-Think
-    - https://huggingface.co/QuantFactory/aquif-3.5-A4B-Think-GGUF
-  description: |
-    The aquif-3.5 series is the successor to aquif-3, featuring a simplified naming scheme, expanded Mixture of Experts (MoE) options, and across-the-board performance improvements. This release streamlines model selection while delivering enhanced capabilities across reasoning, multilingual support, and general intelligence tasks.
-  overrides:
-    parameters:
-      model: aquif-3.5-A4B-Think.Q4_K_M.gguf
-  files:
-    - filename: aquif-3.5-A4B-Think.Q4_K_M.gguf
-      sha256: 1650b72ae1acf12b45a702f2ff5f47205552e494f0d910e81cbe40dfba55a6b9
-      uri: huggingface://QuantFactory/aquif-3.5-A4B-Think-GGUF/aquif-3.5-A4B-Think.Q4_K_M.gguf
 - &gemma3
  url: "github:mudler/LocalAI/gallery/gemma.yaml@master"
  name: "gemma-3-27b-it"
@@ -15292,27 +15175,6 @@
    - filename: Impish_Longtail_12B-Q4_K_M.gguf
      sha256: 2cf0cacb65d71cfc5b4255f3273ad245bbcb11956a0f9e3aaa0e739df57c90df
      uri: huggingface://SicariusSicariiStuff/Impish_Longtail_12B_GGUF/Impish_Longtail_12B-Q4_K_M.gguf
- !!merge <<: *mistral03
-  name: "mistralai_magistral-small-2509"
-  urls:
-    - https://huggingface.co/mistralai/Magistral-Small-2509
-    - https://huggingface.co/bartowski/mistralai_Magistral-Small-2509-GGUF
-  description: |
-    Magistral Small 1.2
-    Building upon Mistral Small 3.2 (2506), with added reasoning capabilities, undergoing SFT from Magistral Medium traces and RL on top, it's a small, efficient reasoning model with 24B parameters.
-
-    Magistral Small can be deployed locally, fitting within a single RTX 4090 or a 32GB RAM MacBook once quantized.
-
-    Learn more about Magistral in our blog post.
-
-    The model was presented in the paper Magistral.
-  overrides:
-    parameters:
-      model: mistralai_Magistral-Small-2509-Q4_K_M.gguf
-  files:
-    - filename: mistralai_Magistral-Small-2509-Q4_K_M.gguf
-      sha256: 1d638bc931de30d29fc73ad439206ff185f76666a096e7ad723866a20f78728d
-      uri: huggingface://bartowski/mistralai_Magistral-Small-2509-GGUF/mistralai_Magistral-Small-2509-Q4_K_M.gguf
 - &mudler
  url: "github:mudler/LocalAI/gallery/mudler.yaml@master" ### START mudler's LocalAI specific-models
  name: "LocalAI-llama3-8b-function-call-v0.2"
@@ -20474,9 +20336,9 @@
    - https://huggingface.co/ggerganov/whisper.cpp
  overrides:
    parameters:
-      model: ggml-base.bin
+      model: ggml-whisper-base.bin
  files:
-    - filename: "ggml-base.bin"
+    - filename: "ggml-whisper-base.bin"
      sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
      uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"
  description: |
@@ -20521,20 +20383,11 @@
  name: "whisper-large-q5_0"
  overrides:
    parameters:
-      model: ggml-large-v3-q5_0.bin
+      model: ggml-large-q5_0.bin
  files:
-    - filename: "ggml-large-v3-q5_0.bin"
-      uri: "huggingface://ggerganov/whisper.cpp/ggml-large-v3-q5_0.bin"
-      sha256: d75795ecff3f83b5faa89d1900604ad8c780abd5739fae406de19f23ecd98ad1
- !!merge <<: *whisper
-  name: "whisper-medium"
-  overrides:
-    parameters:
-      model: ggml-medium.bin
-  files:
-    - filename: "ggml-medium.bin"
-      uri: "huggingface://ggerganov/whisper.cpp/ggml-medium.bin"
-      sha256: 6c14d5adee5f86394037b4e4e8b59f1673b6cee10e3cf0b11bbdbee79c156208
+    - filename: "ggml-large-q5_0.bin"
+      uri: "huggingface://ggerganov/whisper.cpp/ggml-large-q5_0.bin"
+      sha256: 3a214837221e4530dbc1fe8d734f302af393eb30bd0ed046042ebf4baf70f6f2
 - !!merge <<: *whisper
  name: "whisper-medium-q5_0"
  overrides:
@@ -20562,6 +20415,15 @@
    - filename: "ggml-small.bin"
      uri: "huggingface://ggerganov/whisper.cpp/ggml-small.bin"
      sha256: 1be3a9b2063867b937e64e2ec7483364a79917e157fa98c5d94b5c1fffea987b
+- !!merge <<: *whisper
+  name: "whisper-small-en-tdrz"
+  overrides:
+    parameters:
+      model: ggml-small.en-tdrz.bin
+  files:
+    - filename: "ggml-small.en-tdrz.bin"
+      uri: "huggingface://akashmjn/tinydiarize-whisper.cpp/ggml-small.en-tdrz.bin"
+      sha256: ceac3ec06d1d98ef71aec665283564631055fd6129b79d8e1be4f9cc33cc54b4
 - !!merge <<: *whisper
  name: "whisper-small-en-q5_1"
  overrides:
@@ -20634,51 +20496,6 @@
    - filename: "ggml-tiny.en-q8_0.bin"
      uri: "huggingface://ggerganov/whisper.cpp/ggml-tiny.en-q8_0.bin"
      sha256: 5bc2b3860aa151a4c6e7bb095e1fcce7cf12c7b020ca08dcec0c6d018bb7dd94
- !!merge <<: *whisper
-  name: "whisper-large"
-  overrides:
-    parameters:
-      model: ggml-large-v3.bin
-  files:
-    - filename: "ggml-large-v3.bin"
-      uri: "huggingface://ggerganov/whisper.cpp/ggml-large-v3.bin"
-      sha256: 64d182b440b98d5203c4f9bd541544d84c605196c4f7b845dfa11fb23594d1e2
- !!merge <<: *whisper
-  name: "whisper-large-q5_0"
-  overrides:
-    parameters:
-      model: ggml-large-v3-q5_0.bin
-  files:
-    - filename: "ggml-large-v3-q5_0.bin"
-      uri: "huggingface://ggerganov/whisper.cpp/ggml-large-v3-q5_0.bin"
-      sha256: d75795ecff3f83b5faa89d1900604ad8c780abd5739fae406de19f23ecd98ad1
- !!merge <<: *whisper
-  name: "whisper-large-turbo"
-  overrides:
-    parameters:
-      model: ggml-large-v3-turbo.bin
-  files:
-    - filename: "ggml-large-v3-turbo.bin"
-      uri: "huggingface://ggerganov/whisper.cpp/ggml-large-v3-turbo.bin"
-      sha256: 1fc70f774d38eb169993ac391eea357ef47c88757ef72ee5943879b7e8e2bc69
- !!merge <<: *whisper
-  name: "whisper-large-turbo-q5_0"
-  overrides:
-    parameters:
-      model: ggml-large-v3-turbo-q5_0.bin
-  files:
-    - filename: "ggml-large-v3-turbo-q5_0.bin"
-      uri: "huggingface://ggerganov/whisper.cpp/ggml-large-v3-turbo-q5_0.bin"
-      sha256: 394221709cd5ad1f40c46e6031ca61bce88931e6e088c188294c6d5a55ffa7e2
- !!merge <<: *whisper
-  name: "whisper-large-turbo-q8_0"
-  overrides:
-    parameters:
-      model: ggml-large-v3-turbo-q8_0.bin
-  files:
-    - filename: "ggml-large-v3-turbo-q8_0.bin"
-      uri: "huggingface://ggerganov/whisper.cpp/ggml-large-v3-turbo-q8_0.bin"
-      sha256: 317eb69c11673c9de1e1f0d459b253999804ec71ac4c23c17ecf5fbe24e259a1
 ## Bert embeddings (llama3.2 drop-in)
 - !!merge <<: *llama32
  name: "bert-embeddings"
--- a/pkg/model/loader.go
+++ b/pkg/model/loader.go
@@ -95,7 +95,6 @@ var knownModelsNameSuffixToSkip []string = []string{
 	".DS_Store",
 	".",
 	".safetensors",
-	".bin",
 	".partial",
 	".tar.gz",
 }