WIP

2026-02-03 03:02:38 -05:00 · 2025-09-17 21:52:53 +02:00
31 changed files with 1433 additions and 995 deletions
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
--- a/.github/workflows/image-pr.yml
+++ b/.github/workflows/image-pr.yml
@@ -34,6 +34,15 @@ jobs:
      fail-fast: false
      matrix:
        include:
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-gpu-nvidia-cuda-13'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
@@ -60,7 +69,7 @@ jobs:
            runs-on: 'ubuntu-latest'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'vulkan'
-            platforms: 'linux/amd64,linux/arm64'
+            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-vulkan-core'
            runs-on: 'ubuntu-latest'
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -100,8 +100,19 @@ jobs:
            skip-drivers: 'false'
            makeflags: "--jobs=4 --output-sync=target"
            aio: "-aio-gpu-nvidia-cuda-12"
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-13'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            skip-drivers: 'false'
+            makeflags: "--jobs=4 --output-sync=target"
+            aio: "-aio-gpu-nvidia-cuda-13"
          - build-type: 'vulkan'
-            platforms: 'linux/amd64,linux/arm64'
+            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-gpu-vulkan'
            runs-on: 'ubuntu-latest'
--- a/.github/workflows/localaibot_automerge.yml
+++ b/.github/workflows/localaibot_automerge.yml
@@ -6,8 +6,7 @@ permissions:
  contents: write
  pull-requests: write
  packages: read
-  issues: write # for Homebrew/actions/post-comment
-  actions: write # to dispatch publish workflow
+
 jobs:
  dependabot:
    runs-on: ubuntu-latest
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -18,7 +18,7 @@ jobs:
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.22.9
+        uses: securego/gosec@v2.22.8
        with:
          # we let the report trigger content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
--- a/40
+++ b/40
@@ -32,27 +32,15 @@ RUN <<EOT bash
    if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
        apt-get update && \
        apt-get install -y  --no-install-recommends \
-            software-properties-common pciutils sudo wget gpg-agent curl xz-utils && \
-            echo "vulkan" > /run/localai/capability && \
-        if [ "amd64" = "$TARGETARCH" ]; then
-            wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
-            wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
-            apt-get update && \
-            apt-get install -y \
-                vulkan-sdk && \
-            apt-get clean && \
-            rm -rf /var/lib/apt/lists/*
-        fi
-        if [ "arm64" = "$TARGETARCH" ]; then
-            # For ARM64, we need to build the Vulkan SDK manually as there are no packages available
-            mkdir vulkan && cd vulkan && curl -L -o vulkan-sdk.tar.xz https://github.com/mudler/vulkan-sdk-arm/releases/download/1.4.321.1/vulkansdk-ubuntu-22.04-arm-1.4.321.1.tar.xz && \
-            tar -xvf vulkan-sdk.tar.xz && \
-            rm vulkan-sdk.tar.xz && \
-            cd * && \
-            cp -rfv aarch64/* /usr/ && \
-            cd ../.. && \
-            rm -rf vulkan
-        fi
+            software-properties-common pciutils wget gpg-agent && \
+        wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+        wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+        apt-get update && \
+        apt-get install -y \
+            vulkan-sdk && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* && \
+        echo "vulkan" > /run/localai/capability
    fi
 EOT

@@ -90,16 +78,6 @@ RUN <<EOT bash
    fi
 EOT

-# https://github.com/NVIDIA/Isaac-GR00T/issues/343
-RUN <<EOT bash
-    if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "arm64" ]; then
-        wget https://developer.download.nvidia.com/compute/cudss/0.6.0/local_installers/cudss-local-tegra-repo-ubuntu2204-0.6.0_0.6.0-1_arm64.deb && \
-        dpkg -i cudss-local-tegra-repo-ubuntu2204-0.6.0_0.6.0-1_arm64.deb && \
-        cp /var/cudss-local-tegra-repo-ubuntu2204-0.6.0/cudss-*-keyring.gpg /usr/share/keyrings/ && \
-        apt-get update && apt-get -y install cudss
-    fi
-EOT
-
 # If we are building with clblas support, we need the libraries for the builds
 RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
        apt-get update && \
--- a/3
+++ b/3
@@ -429,9 +429,6 @@ docker-build-kitten-tts:
 docker-save-kitten-tts: backend-images
 	docker save local-ai-backend:kitten-tts -o backend-images/kitten-tts.tar

-docker-save-chatterbox: backend-images
-	docker save local-ai-backend:chatterbox -o backend-images/chatterbox.tar
-
 docker-build-kokoro:
 	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:kokoro -f backend/Dockerfile.python --build-arg BACKEND=kokoro ./backend

--- a/backend/Dockerfile.golang
+++ b/backend/Dockerfile.golang
@@ -37,27 +37,14 @@ RUN <<EOT bash
    if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
        apt-get update && \
        apt-get install -y  --no-install-recommends \
-            software-properties-common pciutils sudo wget gpg-agent curl xz-utils && \
-            echo "vulkan" > /run/localai/capability && \
-        if [ "amd64" = "$TARGETARCH" ]; then
-            wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
-            wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
-            apt-get update && \
-            apt-get install -y \
-                vulkan-sdk && \
-            apt-get clean && \
-            rm -rf /var/lib/apt/lists/*
-        fi
-        if [ "arm64" = "$TARGETARCH" ]; then
-            # For ARM64, we need to build the Vulkan SDK manually as there are no packages available
-            mkdir vulkan && cd vulkan && curl -L -o vulkan-sdk.tar.xz https://github.com/mudler/vulkan-sdk-arm/releases/download/1.4.321.1/vulkansdk-ubuntu-22.04-arm-1.4.321.1.tar.xz && \
-            tar -xvf vulkan-sdk.tar.xz && \
-            rm vulkan-sdk.tar.xz && \
-            cd * && \
-            cp -rfv aarch64/* /usr/ && \
-            cd ../.. && \
-            rm -rf vulkan
-        fi
+            software-properties-common pciutils wget gpg-agent && \
+        wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+        wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+        apt-get update && \
+        apt-get install -y \
+            vulkan-sdk && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
    fi
 EOT

--- a/backend/Dockerfile.llama-cpp
+++ b/backend/Dockerfile.llama-cpp
@@ -85,27 +85,14 @@ RUN <<EOT bash
    if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
        apt-get update && \
        apt-get install -y  --no-install-recommends \
-            software-properties-common pciutils sudo wget gpg-agent curl xz-utils libxcb1 libx11-6 && \
-            echo "vulkan" > /run/localai/capability && \
-        if [ "amd64" = "$TARGETARCH" ]; then
-            wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
-            wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
-            apt-get update && \
-            apt-get install -y \
-                vulkan-sdk && \
-            apt-get clean && \
-            rm -rf /var/lib/apt/lists/*
-        fi
-        if [ "arm64" = "$TARGETARCH" ]; then
-            # For ARM64, we need to build the Vulkan SDK manually as there are no packages available
-            mkdir vulkan && cd vulkan && curl -L -o vulkan-sdk.tar.xz https://github.com/mudler/vulkan-sdk-arm/releases/download/1.4.321.1/vulkansdk-ubuntu-22.04-arm-1.4.321.1.tar.xz && \
-            tar -xvf vulkan-sdk.tar.xz && \
-            rm vulkan-sdk.tar.xz && \
-            cd * && \
-            cp -rfv aarch64/* /usr/ && vulkaninfo \
-            cd ../.. && \
-            rm -rf vulkan
-        fi
+            software-properties-common pciutils wget gpg-agent && \
+        wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+        wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+        apt-get update && \
+        apt-get install -y \
+            vulkan-sdk && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
    fi
 EOT

--- a/backend/Dockerfile.python
+++ b/backend/Dockerfile.python
@@ -45,27 +45,14 @@ RUN <<EOT bash
    if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
        apt-get update && \
        apt-get install -y  --no-install-recommends \
-            software-properties-common pciutils sudo wget gpg-agent curl xz-utils && \
-            echo "vulkan" > /run/localai/capability && \
-        if [ "amd64" = "$TARGETARCH" ]; then
-            wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
-            wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
-            apt-get update && \
-            apt-get install -y \
-                vulkan-sdk && \
-            apt-get clean && \
-            rm -rf /var/lib/apt/lists/*
-        fi
-        if [ "arm64" = "$TARGETARCH" ]; then
-            # For ARM64, we need to build the Vulkan SDK manually as there are no packages available
-            mkdir vulkan && cd vulkan && curl -L -o vulkan-sdk.tar.xz https://github.com/mudler/vulkan-sdk-arm/releases/download/1.4.321.1/vulkansdk-ubuntu-22.04-arm-1.4.321.1.tar.xz && \
-            tar -xvf vulkan-sdk.tar.xz && \
-            rm vulkan-sdk.tar.xz && \
-            cd * && \
-            cp -rfv aarch64/* /usr/ && \
-            cd ../.. && \
-            rm -rf vulkan
-        fi
+            software-properties-common pciutils wget gpg-agent && \
+        wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+        wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+        apt-get update && \
+        apt-get install -y \
+            vulkan-sdk && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
    fi
 EOT

--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=4807e8f96a61b2adccebd5e57444c94d18de7264
+LLAMA_VERSION?=8ff206097c2bf3ca1c7aa95f9d6db779fc7bdd68
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
@@ -14,7 +14,7 @@ CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF

 CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 ifeq ($(NATIVE),false)
-	CMAKE_ARGS+=-DGGML_NATIVE=OFF -DLLAMA_OPENSSL=OFF
+	CMAKE_ARGS+=-DGGML_NATIVE=OFF
 endif
 # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -231,7 +231,6 @@ static void params_parse(const backend::ModelOptions* request,
    params.cpuparams.n_threads = request->threads();
    params.n_gpu_layers = request->ngpulayers();
    params.n_batch = request->nbatch();
-    params.n_ubatch = request->nbatch(); // fixes issue with reranking models being limited to 512 tokens (the default n_ubatch size); allows for setting the maximum input amount of tokens thereby avoiding this error "input is too large to process. increase the physical batch size"
    // Set params.n_parallel by environment variable (LLAMA_PARALLEL), defaults to 1
    //params.n_parallel = 1;
    const char *env_parallel = std::getenv("LLAMACPP_PARALLEL");
@@ -802,6 +801,11 @@ public:
            return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "\"documents\" must be a non-empty string array");
        }

+        // Tokenize the query
+        auto tokenized_query = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, request->query(), /* add_special */ false, true);
+        if (tokenized_query.size() != 1) {
+            return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "\"query\" must contain only a single prompt");
+        }
        // Create and queue the task
        json responses = json::array();
        bool error = false;
@@ -813,9 +817,10 @@ public:
                documents.push_back(request->documents(i));
            }
            
-            tasks.reserve(documents.size());
-            for (size_t i = 0; i < documents.size(); i++) {
-                auto tmp = format_rerank(ctx_server.model, ctx_server.vocab, ctx_server.mctx, request->query(), documents[i]);
+            auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, documents, /* add_special */ false, true);
+            tasks.reserve(tokenized_docs.size());
+            for (size_t i = 0; i < tokenized_docs.size(); i++) {
+                auto tmp = format_rerank(ctx_server.vocab, tokenized_query[0], tokenized_docs[i]);
                server_task task = server_task(SERVER_TASK_TYPE_RERANK);
                task.id = ctx_server.queue_tasks.get_new_id();
                task.index = i;
--- a/backend/go/whisper/Makefile
+++ b/backend/go/whisper/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=44fa2f647cf2a6953493b21ab83b50d5f5dbc483
+WHISPER_CPP_VERSION?=edea8a9c3cf0eb7676dcdb604991eb2f95c3d984

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -270,7 +270,6 @@
    nvidia: "cuda12-kokoro"
    intel: "intel-kokoro"
    amd: "rocm-kokoro"
-    nvidia-l4t: "nvidia-l4t-kokoro"
 - &coqui
  urls:
    - https://github.com/idiap/coqui-ai-TTS
@@ -353,7 +352,6 @@
    nvidia: "cuda12-chatterbox"
    metal: "metal-chatterbox"
    default: "cpu-chatterbox"
-    nvidia-l4t: "nvidia-l4t-arm64-chatterbox"
 - &piper
  name: "piper"
  uri: "quay.io/go-skynet/local-ai-backends:latest-piper"
@@ -1051,7 +1049,6 @@
    nvidia: "cuda12-kokoro-development"
    intel: "intel-kokoro-development"
    amd: "rocm-kokoro-development"
-    nvidia-l4t: "nvidia-l4t-kokoro-development"
 - !!merge <<: *kokoro
  name: "cuda11-kokoro-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-11-kokoro"
@@ -1077,16 +1074,6 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-kokoro"
  mirrors:
    - localai/localai-backends:master-gpu-intel-kokoro
- !!merge <<: *kokoro
-  name: "nvidia-l4t-kokoro"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-l4t-kokoro"
-  mirrors:
-    - localai/localai-backends:latest-gpu-nvidia-l4t-kokoro
- !!merge <<: *kokoro
-  name: "nvidia-l4t-kokoro-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-l4t-kokoro"
-  mirrors:
-    - localai/localai-backends:master-gpu-nvidia-l4t-kokoro
 - !!merge <<: *kokoro
  name: "cuda11-kokoro"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-11-kokoro"
@@ -1240,7 +1227,6 @@
    nvidia: "cuda12-chatterbox-development"
    metal: "metal-chatterbox-development"
    default: "cpu-chatterbox-development"
-    nvidia-l4t: "nvidia-l4t-arm64-chatterbox"
 - !!merge <<: *chatterbox
  name: "cpu-chatterbox"
  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-chatterbox"
@@ -1251,16 +1237,6 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-chatterbox"
  mirrors:
    - localai/localai-backends:master-cpu-chatterbox
- !!merge <<: *chatterbox
-  name: "nvidia-l4t-arm64-chatterbox"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-l4t-arm64-chatterbox"
-  mirrors:
-    - localai/localai-backends:latest-gpu-nvidia-l4t-arm64-chatterbox
- !!merge <<: *chatterbox
-  name: "nvidia-l4t-arm64-chatterbox-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-l4t-arm64-chatterbox"
-  mirrors:
-    - localai/localai-backends:master-gpu-nvidia-l4t-arm64-chatterbox
 - !!merge <<: *chatterbox
  name: "metal-chatterbox"
  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-chatterbox"
--- a/backend/python/chatterbox/backend.py
+++ b/backend/python/chatterbox/backend.py
@@ -14,23 +14,9 @@ import backend_pb2_grpc
 import torch
 import torchaudio as ta
 from chatterbox.tts import ChatterboxTTS
-from chatterbox.mtl_tts import ChatterboxMultilingualTTS
+
 import grpc

-def is_float(s):
-    """Check if a string can be converted to float."""
-    try:
-        float(s)
-        return True
-    except ValueError:
-        return False
-def is_int(s):
-    """Check if a string can be converted to int."""
-    try:
-        int(s)
-        return True
-    except ValueError:
-        return False

 _ONE_DAY_IN_SECONDS = 60 * 60 * 24

@@ -61,28 +47,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        if not torch.cuda.is_available() and request.CUDA:
            return backend_pb2.Result(success=False, message="CUDA is not available")

-
-        options = request.Options
-
-        # empty dict
-        self.options = {}
-
-        # The options are a list of strings in this form optname:optvalue
-        # We are storing all the options in a dict so we can use it later when
-        # generating the images
-        for opt in options:
-            if ":" not in opt:
-                continue
-            key, value = opt.split(":")
-            # if value is a number, convert it to the appropriate type
-            if is_float(value):
-                value = float(value)
-            elif is_int(value):
-                value = int(value)
-            elif value.lower() in ["true", "false"]:
-                value = value.lower() == "true"
-            self.options[key] = value
-
        self.AudioPath = None

        if os.path.isabs(request.AudioPath):
@@ -92,14 +56,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            modelFileBase = os.path.dirname(request.ModelFile)
            # modify LoraAdapter to be relative to modelFileBase
            self.AudioPath = os.path.join(modelFileBase, request.AudioPath)
+
        try:
            print("Preparing models, please wait", file=sys.stderr)
-            if "multilingual" in self.options:
-                # remove key from options
-                del self.options["multilingual"]
-                self.model = ChatterboxMultilingualTTS.from_pretrained(device=device)
-            else:
-                self.model = ChatterboxTTS.from_pretrained(device=device)
+            self.model = ChatterboxTTS.from_pretrained(device=device)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        # Implement your logic here for the LoadModel service
@@ -108,18 +68,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

    def TTS(self, request, context):
        try:
-            kwargs = {}
-
-            if "language" in self.options:
-                kwargs["language_id"] = self.options["language"]
-            if self.AudioPath is not None:
-                kwargs["audio_prompt_path"] = self.AudioPath
-
-            # add options to kwargs
-            kwargs.update(self.options)
-
            # Generate audio using ChatterboxTTS
-            wav = self.model.generate(request.text, **kwargs)
+            if self.AudioPath is not None:
+                wav = self.model.generate(request.text, audio_prompt_path=self.AudioPath)
+            else:
+                wav = self.model.generate(request.text)
+            
            # Save the generated audio
            ta.save(request.dst, wav, self.model.sr)
            
--- a/backend/python/chatterbox/install.sh
+++ b/backend/python/chatterbox/install.sh
@@ -15,6 +15,5 @@ fi
 if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi
-EXTRA_PIP_INSTALL_FLAGS+=" --no-build-isolation"

 installRequirements
--- a/backend/python/chatterbox/requirements-cpu.txt
+++ b/backend/python/chatterbox/requirements-cpu.txt
@@ -1,8 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 accelerate
-torch
-torchaudio
-transformers
-# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
-chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
-#chatterbox-tts==0.1.4
+torch==2.6.0
+torchaudio==2.6.0
+transformers==4.46.3
+chatterbox-tts==0.1.2
--- a/backend/python/chatterbox/requirements-cublas11.txt
+++ b/backend/python/chatterbox/requirements-cublas11.txt
@@ -2,6 +2,5 @@
 torch==2.6.0+cu118
 torchaudio==2.6.0+cu118
 transformers==4.46.3
-# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
-chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
+chatterbox-tts==0.1.2
 accelerate
--- a/backend/python/chatterbox/requirements-cublas12.txt
+++ b/backend/python/chatterbox/requirements-cublas12.txt
@@ -1,6 +1,5 @@
-torch
-torchaudio
-transformers
-# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
-chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
+torch==2.6.0
+torchaudio==2.6.0
+transformers==4.46.3
+chatterbox-tts==0.1.2
 accelerate
--- a/backend/python/chatterbox/requirements-hipblas.txt
+++ b/backend/python/chatterbox/requirements-hipblas.txt
@@ -1,7 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 torch==2.6.0+rocm6.1
 torchaudio==2.6.0+rocm6.1
-transformers
-# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
-chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
+transformers==4.46.3
+chatterbox-tts==0.1.2
 accelerate
--- a/backend/python/chatterbox/requirements-intel.txt
+++ b/backend/python/chatterbox/requirements-intel.txt
@@ -2,9 +2,8 @@
 intel-extension-for-pytorch==2.3.110+xpu
 torch==2.3.1+cxx11.abi
 torchaudio==2.3.1+cxx11.abi
-transformers
-# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
-chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
+transformers==4.46.3
+chatterbox-tts==0.1.2
 accelerate
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
--- a/backend/python/chatterbox/requirements-l4t.txt
+++ b/backend/python/chatterbox/requirements-l4t.txt
@@ -1,6 +0,0 @@
--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu126/
-torch
-torchaudio
-transformers
-chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
-accelerate
--- a/backend/python/diffusers/backend.py
+++ b/backend/python/diffusers/backend.py
@@ -66,20 +66,11 @@ from diffusers.schedulers import (
 )

 def is_float(s):
-    """Check if a string can be converted to float."""
    try:
        float(s)
        return True
    except ValueError:
        return False
-def is_int(s):
-    """Check if a string can be converted to int."""
-    try:
-        int(s)
-        return True
-    except ValueError:
-        return False
-

 # The scheduler list mapping was taken from here: https://github.com/neggles/animatediff-cli/blob/6f336f5f4b5e38e85d7f06f1744ef42d0a45f2a7/src/animatediff/schedulers.py#L39
 # Credits to https://github.com/neggles
@@ -186,11 +177,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                key, value = opt.split(":")
                # if value is a number, convert it to the appropriate type
                if is_float(value):
-                    value = float(value)
-                elif is_int(value):
-                    value = int(value)
-                elif value.lower() in ["true", "false"]:
-                    value = value.lower() == "true"
+                    if float(value).is_integer():  # value is still a str here; str has no is_integer()
+                        value = int(value) if value.strip().lstrip("+-").isdigit() else int(float(value))
+                    else:
+                        value = float(value)
                self.options[key] = value

            # From options, extract if present "torch_dtype" and set it to the appropriate type
--- a/backend/python/kokoro/requirements-l4t.txt
+++ b/backend/python/kokoro/requirements-l4t.txt
@@ -1,7 +0,0 @@
--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu126/
-torch
-torchaudio
-transformers
-accelerate
-kokoro
-soundfile
--- a/backend/python/mlx-audio/backend.py
+++ b/backend/python/mlx-audio/backend.py
@@ -20,21 +20,6 @@ import soundfile as sf
 import numpy as np
 import uuid

-def is_float(s):
-    """Check if a string can be converted to float."""
-    try:
-        float(s)
-        return True
-    except ValueError:
-        return False
-def is_int(s):
-    """Check if a string can be converted to int."""
-    try:
-        int(s)
-        return True
-    except ValueError:
-        return False
-
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24

 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
@@ -47,6 +32,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
    This backend provides TTS (Text-to-Speech) functionality using MLX-Audio.
    """

+    def _is_float(self, s):
+        """Return True when ``float(s)`` succeeds, False when it raises ValueError."""
+        try:
+            float(s)
+        except ValueError:
+            return False
+        return True
+
    def Health(self, request, context):
        """
        Returns a health check message.
@@ -87,10 +80,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                key, value = opt.split(":", 1)  # Split only on first colon to handle values with colons
                
                # Convert numeric values to appropriate types
-                if is_float(value):
-                    value = float(value)
-                elif is_int(value):
-                    value = int(value)
+                if self._is_float(value):
+                    if float(value).is_integer():  # e.g. "3", "1.0", "1e3" -> int; int("1.0") itself would raise ValueError
+                        value = int(value) if value.strip().lstrip("+-").isdigit() else int(float(value))
+                    else:
+                        value = float(value)
                elif value.lower() in ["true", "false"]:
                    value = value.lower() == "true"
                    
--- a/backend/python/mlx-vlm/backend.py
+++ b/backend/python/mlx-vlm/backend.py
@@ -21,21 +21,6 @@ import io
 from PIL import Image
 import tempfile

-def is_float(s):
-    """Check if a string can be converted to float."""
-    try:
-        float(s)
-        return True
-    except ValueError:
-        return False
-def is_int(s):
-    """Check if a string can be converted to int."""
-    try:
-        int(s)
-        return True
-    except ValueError:
-        return False
-
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24

 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
@@ -47,6 +32,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
    A gRPC servicer that implements the Backend service defined in backend.proto.
    """

+    def _is_float(self, s):
+        """Return True when ``float(s)`` succeeds, False when it raises ValueError."""
+        try:
+            float(s)
+        except ValueError:
+            return False
+        return True
+
    def Health(self, request, context):
        """
        Returns a health check message.
@@ -86,10 +79,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                    continue
                key, value = opt.split(":", 1)  # Split only on first colon to handle values with colons
                
-                if is_float(value):
-                    value = float(value)
-                elif is_int(value):
-                    value = int(value)
+                # Convert numeric values to appropriate types
+                if self._is_float(value):
+                    if float(value).is_integer():  # e.g. "3", "1.0", "1e3" -> int; int("1.0") itself would raise ValueError
+                        value = int(value) if value.strip().lstrip("+-").isdigit() else int(float(value))
+                    else:
+                        value = float(value)
                elif value.lower() in ["true", "false"]:
                    value = value.lower() == "true"
                    
--- a/backend/python/mlx/backend.py
+++ b/backend/python/mlx/backend.py
@@ -24,27 +24,20 @@ _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))

-def is_float(s):
-    """Check if a string can be converted to float."""
-    try:
-        float(s)
-        return True
-    except ValueError:
-        return False
-def is_int(s):
-    """Check if a string can be converted to int."""
-    try:
-        int(s)
-        return True
-    except ValueError:
-        return False
-
 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
    """
    A gRPC servicer that implements the Backend service defined in backend.proto.
    """

+    def _is_float(self, s):
+        """Return True when ``float(s)`` succeeds, False when it raises ValueError."""
+        try:
+            float(s)
+        except ValueError:
+            return False
+        return True
+
    def Health(self, request, context):
        """
        Returns a health check message.
@@ -85,10 +78,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                key, value = opt.split(":", 1)  # Split only on first colon to handle values with colons
                
                # Convert numeric values to appropriate types
-                if is_float(value):
-                    value = float(value)
-                elif is_int(value):
-                    value = int(value)
+                if self._is_float(value):
+                    if float(value).is_integer():  # e.g. "3", "1.0", "1e3" -> int; int("1.0") itself would raise ValueError
+                        value = int(value) if value.strip().lstrip("+-").isdigit() else int(float(value))
+                    else:
+                        value = float(value)
                elif value.lower() in ["true", "false"]:
                    value = value.lower() == "true"
                    
--- a/backend/python/transformers/requirements.txt
+++ b/backend/python/transformers/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.75.0
+grpcio==1.74.0
 protobuf==6.32.0
 certifi
 setuptools
--- a/docs/data/version.json
+++ b/docs/data/version.json
@@ -1,3 +1,3 @@
 {
-  "version": "v3.5.4"
+  "version": "v3.5.0"
 }
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -335,7 +335,7 @@
  url: "github:mudler/LocalAI/gallery/qwen-image.yaml@master"
  urls:
    - https://huggingface.co/Qwen/Qwen-Image-Edit
-  icon: https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/qwen_image_edit_logo.png
+  icon: https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/qwen_image_logo.png
  license: apache-2.0
  tags:
    - qwen-image
@@ -350,26 +350,6 @@
      cuda: true
      pipeline_type: QwenImageEditPipeline
      enable_parameters: num_inference_steps,image
- !!merge <<: *qwenimage
-  name: "qwen-image-edit-2509"
-  url: "github:mudler/LocalAI/gallery/qwen-image.yaml@master"
-  urls:
-    - https://huggingface.co/Qwen/Qwen-Image-Edit-2509
-  icon: https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/qwen_image_edit_logo.png
-  license: apache-2.0
-  tags:
-    - qwen-image
-    - gpu
-    - image-to-image
-  description: |
-    Qwen-Image-Edit is a model for image editing, which is based on Qwen-Image.
-  overrides:
-    parameters:
-      model: Qwen/Qwen-Image-Edit-2509
-    diffusers:
-      cuda: true
-      pipeline_type: QwenImageEditPipeline
-      enable_parameters: num_inference_steps,image
 - &gptoss
  name: "gpt-oss-20b"
  url: "github:mudler/LocalAI/gallery/harmony.yaml@master"
@@ -2658,39 +2638,6 @@
    - filename: Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf
      sha256: 1afefb3b369ea2de191f24fe8ea22cbbb7b412357902f27bd81d693dde35c2d9
      uri: huggingface://bartowski/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-GGUF/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf
- !!merge <<: *qwen3
-  name: "impish_qwen_14b-1m"
-  icon: https://huggingface.co/SicariusSicariiStuff/Impish_QWEN_14B-1M/resolve/main/Images/Impish_Qwen_14B.png
-  urls:
-    - https://huggingface.co/SicariusSicariiStuff/Impish_QWEN_14B-1M
-    - https://huggingface.co/mradermacher/Impish_QWEN_14B-1M-GGUF
-  description: |
-    Supreme context One million tokens to play with.
-    Strong Roleplay internet RP format lovers will appriciate it, medium size paragraphs.
-    Qwen smarts built-in, but naughty and playful Maybe it's even too naughty.
-    VERY compliant with low censorship.
-    VERY high IFeval for a 14B RP model: 78.68.
-  overrides:
-    parameters:
-      model: Impish_QWEN_14B-1M.Q4_K_M.gguf
-  files:
-    - filename: Impish_QWEN_14B-1M.Q4_K_M.gguf
-      sha256: d326f2b8f05814ea3943c82498f0cd3cde64859cf03f532855c87fb94b0da79e
-      uri: huggingface://mradermacher/Impish_QWEN_14B-1M-GGUF/Impish_QWEN_14B-1M.Q4_K_M.gguf
- !!merge <<: *qwen3
-  name: "aquif-3.5-a4b-think"
-  urls:
-    - https://huggingface.co/aquif-ai/aquif-3.5-A4B-Think
-    - https://huggingface.co/QuantFactory/aquif-3.5-A4B-Think-GGUF
-  description: |
-    The aquif-3.5 series is the successor to aquif-3, featuring a simplified naming scheme, expanded Mixture of Experts (MoE) options, and across-the-board performance improvements. This release streamlines model selection while delivering enhanced capabilities across reasoning, multilingual support, and general intelligence tasks.
-  overrides:
-    parameters:
-      model: aquif-3.5-A4B-Think.Q4_K_M.gguf
-  files:
-    - filename: aquif-3.5-A4B-Think.Q4_K_M.gguf
-      sha256: 1650b72ae1acf12b45a702f2ff5f47205552e494f0d910e81cbe40dfba55a6b9
-      uri: huggingface://QuantFactory/aquif-3.5-A4B-Think-GGUF/aquif-3.5-A4B-Think.Q4_K_M.gguf
 - &gemma3
  url: "github:mudler/LocalAI/gallery/gemma.yaml@master"
  name: "gemma-3-27b-it"
@@ -15228,27 +15175,6 @@
    - filename: Impish_Longtail_12B-Q4_K_M.gguf
      sha256: 2cf0cacb65d71cfc5b4255f3273ad245bbcb11956a0f9e3aaa0e739df57c90df
      uri: huggingface://SicariusSicariiStuff/Impish_Longtail_12B_GGUF/Impish_Longtail_12B-Q4_K_M.gguf
- !!merge <<: *mistral03
-  name: "mistralai_magistral-small-2509"
-  urls:
-    - https://huggingface.co/mistralai/Magistral-Small-2509
-    - https://huggingface.co/bartowski/mistralai_Magistral-Small-2509-GGUF
-  description: |
-    Magistral Small 1.2
-    Building upon Mistral Small 3.2 (2506), with added reasoning capabilities, undergoing SFT from Magistral Medium traces and RL on top, it's a small, efficient reasoning model with 24B parameters.
-
-    Magistral Small can be deployed locally, fitting within a single RTX 4090 or a 32GB RAM MacBook once quantized.
-
-    Learn more about Magistral in our blog post.
-
-    The model was presented in the paper Magistral.
-  overrides:
-    parameters:
-      model: mistralai_Magistral-Small-2509-Q4_K_M.gguf
-  files:
-    - filename: mistralai_Magistral-Small-2509-Q4_K_M.gguf
-      sha256: 1d638bc931de30d29fc73ad439206ff185f76666a096e7ad723866a20f78728d
-      uri: huggingface://bartowski/mistralai_Magistral-Small-2509-GGUF/mistralai_Magistral-Small-2509-Q4_K_M.gguf
 - &mudler
  url: "github:mudler/LocalAI/gallery/mudler.yaml@master" ### START mudler's LocalAI specific-models
  name: "LocalAI-llama3-8b-function-call-v0.2"
@@ -20410,9 +20336,9 @@
    - https://huggingface.co/ggerganov/whisper.cpp
  overrides:
    parameters:
-      model: ggml-base.bin
+      model: ggml-whisper-base.bin
  files:
-    - filename: "ggml-base.bin"
+    - filename: "ggml-whisper-base.bin"
      sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
      uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"
  description: |
@@ -20457,20 +20383,11 @@
  name: "whisper-large-q5_0"
  overrides:
    parameters:
-      model: ggml-large-v3-q5_0.bin
+      model: ggml-large-q5_0.bin
  files:
-    - filename: "ggml-large-v3-q5_0.bin"
-      uri: "huggingface://ggerganov/whisper.cpp/ggml-large-v3-q5_0.bin"
-      sha256: d75795ecff3f83b5faa89d1900604ad8c780abd5739fae406de19f23ecd98ad1
- !!merge <<: *whisper
-  name: "whisper-medium"
-  overrides:
-    parameters:
-      model: ggml-medium.bin
-  files:
-    - filename: "ggml-medium.bin"
-      uri: "huggingface://ggerganov/whisper.cpp/ggml-medium.bin"
-      sha256: 6c14d5adee5f86394037b4e4e8b59f1673b6cee10e3cf0b11bbdbee79c156208
+    - filename: "ggml-large-q5_0.bin"
+      uri: "huggingface://ggerganov/whisper.cpp/ggml-large-q5_0.bin"
+      sha256: 3a214837221e4530dbc1fe8d734f302af393eb30bd0ed046042ebf4baf70f6f2
 - !!merge <<: *whisper
  name: "whisper-medium-q5_0"
  overrides:
@@ -20498,6 +20415,15 @@
    - filename: "ggml-small.bin"
      uri: "huggingface://ggerganov/whisper.cpp/ggml-small.bin"
      sha256: 1be3a9b2063867b937e64e2ec7483364a79917e157fa98c5d94b5c1fffea987b
+- !!merge <<: *whisper
+  name: "whisper-small-en-tdrz"
+  overrides:
+    parameters:
+      model: ggml-small.en-tdrz.bin
+  files:
+    - filename: "ggml-small.en-tdrz.bin"
+      uri: "huggingface://akashmjn/tinydiarize-whisper.cpp/ggml-small.en-tdrz.bin"
+      sha256: ceac3ec06d1d98ef71aec665283564631055fd6129b79d8e1be4f9cc33cc54b4
 - !!merge <<: *whisper
  name: "whisper-small-en-q5_1"
  overrides:
@@ -20570,51 +20496,6 @@
    - filename: "ggml-tiny.en-q8_0.bin"
      uri: "huggingface://ggerganov/whisper.cpp/ggml-tiny.en-q8_0.bin"
      sha256: 5bc2b3860aa151a4c6e7bb095e1fcce7cf12c7b020ca08dcec0c6d018bb7dd94
- !!merge <<: *whisper
-  name: "whisper-large"
-  overrides:
-    parameters:
-      model: ggml-large-v3.bin
-  files:
-    - filename: "ggml-large-v3.bin"
-      uri: "huggingface://ggerganov/whisper.cpp/ggml-large-v3.bin"
-      sha256: 64d182b440b98d5203c4f9bd541544d84c605196c4f7b845dfa11fb23594d1e2
- !!merge <<: *whisper
-  name: "whisper-large-q5_0"
-  overrides:
-    parameters:
-      model: ggml-large-v3-q5_0.bin
-  files:
-    - filename: "ggml-large-v3-q5_0.bin"
-      uri: "huggingface://ggerganov/whisper.cpp/ggml-large-v3-q5_0.bin"
-      sha256: d75795ecff3f83b5faa89d1900604ad8c780abd5739fae406de19f23ecd98ad1
- !!merge <<: *whisper
-  name: "whisper-large-turbo"
-  overrides:
-    parameters:
-      model: ggml-large-v3-turbo.bin
-  files:
-    - filename: "ggml-large-v3-turbo.bin"
-      uri: "huggingface://ggerganov/whisper.cpp/ggml-large-v3-turbo.bin"
-      sha256: 1fc70f774d38eb169993ac391eea357ef47c88757ef72ee5943879b7e8e2bc69
- !!merge <<: *whisper
-  name: "whisper-large-turbo-q5_0"
-  overrides:
-    parameters:
-      model: ggml-large-v3-turbo-q5_0.bin
-  files:
-    - filename: "ggml-large-v3-turbo-q5_0.bin"
-      uri: "huggingface://ggerganov/whisper.cpp/ggml-large-v3-turbo-q5_0.bin"
-      sha256: 394221709cd5ad1f40c46e6031ca61bce88931e6e088c188294c6d5a55ffa7e2
- !!merge <<: *whisper
-  name: "whisper-large-turbo-q8_0"
-  overrides:
-    parameters:
-      model: ggml-large-v3-turbo-q8_0.bin
-  files:
-    - filename: "ggml-large-v3-turbo-q8_0.bin"
-      uri: "huggingface://ggerganov/whisper.cpp/ggml-large-v3-turbo-q8_0.bin"
-      sha256: 317eb69c11673c9de1e1f0d459b253999804ec71ac4c23c17ecf5fbe24e259a1
 ## Bert embeddings (llama3.2 drop-in)
 - !!merge <<: *llama32
  name: "bert-embeddings"
--- a/pkg/model/loader.go
+++ b/pkg/model/loader.go
@@ -95,7 +95,6 @@ var knownModelsNameSuffixToSkip []string = []string{
 	".DS_Store",
 	".",
 	".safetensors",
-	".bin",
 	".partial",
 	".tar.gz",
 }