feat(l4t): add support for extras images

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
chore(model gallery): add suayptalha_maestro-10b (#4760)
2026-05-24 16:51:44 -04:00 · 2025-02-06 11:53:07 +01:00 · 2025-02-04 09:51:54 +01:00 · 2025-02-04 09:50:18 +01:00 · 2025-02-04 09:45:52 +01:00 · 2025-02-04 08:57:19 +01:00
177 changed files with 2052 additions and 2583 deletions
--- a/.devcontainer/docker-compose-devcontainer.yml
+++ b/.devcontainer/docker-compose-devcontainer.yml
@@ -7,7 +7,7 @@ services:
      args:
      - FFMPEG=true
      - IMAGE_TYPE=extras
-      - GO_TAGS=stablediffusion p2p tts
+      - GO_TAGS=p2p tts
    env_file:
      - ../.env
    ports:
--- a/.env
+++ b/.env
@@ -38,12 +38,12 @@
 ## Uncomment and set to true to enable rebuilding from source
 # REBUILD=true
-## Enable go tags, available: stablediffusion, tts
+## Enable go tags, available: p2p, tts
-## stablediffusion: image generation with stablediffusion
+## p2p: enable distributed inferencing
 ## tts: enables text-to-speech with go-piper 
 ## (requires REBUILD=true)
 #
-# GO_TAGS=stablediffusion
+# GO_TAGS=p2p
 ## Path where to store generated images
 # LOCALAI_IMAGE_PATH=/tmp/generated/images
--- a/.github/workflows/dependabot_auto.yml
+++ b/.github/workflows/dependabot_auto.yml
@@ -14,7 +14,7 @@ jobs:
    steps:
      - name: Dependabot metadata
        id: metadata
-        uses: dependabot/fetch-metadata@v2.2.0
+        uses: dependabot/fetch-metadata@v2.3.0
        with:
          github-token: "${{ secrets.GITHUB_TOKEN }}"
          skip-commit-verification: true
--- a/.github/workflows/notify-models.yaml
+++ b/.github/workflows/notify-models.yaml
@@ -18,7 +18,7 @@ jobs:
      with:
        model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.co with: "huggingface://<repository>/file"
        # Check the PR diff using the current branch and the base branch of the PR
-    - uses: GrantBirki/git-diff-action@v2.7.0
+    - uses: GrantBirki/git-diff-action@v2.8.0
      id: git-diff-action
      with:
            json_diff_file_output: diff.json
@@ -99,7 +99,7 @@ jobs:
        docker run -e -ti -d --name local-ai -p 8080:8080 localai/localai:master-ffmpeg-core run --debug $MODEL_NAME
        until [ "`docker inspect -f {{.State.Health.Status}} local-ai`" == "healthy" ]; do echo "Waiting for container to be ready";  docker logs --tail 10 local-ai; sleep 2; done
      # Check the PR diff using the current branch and the base branch of the PR
-    - uses: GrantBirki/git-diff-action@v2.7.0
+    - uses: GrantBirki/git-diff-action@v2.8.0
      id: git-diff-action
      with:
            json_diff_file_output: diff.json
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -237,40 +237,7 @@ jobs:
          detached: true
          connect-timeout-seconds: 180
          limit-access-to-actor: true
-  build-stablediffusion:
+
    runs-on: ubuntu-latest
    steps:
      - name: Clone
        uses: actions/checkout@v4
        with:
          submodules: true
      - uses: actions/setup-go@v5
        with:
          go-version: '1.21.x'
          cache: false
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache upx-ucl
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
      - name: Build stablediffusion
        run: |
          export PATH=$PATH:$GOPATH/bin
          make backend-assets/grpc/stablediffusion
          mkdir -p release && cp backend-assets/grpc/stablediffusion release
        env:
          GO_TAGS: stablediffusion
      - uses: actions/upload-artifact@v4
        with:
          name: stablediffusion
          path: release/
      - name: Release
        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        with:
          files: |
            release/*
  build-macOS-x86_64:
    runs-on: macos-13
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -78,57 +78,6 @@ jobs:
          make --jobs=5 --output-sync=target -C backend/python/diffusers
          make --jobs=5 --output-sync=target -C backend/python/diffusers test
  tests-parler-tts:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
        uses: actions/checkout@v4
        with:
          submodules: true
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
          # Install UV
          curl -LsSf https://astral.sh/uv/install.sh | sh
          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
          pip install --user --no-cache-dir grpcio-tools==1.64.1
      - name: Test parler-tts
        run: |
           make --jobs=5 --output-sync=target -C backend/python/parler-tts
           make --jobs=5 --output-sync=target -C backend/python/parler-tts test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
        uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
          limit-access-to-actor: true
  tests-openvoice:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
        uses: actions/checkout@v4
        with:
          submodules: true
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
          # Install UV
          curl -LsSf https://astral.sh/uv/install.sh | sh
          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
          pip install --user --no-cache-dir grpcio-tools==1.64.1
      - name: Test openvoice
        run: |
           make --jobs=5 --output-sync=target -C backend/python/openvoice
           make --jobs=5 --output-sync=target -C backend/python/openvoice test
  # tests-transformers-musicgen:
  #   runs-on: ubuntu-latest
  #   steps:
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -105,9 +105,7 @@ jobs:
          # Pre-build piper before we start tests in order to have shared libraries in place
          make sources/go-piper && \
          GO_TAGS="tts" make -C sources/go-piper piper.o && \
-          sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
+          sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/
          # Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncnn)
          PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
        env:
          CUDA_VERSION: 12-4
      - name: Cache grpc
@@ -129,7 +127,7 @@ jobs:
          cd grpc && cd cmake/build && sudo make --jobs 5 install
      - name: Test
        run: |
-          PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
+          PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
        uses: mxschmitt/action-tmate@v3.19
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -26,7 +26,7 @@
                "LOCALAI_P2P": "true",
                "LOCALAI_FEDERATED": "true"
            },
-            "buildFlags": ["-tags", "stablediffusion p2p tts", "-v"],
+            "buildFlags": ["-tags", "p2p tts", "-v"],
            "envFile": "${workspaceFolder}/.env",
            "cwd": "${workspaceRoot}"
        }
--- a/55
+++ b/55
@@ -15,8 +15,7 @@ ARG TARGETARCH
 ARG TARGETVARIANT
 ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
@@ -69,14 +68,10 @@ ENV PATH=/opt/rocm/bin:${PATH}
 # OpenBLAS requirements and stable diffusion
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
-        libopenblas-dev \
+        libopenblas-dev && \
        libopencv-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
 # Set up OpenCV
 RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
 WORKDIR /build
 ###################################
@@ -251,7 +246,7 @@ RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shall
 FROM requirements-drivers AS builder-base
-ARG GO_TAGS="stablediffusion tts p2p"
+ARG GO_TAGS="tts p2p"
 ARG GRPC_BACKENDS
 ARG MAKEFLAGS
 ARG LD_FLAGS="-s -w"
@@ -285,35 +280,12 @@ RUN <<EOT bash
    fi
 EOT
 ###################################
 ###################################
 # This first portion of builder holds the layers specifically used to build backend-assets/grpc/stablediffusion
 # In most cases, builder is the image you should be using - however, this can save build time if one just needs to copy backend-assets/grpc/stablediffusion and nothing else.
 FROM builder-base AS builder-sd
 # stablediffusion does not tolerate a newer version of abseil, copy only over enough elements to build it
 COPY Makefile .
 COPY go.mod .
 COPY go.sum .
 COPY backend/backend.proto ./backend/backend.proto
 COPY backend/go/image/stablediffusion ./backend/go/image/stablediffusion
 COPY pkg/grpc ./pkg/grpc
 COPY pkg/stablediffusion ./pkg/stablediffusion
 RUN git init
 RUN make sources/go-stable-diffusion
 RUN touch prepare-sources
 # Actually build the backend
 RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make backend-assets/grpc/stablediffusion
 ###################################
 ###################################
 # The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
 # Adjustments to the build process should likely be made here.
-FROM builder-sd AS builder
+FROM builder-base AS builder
 # Install the pre-built GRPC
 COPY --from=grpc /opt/grpc /usr/local
@@ -331,7 +303,7 @@ RUN make prepare
 ## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
 ## (both will use CUDA or hipblas for the actual computation)
 RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
-        SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
+        SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
    else \
        make build; \
    fi
@@ -353,8 +325,6 @@ ARG FFMPEG
 COPY --from=grpc /opt/grpc /usr/local
 COPY --from=builder-sd /build/backend-assets/grpc/stablediffusion /build/backend-assets/grpc/stablediffusion
 COPY .devcontainer-scripts /.devcontainer-scripts
 # Add FFmpeg
@@ -384,12 +354,14 @@ FROM requirements-drivers
 ARG FFMPEG
 ARG BUILD_TYPE
 ARG BUILD_PLATFORM
 ARG TARGETARCH
 ARG IMAGE_TYPE=extras
 ARG EXTRA_BACKENDS
 ARG MAKEFLAGS
 ENV BUILD_TYPE=${BUILD_TYPE}
 ENV BUILD_PLATFORM=${BUILD_PLATFORM}
 ENV REBUILD=false
 ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz
 ENV MAKEFLAGS=${MAKEFLAGS}
@@ -427,9 +399,6 @@ COPY --from=builder /build/local-ai ./
 # Copy shared libraries for piper
 COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/
 # do not let stablediffusion rebuild (requires an older version of absl)
 COPY --from=builder-sd /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
 # Change the shell to bash so we can use [[ tests below
 SHELL ["/bin/bash", "-c"]
 # We try to strike a balance between individual layer size (as that affects total push time) and total image size
@@ -443,8 +412,8 @@ RUN if [[ ( "${IMAGE_TYPE}" == "extras ")]]; then \
 RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/coqui \
    ; fi && \
-    if [[ ( "${EXTRA_BACKENDS}" =~ "parler-tts" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "faster-whisper" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
-        make -C backend/python/parler-tts \
+        make -C backend/python/faster-whisper \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "diffusers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/diffusers \
@@ -453,9 +422,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAG
 RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/kokoro \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "openvoice" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/openvoice \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "exllama2" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/exllama2 \
    ; fi && \
@@ -474,9 +440,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "rerankers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/rerankers \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "mamba" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/mamba \
    ; fi
 # Make sure the models directory exists
--- a/88
+++ b/88
@@ -8,7 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=92bc493917d43b83e592349e138b54c90b1c3ea7
+CPPLLAMA_VERSION?=5598f475be3e31430fbe17ebb85654ec90dc201e
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
@@ -18,10 +18,6 @@ WHISPER_CPP_VERSION?=6266a9f9e56a5b925e9892acf650f3eb1245814d
 PIPER_REPO?=https://github.com/mudler/go-piper
 PIPER_VERSION?=e10ca041a885d4a8f3871d52924b47792d5e5aa0
 # stablediffusion version
 STABLEDIFFUSION_REPO?=https://github.com/mudler/go-stable-diffusion
 STABLEDIFFUSION_VERSION?=4a3cd6aeae6f66ee57eae9a0075f8c58c3a6a38f
 # bark.cpp
 BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
 BARKCPP_VERSION?=v1.0.0
@@ -179,11 +175,6 @@ ifeq ($(STATIC),true)
 	LD_FLAGS+=-linkmode external -extldflags -static
 endif
 ifeq ($(findstring stablediffusion,$(GO_TAGS)),stablediffusion)
 #	OPTIONAL_TARGETS+=go-stable-diffusion/libstablediffusion.a
 	OPTIONAL_GRPC+=backend-assets/grpc/stablediffusion
 endif
 ifeq ($(findstring tts,$(GO_TAGS)),tts)
 #	OPTIONAL_TARGETS+=go-piper/libpiper_binding.a
 #	OPTIONAL_TARGETS+=backend-assets/espeak-ng-data
@@ -195,6 +186,7 @@ endif
 ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
@@ -273,19 +265,6 @@ sources/go-piper:
 sources/go-piper/libpiper_binding.a: sources/go-piper
 	$(MAKE) -C sources/go-piper libpiper_binding.a example/main piper.o
 ## stable diffusion (onnx)
 sources/go-stable-diffusion:
 	mkdir -p sources/go-stable-diffusion
 	cd sources/go-stable-diffusion && \
 	git init && \
 	git remote add origin $(STABLEDIFFUSION_REPO) && \
 	git fetch origin && \
 	git checkout $(STABLEDIFFUSION_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch
 sources/go-stable-diffusion/libstablediffusion.a: sources/go-stable-diffusion
 	CPATH="$(CPATH):/usr/include/opencv4" $(MAKE) -C sources/go-stable-diffusion libstablediffusion.a
 ## stablediffusion (ggml)
 sources/stablediffusion-ggml.cpp:
 	git clone --recursive $(STABLEDIFFUSION_GGML_REPO) sources/stablediffusion-ggml.cpp && \
@@ -331,20 +310,18 @@ sources/whisper.cpp:
 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
 	cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
-get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp sources/go-stable-diffusion backend/cpp/llama/llama.cpp
+get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
 replace:
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
 	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
 	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
 dropreplace:
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
 	$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
 	$(GOCMD) mod edit -dropreplace github.com/mudler/go-stable-diffusion
 	$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
 prepare-sources: get-sources replace
@@ -355,7 +332,6 @@ rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
 	$(MAKE) -C sources/go-llama.cpp clean
 	$(MAKE) -C sources/whisper.cpp clean
 	$(MAKE) -C sources/go-stable-diffusion clean
 	$(MAKE) -C sources/go-piper clean
 	$(MAKE) build
@@ -470,7 +446,7 @@ prepare-test: grpcs
 test: prepare test-models/testmodel.ggml grpcs
 	@echo 'Running tests'
-	export GO_TAGS="tts stablediffusion debug"
+	export GO_TAGS="tts debug"
 	$(MAKE) prepare-test
 	HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf"  --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
@@ -558,10 +534,10 @@ protogen-go-clean:
 	$(RM) bin/*
 .PHONY: protogen-python
-protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen transformers-protogen parler-tts-protogen kokoro-protogen vllm-protogen openvoice-protogen
+protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
 .PHONY: protogen-python-clean
-protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean  exllama2-protogen-clean mamba-protogen-clean rerankers-protogen-clean transformers-protogen-clean parler-tts-protogen-clean kokoro-protogen-clean vllm-protogen-clean openvoice-protogen-clean
+protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean  exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
 .PHONY: autogptq-protogen
 autogptq-protogen:
@@ -595,6 +571,14 @@ diffusers-protogen:
 diffusers-protogen-clean:
 	$(MAKE) -C backend/python/diffusers protogen-clean
 .PHONY: faster-whisper-protogen
 faster-whisper-protogen:
 	$(MAKE) -C backend/python/faster-whisper protogen
 .PHONY: faster-whisper-protogen-clean
 faster-whisper-protogen-clean:
 	$(MAKE) -C backend/python/faster-whisper protogen-clean
 .PHONY: exllama2-protogen
 exllama2-protogen:
 	$(MAKE) -C backend/python/exllama2 protogen
@@ -603,14 +587,6 @@ exllama2-protogen:
 exllama2-protogen-clean:
 	$(MAKE) -C backend/python/exllama2 protogen-clean
 .PHONY: mamba-protogen
 mamba-protogen:
 	$(MAKE) -C backend/python/mamba protogen
 .PHONY: mamba-protogen-clean
 mamba-protogen-clean:
 	$(MAKE) -C backend/python/mamba protogen-clean
 .PHONY: rerankers-protogen
 rerankers-protogen:
 	$(MAKE) -C backend/python/rerankers protogen
@@ -627,14 +603,6 @@ transformers-protogen:
 transformers-protogen-clean:
 	$(MAKE) -C backend/python/transformers protogen-clean
 .PHONY: parler-tts-protogen
 parler-tts-protogen:
 	$(MAKE) -C backend/python/parler-tts protogen
 .PHONY: parler-tts-protogen-clean
 parler-tts-protogen-clean:
 	$(MAKE) -C backend/python/parler-tts protogen-clean
 .PHONY: kokoro-protogen
 kokoro-protogen:
 	$(MAKE) -C backend/python/kokoro protogen
@@ -643,14 +611,6 @@ kokoro-protogen:
 kokoro-protogen-clean:
 	$(MAKE) -C backend/python/kokoro protogen-clean
 .PHONY: openvoice-protogen
 openvoice-protogen:
 	$(MAKE) -C backend/python/openvoice protogen
 .PHONY: openvoice-protogen-clean
 openvoice-protogen-clean:
 	$(MAKE) -C backend/python/openvoice protogen-clean
 .PHONY: vllm-protogen
 vllm-protogen:
 	$(MAKE) -C backend/python/vllm protogen
@@ -666,13 +626,11 @@ prepare-extra-conda-environments: protogen-python
 	$(MAKE) -C backend/python/bark
 	$(MAKE) -C backend/python/coqui
 	$(MAKE) -C backend/python/diffusers
 	$(MAKE) -C backend/python/faster-whisper
 	$(MAKE) -C backend/python/vllm
 	$(MAKE) -C backend/python/mamba
 	$(MAKE) -C backend/python/rerankers
 	$(MAKE) -C backend/python/transformers
 	$(MAKE) -C backend/python/parler-tts
 	$(MAKE) -C backend/python/kokoro
 	$(MAKE) -C backend/python/openvoice
 	$(MAKE) -C backend/python/exllama2
 prepare-test-extra: protogen-python
@@ -742,6 +700,13 @@ backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc backend/cpp/llama/llama.
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2
 backend-assets/grpc/llama-cpp-avx512: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-avx512
 	$(MAKE) -C backend/cpp/llama-avx512 purge
 	$(info ${GREEN}I llama-cpp build info:avx512${RESET})
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx512" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-avx512/grpc-server backend-assets/grpc/llama-cpp-avx512
 backend-assets/grpc/llama-cpp-avx: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-avx
 	$(MAKE) -C backend/cpp/llama-avx purge
@@ -816,13 +781,6 @@ ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/piper
 endif
 backend-assets/grpc/stablediffusion: sources/go-stable-diffusion sources/go-stable-diffusion/libstablediffusion.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/go-stable-diffusion/:/usr/include/opencv4" LIBRARY_PATH=$(CURDIR)/sources/go-stable-diffusion/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/stablediffusion
 ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/stablediffusion
 endif
 backend-assets/grpc/silero-vad: backend-assets/grpc backend-assets/lib/libonnxruntime.so.1
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/onnxruntime/include/" LIBRARY_PATH=$(CURDIR)/backend-assets/lib \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/silero-vad ./backend/go/vad/silero
@@ -903,7 +861,7 @@ swagger:
 .PHONY: gen-assets
 gen-assets:
-	$(GOCMD) run core/dependencies_manager/manager.go embedded/webui_static.yaml core/http/static/assets
+	$(GOCMD) run core/dependencies_manager/manager.go webui_static.yaml core/http/static/assets
 ## Documentation
 docs/layouts/_default:
--- a/README.md
+++ b/README.md
@@ -39,7 +39,7 @@
 </p>
 <p align="center">
-<a href="https://trendshift.io/repositories/1484" target="_blank"><img src="https://trendshift.io/api/badge/repositories/1484" alt="go-skynet%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
+<a href="https://trendshift.io/repositories/5539" target="_blank"><img src="https://trendshift.io/api/badge/repositories/5539" alt="mudler%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
 </p>
 > :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
--- a/aio/cpu/image-gen.yaml
+++ b/aio/cpu/image-gen.yaml
@@ -1,56 +1,17 @@
 name: stablediffusion
-backend: stablediffusion
+backend: stablediffusion-ggml
 cfg_scale: 4.5
 options:
 - sampler:euler
 parameters:
-  model: stablediffusion_assets
+  model: stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf
-
+step: 25
 license: "BSD-3"
 urls:
 - https://github.com/EdVince/Stable-Diffusion-NCNN
 - https://github.com/EdVince/Stable-Diffusion-NCNN/blob/main/LICENSE
 description: |
     Stable Diffusion in NCNN with c++, supported txt2img and img2img
 download_files:
- filename: "stablediffusion_assets/AutoencoderKL-256-256-fp16-opt.param"
+- filename: "stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf"
-  sha256: "18ca4b66685e21406bcf64c484b3b680b4949900415536d599cc876579c85c82"
+  sha256: "b8944e9fe0b69b36ae1b5bb0185b3a7b8ef14347fe0fa9af6c64c4829022261f"
-  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-256-256-fp16-opt.param"
+  uri: "huggingface://second-state/stable-diffusion-v1-5-GGUF/stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf"
 - filename: "stablediffusion_assets/AutoencoderKL-512-512-fp16-opt.param"
  sha256: "cf45f63aacf3dbbab0f59ed92a6f2c14d9a1801314631cd3abe91e3c85639a20"
  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-512-512-fp16-opt.param"
 - filename: "stablediffusion_assets/AutoencoderKL-base-fp16.param"
  sha256: "0254a056dce61b0c27dc9ec1b78b53bcf55315c540f55f051eb841aa992701ba"
  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-base-fp16.param"
 - filename: "stablediffusion_assets/AutoencoderKL-encoder-512-512-fp16.bin"
  sha256: "ddcb79a9951b9f91e05e087739ed69da2c1c4ae30ba4168cce350b49d617c9fa"
  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-encoder-512-512-fp16.bin"
 - filename: "stablediffusion_assets/AutoencoderKL-fp16.bin"
  sha256: "f02e71f80e70252734724bbfaed5c4ddd3a8ed7e61bb2175ff5f53099f0e35dd"
  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-fp16.bin"
 - filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.bin"
  sha256: "1c9a12f4e1dd1b295a388045f7f28a2352a4d70c3dc96a542189a3dd7051fdd6"
  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/FrozenCLIPEmbedder-fp16.bin"
 - filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.param"
  sha256: "471afbe678dd1fd3fe764ef9c6eccaccb0a7d7e601f27b462aa926b20eb368c9"
  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/FrozenCLIPEmbedder-fp16.param"
 - filename: "stablediffusion_assets/log_sigmas.bin"
  sha256: "a2089f8aa4c61f9c200feaec541ab3f5c94233b28deb6d5e8bcd974fa79b68ac"
  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/raw/main/x86/linux/assets/log_sigmas.bin"
 - filename: "stablediffusion_assets/UNetModel-256-256-MHA-fp16-opt.param"
  sha256: "a58c380229f09491776df837b7aa7adffc0a87821dc4708b34535da2e36e3da1"
  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-256-256-MHA-fp16-opt.param"
 - filename: "stablediffusion_assets/UNetModel-512-512-MHA-fp16-opt.param"
  sha256: "f12034067062827bd7f43d1d21888d1f03905401acf6c6eea22be23c259636fa"
  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-512-512-MHA-fp16-opt.param"
 - filename: "stablediffusion_assets/UNetModel-base-MHA-fp16.param"
  sha256: "696f6975de49f4325b53ce32aff81861a6d6c07cd9ce3f0aae2cc405350af38d"
  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-base-MHA-fp16.param"
 - filename: "stablediffusion_assets/UNetModel-MHA-fp16.bin"
  sha256: "d618918d011bfc1f644c0f2a33bf84931bd53b28a98492b0a8ed6f3a818852c3"
  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/UNetModel-MHA-fp16.bin"
 - filename: "stablediffusion_assets/vocab.txt"
  sha256: "e30e57b6f1e47616982ef898d8922be24e535b4fa3d0110477b3a6f02ebbae7d"
  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/vocab.txt"
 usage: |
        curl http://localhost:8080/v1/images/generations \
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -21,7 +21,8 @@ service Backend {
  rpc Status(HealthMessage) returns (StatusResponse) {}
  rpc StoresSet(StoresSetOptions) returns (Result) {}
-  rpc StoresReset(StoresResetOptions) returns (Result) {}
+  rpc StoresDelete(StoresDeleteOptions) returns (Result) {}
  rpc StoresGet(StoresGetOptions) returns (StoresGetResult) {}
  rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}
  rpc Rerank(RerankRequest) returns (RerankResult) {}
@@ -77,10 +78,19 @@ message StoresSetOptions {
  repeated StoresValue Values = 2;
 }
-message StoresResetOptions {
+message StoresDeleteOptions {
  repeated StoresKey Keys = 1;
 }
 message StoresGetOptions {
  repeated StoresKey Keys = 1;
 }
 message StoresGetResult {
  repeated StoresKey Keys = 1;
  repeated StoresValue Values = 2;
 }
 message StoresFindOptions {
  StoresKey Key = 1;
  int32 TopK = 2;
@@ -153,6 +163,11 @@ message Reply {
  double timing_token_generation = 5;
 }
 // One grammar trigger entry. ModelOptions carries a repeated list of these
 // in its GrammarTriggers field.
 message GrammarTrigger {
  // Text of the trigger word.
  string word = 1;
  // NOTE(review): presumably marks a trigger that only applies at the very
  // start of the output; confirm against the backend that consumes it.
  bool at_start = 2; 
 }
 message ModelOptions {
  string Model = 1;
  int32 ContextSize = 2;
@@ -237,6 +252,8 @@ message ModelOptions {
  string CacheTypeKey = 63;
  string CacheTypeValue = 64;
  repeated GrammarTrigger GrammarTriggers = 65;
 }
 message Result {
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -468,6 +468,9 @@ struct llama_server_context
    bool add_bos_token      = true;
    bool has_eos_token      = true;
    // Lazy-grammar settings for this server context. params_parse sets
    // grammar_lazy to true and fills grammar_trigger_words when the model
    // options carry grammar triggers; both values are copied into
    // slot->sparams together with the other sampling parameters.
    bool grammar_lazy = false;
    std::vector<common_grammar_trigger> grammar_trigger_words;
    int32_t n_ctx;  // total context for all clients / slots
    // system prompt
@@ -706,6 +709,8 @@ struct llama_server_context
        slot->sparams.grammar           = json_value(data, "grammar",           default_sparams.grammar);
        slot->sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);
        slot->sparams.min_keep          = json_value(data, "min_keep",          default_sparams.min_keep);
        slot->sparams.grammar_trigger_words = grammar_trigger_words;
        slot->sparams.grammar_lazy = grammar_lazy;
        if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
            // Might be better to reject the request with a 400 ?
@@ -2374,6 +2379,21 @@ static void params_parse(const backend::ModelOptions* request,
    if ( request->ropefreqscale() != 0.0f ) {
        params.rope_freq_scale = request->ropefreqscale();
    }
    if (request->grammartriggers_size() > 0) {
        LOG_INFO("configuring grammar triggers", {});
        llama.grammar_lazy = true;
        for (int i = 0; i < request->grammartriggers_size(); i++) {
            common_grammar_trigger trigger;
            trigger.word = request->grammartriggers(i).word();
            trigger.at_start = request->grammartriggers(i).at_start();
            llama.grammar_trigger_words.push_back(trigger);
            LOG_INFO("grammar trigger", {
                { "word", trigger.word },
                { "at_start", trigger.at_start }
            });
        }
    }
 }
@@ -2522,6 +2542,18 @@ public:
        return grpc::Status::OK;
    }
    // Tokenizes the prompt carried by the request and appends every resulting
    // token id to the response, in order. Always returns grpc::Status::OK.
    // NOTE(review): the meaning of the `false` flags passed to parse_options
    // and tokenize is defined by those helpers — confirm before changing them.
    grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response){
        json data = parse_options(false, request, llama);
        const std::vector<llama_token> prompt_tokens = llama.tokenize(data["prompt"], false);
        for (const llama_token token : prompt_tokens) {
            response->add_tokens(token);
        }
        return grpc::Status::OK;
    }
    grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
        llama_client_slot* active_slot = llama.get_active_slot();
--- a/backend/go/image/stablediffusion/main.go
+++ b/backend/go/image/stablediffusion/main.go
@@ -1,21 +0,0 @@
 package main
 // Note: this is started internally by LocalAI and a server is allocated for each model
 import (
 	"flag"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )
 var (
 	addr = flag.String("addr", "localhost:50051", "the address to connect to")
 )
 func main() {
 	flag.Parse()
 	if err := grpc.StartServer(*addr, &Image{}); err != nil {
 		panic(err)
 	}
 }
--- a/backend/go/image/stablediffusion/stablediffusion.go
+++ b/backend/go/image/stablediffusion/stablediffusion.go
@@ -1,33 +0,0 @@
 package main
 // This is a wrapper to satisfy the GRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/LocalAI/pkg/stablediffusion"
 )
 type Image struct {
 	base.SingleThread
 	stablediffusion *stablediffusion.StableDiffusion
 }
 func (image *Image) Load(opts *pb.ModelOptions) error {
 	var err error
 	// Note: the Model here is a path to a directory containing the model files
 	image.stablediffusion, err = stablediffusion.New(opts.ModelFile)
 	return err
 }
 func (image *Image) GenerateImage(opts *pb.GenerateImageRequest) error {
 	return image.stablediffusion.GenerateImage(
 		int(opts.Height),
 		int(opts.Width),
 		int(opts.Mode),
 		int(opts.Step),
 		int(opts.Seed),
 		opts.PositivePrompt,
 		opts.NegativePrompt,
 		opts.Dst)
 }
--- a/backend/go/stores/store.go
+++ b/backend/go/stores/store.go
@@ -4,36 +4,101 @@ package main
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"container/heap"
 	"context"
 	"fmt"
 	"math"
-	"runtime"
+	"slices"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	chromem "github.com/philippgille/chromem-go"
 	"github.com/rs/zerolog/log"
 )
 type Store struct {
 	base.SingleThread
-	*chromem.DB
+
-	*chromem.Collection
+	// The sorted keys
 	keys [][]float32
 	// The sorted values
 	values [][]byte
 	// If for every K it holds that ||k||^2 = 1, then we can use the normalized distance functions
 	// TODO: Should we normalize incoming keys if they are not instead?
 	keysAreNormalized bool
 	// The first key decides the length of the keys
 	keyLen int
 }
 // TODO: Only used for sorting using Go's builtin implementation. The interfaces are columnar because
 // that's theoretically best for memory layout and cache locality, but this isn't optimized yet.
 type Pair struct {
 	Key   []float32
 	Value []byte
 }
 func NewStore() *Store {
-	return &Store{}
+	return &Store{
 		keys:              make([][]float32, 0),
 		values:            make([][]byte, 0),
 		keysAreNormalized: true,
 		keyLen:            -1,
 	}
 }
 // compareSlices orders two keys element by element using slices.Compare:
 // the result is negative when k1 sorts before k2, zero when they are equal
 // and positive otherwise. Both keys are asserted to have the same length.
 func compareSlices(k1, k2 []float32) int {
 	n1, n2 := len(k1), len(k2)
 	assert(n1 == n2, fmt.Sprintf("compareSlices: len(k1) = %d, len(k2) = %d", n1, n2))
 	return slices.Compare(k1, k2)
 }
 func hasKey(unsortedSlice [][]float32, target []float32) bool {
 	return slices.ContainsFunc(unsortedSlice, func(k []float32) bool {
 		return compareSlices(k, target) == 0
 	})
 }
 // findInSortedSlice binary-searches sortedSlice (which must already be
 // ordered by compareSlices) for target. It returns the index reported by
 // slices.BinarySearchFunc and whether an equal key was found there.
 func findInSortedSlice(sortedSlice [][]float32, target []float32) (int, bool) {
 	idx, found := slices.BinarySearchFunc(sortedSlice, target, compareSlices)
 	return idx, found
 }
 // isSortedPairs reports whether kvs is in non-decreasing key order, i.e. no
 // pair's key compares greater than the key of the pair that follows it.
 func isSortedPairs(kvs []Pair) bool {
 	for idx, current := range kvs {
 		if idx == 0 {
 			continue
 		}
 		if compareSlices(kvs[idx-1].Key, current.Key) > 0 {
 			return false
 		}
 	}
 	return true
 }
 // isSortedKeys reports whether keys is in non-decreasing order according to
 // compareSlices. Slices with fewer than two keys are trivially sorted.
 func isSortedKeys(keys [][]float32) bool {
 	if len(keys) < 2 {
 		return true
 	}
 	previous := keys[0]
 	for _, current := range keys[1:] {
 		if compareSlices(previous, current) > 0 {
 			return false
 		}
 		previous = current
 	}
 	return true
 }
 // sortIntoKeySlicese collects the Floats of every request key into a new
 // outer slice and sorts it with compareSlices. The float slices themselves
 // are shared with the request, not copied.
 func sortIntoKeySlicese(keys []*pb.StoresKey) [][]float32 {
 	sorted := make([][]float32, 0, len(keys))
 	for _, key := range keys {
 		sorted = append(sorted, key.Floats)
 	}
 	slices.SortFunc(sorted, compareSlices)
 	assert(len(sorted) == len(keys), fmt.Sprintf("len(ks) = %d, len(keys) = %d", len(sorted), len(keys)))
 	assert(isSortedKeys(sorted), "keys are not sorted")
 	return sorted
 }
 // Load creates a new chromem DB containing a single collection named
 // "all-documents" and keeps both handles on the Store. opts is not read.
 // It returns the error from CreateCollection, if any.
 func (s *Store) Load(opts *pb.ModelOptions) error {
 	db := chromem.NewDB()
 	collection, err := db.CreateCollection("all-documents", nil, nil)
 	if err != nil {
 		return err
 	}
 	// Only publish the handles once the collection exists.
 	s.DB = db
 	s.Collection = collection
 	return nil
 }
@@ -46,25 +111,156 @@ func (s *Store) StoresSet(opts *pb.StoresSetOptions) error {
 	if len(opts.Keys) != len(opts.Values) {
 		return fmt.Errorf("len(keys) = %d, len(values) = %d", len(opts.Keys), len(opts.Values))
 	}
-	docs := []chromem.Document{}
+
 	if s.keyLen == -1 {
 		s.keyLen = len(opts.Keys[0].Floats)
 	} else {
 		if len(opts.Keys[0].Floats) != s.keyLen {
 			return fmt.Errorf("Try to add key with length %d when existing length is %d", len(opts.Keys[0].Floats), s.keyLen)
 		}
 	}
 	kvs := make([]Pair, len(opts.Keys))
 	for i, k := range opts.Keys {
-		docs = append(docs, chromem.Document{
+		if s.keysAreNormalized && !isNormalized(k.Floats) {
-			ID:      k.String(),
+			s.keysAreNormalized = false
-			Content: opts.Values[i].String(),
+			var sample []float32
-		})
+			// Cap the logged sample at five elements based on the key's own
+			// length. The number of keys already stored says nothing about
+			// how long this key is, so it must not guard the slice below.
+			if len(k.Floats) > 5 {
 				sample = k.Floats[:5]
 			} else {
 				sample = k.Floats
 			}
 			log.Debug().Msgf("Key is not normalized: %v", sample)
 		}
 		kvs[i] = Pair{
 			Key:   k.Floats,
 			Value: opts.Values[i].Bytes,
 		}
 	}
-	return s.Collection.AddDocuments(context.Background(), docs, runtime.NumCPU())
+	slices.SortFunc(kvs, func(a, b Pair) int {
 		return compareSlices(a.Key, b.Key)
 	})
 	assert(len(kvs) == len(opts.Keys), fmt.Sprintf("len(kvs) = %d, len(opts.Keys) = %d", len(kvs), len(opts.Keys)))
 	assert(isSortedPairs(kvs), "keys are not sorted")
 	l := len(kvs) + len(s.keys)
 	merge_ks := make([][]float32, 0, l)
 	merge_vs := make([][]byte, 0, l)
 	i, j := 0, 0
 	for {
 		if i+j >= l {
 			break
 		}
 		if i >= len(kvs) {
 			merge_ks = append(merge_ks, s.keys[j])
 			merge_vs = append(merge_vs, s.values[j])
 			j++
 			continue
 		}
 		if j >= len(s.keys) {
 			merge_ks = append(merge_ks, kvs[i].Key)
 			merge_vs = append(merge_vs, kvs[i].Value)
 			i++
 			continue
 		}
 		c := compareSlices(kvs[i].Key, s.keys[j])
 		if c < 0 {
 			merge_ks = append(merge_ks, kvs[i].Key)
 			merge_vs = append(merge_vs, kvs[i].Value)
 			i++
 		} else if c > 0 {
 			merge_ks = append(merge_ks, s.keys[j])
 			merge_vs = append(merge_vs, s.values[j])
 			j++
 		} else {
 			merge_ks = append(merge_ks, kvs[i].Key)
 			merge_vs = append(merge_vs, kvs[i].Value)
 			i++
 			j++
 		}
 	}
 	assert(len(merge_ks) == l, fmt.Sprintf("len(merge_ks) = %d, l = %d", len(merge_ks), l))
 	assert(isSortedKeys(merge_ks), "merge keys are not sorted")
 	s.keys = merge_ks
 	s.values = merge_vs
 	return nil
 }
-func (s *Store) StoresReset(opts *pb.StoresResetOptions) error {
+func (s *Store) StoresDelete(opts *pb.StoresDeleteOptions) error {
-	err := s.DB.DeleteCollection("all-documents")
+	if len(opts.Keys) == 0 {
-	if err != nil {
+		return fmt.Errorf("no keys to delete")
 		return err
 	}
-	s.Collection, err = s.CreateCollection("all-documents", nil, nil)
+
-	return err
+	if len(opts.Keys) == 0 {
 		return fmt.Errorf("no keys to add")
 	}
 	if s.keyLen == -1 {
 		s.keyLen = len(opts.Keys[0].Floats)
 	} else {
 		if len(opts.Keys[0].Floats) != s.keyLen {
 			return fmt.Errorf("Trying to delete key with length %d when existing length is %d", len(opts.Keys[0].Floats), s.keyLen)
 		}
 	}
 	ks := sortIntoKeySlicese(opts.Keys)
 	l := len(s.keys) - len(ks)
 	merge_ks := make([][]float32, 0, l)
 	merge_vs := make([][]byte, 0, l)
 	tail_ks := s.keys
 	tail_vs := s.values
 	for _, k := range ks {
 		j, found := findInSortedSlice(tail_ks, k)
 		if found {
 			merge_ks = append(merge_ks, tail_ks[:j]...)
 			merge_vs = append(merge_vs, tail_vs[:j]...)
 			tail_ks = tail_ks[j+1:]
 			tail_vs = tail_vs[j+1:]
 		} else {
 			assert(!hasKey(s.keys, k), fmt.Sprintf("Key exists, but was not found: t=%d, %v", len(tail_ks), k))
 		}
 		log.Debug().Msgf("Delete: found = %v, t = %d, j = %d, len(merge_ks) = %d, len(merge_vs) = %d", found, len(tail_ks), j, len(merge_ks), len(merge_vs))
 	}
 	merge_ks = append(merge_ks, tail_ks...)
 	merge_vs = append(merge_vs, tail_vs...)
 	assert(len(merge_ks) <= len(s.keys), fmt.Sprintf("len(merge_ks) = %d, len(s.keys) = %d", len(merge_ks), len(s.keys)))
 	s.keys = merge_ks
 	s.values = merge_vs
 	assert(len(s.keys) >= l, fmt.Sprintf("len(s.keys) = %d, l = %d", len(s.keys), l))
 	assert(isSortedKeys(s.keys), "keys are not sorted")
 	assert(func() bool {
 		for _, k := range ks {
 			if _, found := findInSortedSlice(s.keys, k); found {
 				return false
 			}
 		}
 		return true
 	}(), "Keys to delete still present")
 	if len(s.keys) != l {
 		log.Debug().Msgf("Delete: Some keys not found: len(s.keys) = %d, l = %d", len(s.keys), l)
 	}
 	return nil
 }
 func (s *Store) StoresGet(opts *pb.StoresGetOptions) (pb.StoresGetResult, error) {
@@ -115,12 +311,16 @@ func (s *Store) StoresGet(opts *pb.StoresGetOptions) (pb.StoresGetResult, error)
 }
 func isNormalized(k []float32) bool {
-	var sum float32
+	var sum float64
 	for _, v := range k {
-		sum += v
+		v64 := float64(v)
 		sum += v64*v64
 	}
-	return sum == 1.0
+	s := math.Sqrt(sum)
 	return s >= 0.99 && s <= 1.01
 }
 // TODO: This we could replace with handwritten SIMD code
@@ -132,7 +332,7 @@ func normalizedCosineSimilarity(k1, k2 []float32) float32 {
 		dot += k1[i] * k2[i]
 	}
-	assert(dot >= -1 && dot <= 1, fmt.Sprintf("dot = %f", dot))
+	assert(dot >= -1.01 && dot <= 1.01, fmt.Sprintf("dot = %f", dot))
 	// 2.0 * (1.0 - dot) would be the Euclidean distance
 	return dot
@@ -222,7 +422,7 @@ func cosineSimilarity(k1, k2 []float32, mag1 float64) float32 {
 	sim := float32(dot / (mag1 * math.Sqrt(mag2)))
-	assert(sim >= -1 && sim <= 1, fmt.Sprintf("sim = %f", sim))
+	assert(sim >= -1.01 && sim <= 1.01, fmt.Sprintf("sim = %f", sim))
 	return sim
 }
--- a/backend/python/autogptq/requirements-l4t.txt
+++ b/backend/python/autogptq/requirements-l4t.txt
@@ -0,0 +1,2 @@
 --index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
 torch
--- a/backend/python/autogptq/requirements.txt
+++ b/backend/python/autogptq/requirements.txt
@@ -1,6 +1,6 @@
 accelerate
 auto-gptq==0.7.1
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 certifi
 transformers
--- a/backend/python/bark/requirements-l4t.txt
+++ b/backend/python/bark/requirements-l4t.txt
@@ -0,0 +1,5 @@
 --index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
 torch
 torchaudio
 transformers
 accelerate
--- a/backend/python/bark/requirements.txt
+++ b/backend/python/bark/requirements.txt
@@ -1,4 +1,4 @@
 bark==0.1.5
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 certifi
--- a/backend/python/common/libbackend.sh
+++ b/backend/python/common/libbackend.sh
@@ -132,11 +132,16 @@ function installRequirements() {
    declare -a requirementFiles=(
        "${EDIR}/requirements-install.txt"
        "${EDIR}/requirements.txt"
        "${EDIR}/requirements-${BUILD_TYPE}.txt"
    )
-    if [ "x${BUILD_TYPE}" != "x${BUILD_PROFILE}" ]; then
+    if [ -n "${BUILD_PLATFORM}" ]; then
-        requirementFiles+=("${EDIR}/requirements-${BUILD_PROFILE}.txt")
+        requirementFiles+=("${EDIR}/requirements-${BUILD_PLATFORM}.txt")
    else
        requirementFiles+=("${EDIR}/requirements-${BUILD_TYPE}.txt")
        if [ "x${BUILD_TYPE}" != "x${BUILD_PROFILE}" ]; then
            requirementFiles+=("${EDIR}/requirements-${BUILD_PROFILE}.txt")
        fi
    fi
    # if BUILD_TYPE is empty, we are a CPU build, so we should try to install the CPU requirements
@@ -146,8 +151,14 @@ function installRequirements() {
    requirementFiles+=("${EDIR}/requirements-after.txt")
-    if [ "x${BUILD_TYPE}" != "x${BUILD_PROFILE}" ]; then
+    if [ -n "${BUILD_PLATFORM}" ]; then
-        requirementFiles+=("${EDIR}/requirements-${BUILD_PROFILE}-after.txt")
+        requirementFiles+=("${EDIR}/requirements-${BUILD_PLATFORM}-after.txt")
    else
        if [ "x${BUILD_TYPE}" != "x${BUILD_PROFILE}" ]; then
            requirementFiles+=("${EDIR}/requirements-${BUILD_PROFILE}-after.txt")
        else
            requirementFiles+=("${EDIR}/requirements-${BUILD_TYPE}-after.txt")
        fi
    fi
    for reqFile in ${requirementFiles[@]}; do
--- a/backend/python/common/template/requirements.txt
+++ b/backend/python/common/template/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 grpcio-tools
--- a/backend/python/coqui/requirements-l4t.txt
+++ b/backend/python/coqui/requirements-l4t.txt
@@ -0,0 +1,6 @@
 --index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
 torch
 torchaudio
 transformers
 accelerate
 coqui-tts
--- a/backend/python/coqui/requirements.txt
+++ b/backend/python/coqui/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 certifi
 packaging==24.1
--- a/backend/python/diffusers/requirements-l4t.txt
+++ b/backend/python/diffusers/requirements-l4t.txt
@@ -0,0 +1,10 @@
 --index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
 torch
 diffusers
 opencv-python
 transformers
 accelerate
 compel
 peft
 sentencepiece
 optimum-quanto
--- a/backend/python/diffusers/requirements.txt
+++ b/backend/python/diffusers/requirements.txt
@@ -1,5 +1,5 @@
 setuptools
-grpcio==1.69.0
+grpcio==1.70.0
 pillow
 protobuf
 certifi
--- a/backend/python/exllama2/requirements-l4t.txt
+++ b/backend/python/exllama2/requirements-l4t.txt
@@ -0,0 +1,4 @@
 --index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
 torch
 transformers
 accelerate
--- a/backend/python/exllama2/requirements.txt
+++ b/backend/python/exllama2/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 certifi
 wheel
--- a/backend/python/faster-whisper/Makefile
+++ b/backend/python/faster-whisper/Makefile
@@ -1,8 +1,9 @@
 .DEFAULT_GOAL := install
 .PHONY: install
-install: protogen
+install:
 	bash install.sh
 	$(MAKE) protogen
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
@@ -12,14 +13,8 @@ protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
+	bash protogen.sh
 .PHONY: clean
 clean: protogen-clean
-	rm -rf venv __pycache__
+	rm -rf venv __pycache__
 # Run the backend test script once the gRPC stubs have been generated.
 .PHONY: test
 test: protogen
 	@echo "Testing faster-whisper..."
 	bash test.sh
 	@echo "faster-whisper tested."
--- a/backend/python/faster-whisper/backend.py
+++ b/backend/python/faster-whisper/backend.py
@@ -0,0 +1,94 @@
 #!/usr/bin/env python3
 """
 This is an extra gRPC server of LocalAI for faster-whisper audio transcription
 """
 from concurrent import futures
 import time
 import argparse
 import signal
 import sys
 import os
 import backend_pb2
 import backend_pb2_grpc
 from faster_whisper import WhisperModel
 import grpc
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
 COQUI_LANGUAGE = os.environ.get('COQUI_LANGUAGE', None)
 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
    """
    gRPC servicer that loads a faster-whisper model and serves audio
    transcription requests with it.
    """
    def Health(self, request, context):
        """Health probe: always replies with the bytes b"OK"."""
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
    def LoadModel(self, request, context):
        """
        Instantiate the WhisperModel named by request.Model and keep it on
        self.model.

        The model runs on CUDA when request.CUDA is set, otherwise on CPU.
        Any exception raised while loading is reported as a failed Result
        rather than propagated to the gRPC layer.
        """
        device = "cuda" if request.CUDA else "cpu"
        # float16 is only requested on CUDA. CTranslate2 rejects an explicit
        # float16 compute type on devices without efficient float16 support
        # (CPU in particular), so "default" is used there to let the runtime
        # choose a supported type.
        compute_type = "float16" if device == "cuda" else "default"
        try:
            print("Preparing models, please wait", file=sys.stderr)
            self.model = WhisperModel(request.Model, device=device, compute_type=compute_type)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)
    def AudioTranscription(self, request, context):
        """
        Transcribe request.dst with the loaded model.

        Returns a TranscriptResult holding one TranscriptSegment per decoded
        segment (ids start at 0) and the concatenation of all segment texts.
        Errors are logged to stderr and whatever was collected before the
        failure is still returned, so the result may be partial or empty.
        """
        resultSegments = []
        text = ""
        try:
            segments, info = self.model.transcribe(request.dst, beam_size=5, condition_on_previous_text=False)
            # Segments are consumed one at a time inside the try block so a
            # failure part-way through keeps the segments already collected.
            for segment_id, segment in enumerate(segments):
                print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
                resultSegments.append(backend_pb2.TranscriptSegment(id=segment_id, start=segment.start, end=segment.end, text=segment.text))
                text += segment.text
        except Exception as err:
            print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr)
        return backend_pb2.TranscriptResult(segments=resultSegments, text=text)
 def serve(address):
    """
    Start the gRPC server on `address` (insecure port) and block until the
    process is interrupted. SIGINT and SIGTERM stop the server and exit.
    """
    worker_pool = futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)
    server = grpc.server(worker_pool)
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)
    # Shared handler for both termination signals
    def shutdown(sig, frame):
        print("Received termination signal. Shutting down...")
        server.stop(0)
        sys.exit(0)
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, shutdown)
    # Keep the main thread alive; the server runs on the worker pool
    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the gRPC server.")
    parser.add_argument(
        "--addr", default="localhost:50051", help="The address to bind the server to."
    )
    args = parser.parse_args()
    serve(args.addr)
--- a/backend/python/faster-whisper/install.sh
+++ b/backend/python/faster-whisper/install.sh
@@ -12,5 +12,3 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
 fi
 installRequirements
 python -m unidic download
--- a/backend/python/faster-whisper/protogen.sh
+++ b/backend/python/faster-whisper/protogen.sh
--- a/backend/python/faster-whisper/requirements-cpu.txt
+++ b/backend/python/faster-whisper/requirements-cpu.txt
@@ -0,0 +1,8 @@
 faster-whisper
 opencv-python
 accelerate
 compel
 peft
 sentencepiece
 torch==2.4.1
 optimum-quanto
--- a/backend/python/faster-whisper/requirements-cublas11.txt
+++ b/backend/python/faster-whisper/requirements-cublas11.txt
@@ -0,0 +1,9 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch==2.4.1+cu118
 faster-whisper
 opencv-python
 accelerate
 compel
 peft
 sentencepiece
 optimum-quanto
--- a/backend/python/faster-whisper/requirements-cublas12.txt
+++ b/backend/python/faster-whisper/requirements-cublas12.txt
@@ -0,0 +1,8 @@
 torch==2.4.1
 faster-whisper
 opencv-python
 accelerate
 compel
 peft
 sentencepiece
 optimum-quanto
--- a/backend/python/faster-whisper/requirements-hipblas.txt
+++ b/backend/python/faster-whisper/requirements-hipblas.txt
@@ -0,0 +1,3 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 torch
 faster-whisper
--- a/backend/python/faster-whisper/requirements-intel.txt
+++ b/backend/python/faster-whisper/requirements-intel.txt
@@ -1,8 +1,6 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
 intel-extension-for-pytorch==2.3.110+xpu
 torch==2.3.1+cxx11.abi
 torchaudio==2.3.1+cxx11.abi
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
-transformers
+faster-whisper
 accelerate
--- a/backend/python/faster-whisper/requirements-l4t.txt
+++ b/backend/python/faster-whisper/requirements-l4t.txt
@@ -0,0 +1,9 @@
 --index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
 torch
 faster-whisper
 opencv-python
 accelerate
 compel
 peft
 sentencepiece
 optimum-quanto
--- a/backend/python/faster-whisper/requirements.txt
+++ b/backend/python/faster-whisper/requirements.txt
@@ -0,0 +1,3 @@
 grpcio==1.70.0
 protobuf
 grpcio-tools
--- a/backend/python/faster-whisper/run.sh
+++ b/backend/python/faster-whisper/run.sh
--- a/backend/python/faster-whisper/test.sh
+++ b/backend/python/faster-whisper/test.sh
--- a/backend/python/kokoro/requirements-l4t.txt
+++ b/backend/python/kokoro/requirements-l4t.txt
@@ -0,0 +1,3 @@
 --index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
 torch
 transformers
--- a/backend/python/kokoro/requirements.txt
+++ b/backend/python/kokoro/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 phonemizer
 scipy
--- a/backend/python/mamba/Makefile
+++ b/backend/python/mamba/Makefile
@@ -1,29 +0,0 @@
 .PHONY: mamba
 mamba: protogen
 	bash install.sh 
 .PHONY: run
 run: protogen
 	@echo "Running mamba..."
 	bash run.sh
 	@echo "mamba run."
 .PHONY: test
 test: protogen
 	@echo "Testing mamba..."
 	bash test.sh
 	@echo "mamba tested."
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 backend_pb2_grpc.py backend_pb2.py:
 	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
 .PHONY: clean
 clean: protogen-clean
 	$(RM) -r venv __pycache__
--- a/backend/python/mamba/README.md
+++ b/backend/python/mamba/README.md
@@ -1,5 +0,0 @@
 # Creating a separate environment for the mamba project
 ```
 make mamba
 ```
--- a/backend/python/mamba/backend.py
+++ b/backend/python/mamba/backend.py
@@ -1,179 +0,0 @@
 #!/usr/bin/env python3
 from concurrent import futures
 import time
 import argparse
 import signal
 import sys
 import os
 import backend_pb2
 import backend_pb2_grpc
 import grpc
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
 MAMBA_CHAT= os.environ.get('MAMBA_CHAT', '1') == '1'
 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
    """
    A gRPC servicer that implements the Backend service defined in backend.proto.
    """
    def generate(self,prompt, max_new_tokens):
        """
        Generates text based on the given prompt and maximum number of new tokens.
        Args:
            prompt (str): The prompt to generate text from.
            max_new_tokens (int): The maximum number of new tokens to generate.
        Returns:
            str: The generated text.
        """
        self.generator.end_beam_search()
        # Tokenizing the input
        ids = self.generator.tokenizer.encode(prompt)
        self.generator.gen_begin_reuse(ids)
        initial_len = self.generator.sequence[0].shape[0]
        has_leading_space = False
        decoded_text = ''
        for i in range(max_new_tokens):
            token = self.generator.gen_single_token()
            if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'):
                has_leading_space = True
            decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:])
            if has_leading_space:
                decoded_text = ' ' + decoded_text
            if token.item() == self.generator.tokenizer.eos_token_id:
                break
        return decoded_text
    def Health(self, request, context):
        """
        Returns a health check message.
        Args:
            request: The health check request.
            context: The gRPC context.
        Returns:
            backend_pb2.Reply: The health check reply.
        """
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
    def LoadModel(self, request, context):
        """
        Loads a language model.
        Args:
            request: The load model request.
            context: The gRPC context.
        Returns:
            backend_pb2.Result: The load model result.
        """
        try:
            tokenizerModel = request.Tokenizer
            if tokenizerModel == "":
                tokenizerModel = request.Model
            tokenizer = AutoTokenizer.from_pretrained(tokenizerModel)
            if MAMBA_CHAT:
                tokenizer.eos_token = "<|endoftext|>"
                tokenizer.pad_token = tokenizer.eos_token
            self.tokenizer = tokenizer
            self.model = MambaLMHeadModel.from_pretrained(request.Model, device="cuda", dtype=torch.float16)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)
    def Predict(self, request, context):
        """
        Generates text based on the given prompt and sampling parameters.
        Args:
            request: The predict request.
            context: The gRPC context.
        Returns:
            backend_pb2.Result: The predict result.
        """
        if request.TopP == 0:
            request.TopP = 0.9
        max_tokens = request.Tokens
        if request.Tokens == 0:
            max_tokens = 2000
        # encoded_input = self.tokenizer(request.Prompt)
        tokens = self.tokenizer(request.Prompt, return_tensors="pt")
        input_ids = tokens.input_ids.to(device="cuda")
        out = self.model.generate(input_ids=input_ids, max_length=max_tokens, temperature=request.Temperature,
                                     top_p=request.TopP, eos_token_id=self.tokenizer.eos_token_id)
        decoded = self.tokenizer.batch_decode(out)
        generated_text = decoded[0]
        # Remove prompt from response if present
        if request.Prompt in generated_text:
            generated_text = generated_text.replace(request.Prompt, "")
        return backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
    def PredictStream(self, request, context):
        """
        Generates text based on the given prompt and sampling parameters, and streams the results.
        Args:
            request: The predict stream request.
            context: The gRPC context.
        Returns:
            backend_pb2.Result: The predict stream result.
        """
        yield self.Predict(request, context)
 def serve(address):
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)
    # Define the signal handler function
    def signal_handler(sig, frame):
        print("Received termination signal. Shutting down...")
        server.stop(0)
        sys.exit(0)
    # Set the signal handlers for SIGINT and SIGTERM
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)
    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the gRPC server.")
    parser.add_argument(
        "--addr", default="localhost:50051", help="The address to bind the server to."
    )
    args = parser.parse_args()
    serve(args.addr)
--- a/backend/python/mamba/install.sh
+++ b/backend/python/mamba/install.sh
@@ -1,9 +0,0 @@
 #!/bin/bash
 set -e
 LIMIT_TARGETS="cublas"
 EXTRA_PIP_INSTALL_FLAGS="--no-build-isolation"
 source $(dirname $0)/../common/libbackend.sh
 installRequirements
--- a/backend/python/mamba/requirements-after.txt
+++ b/backend/python/mamba/requirements-after.txt
@@ -1,2 +0,0 @@
 causal-conv1d==1.4.0
 mamba-ssm==2.2.2
--- a/backend/python/mamba/requirements-cpu.txt
+++ b/backend/python/mamba/requirements-cpu.txt
@@ -1,2 +0,0 @@
 torch==2.4.1
 transformers
--- a/backend/python/mamba/requirements-cublas11.txt
+++ b/backend/python/mamba/requirements-cublas11.txt
@@ -1,3 +0,0 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch==2.4.1+cu118
 transformers
--- a/backend/python/mamba/requirements-cublas12.txt
+++ b/backend/python/mamba/requirements-cublas12.txt
@@ -1,2 +0,0 @@
 torch==2.4.1
 transformers
--- a/backend/python/mamba/requirements-install.txt
+++ b/backend/python/mamba/requirements-install.txt
@@ -1,6 +0,0 @@
 # mamba does not specify its build dependencies per PEP517, so we need to disable build isolation
 # this also means that we need to install the basic build dependencies into the venv ourselves
 # https://github.com/Dao-AILab/causal-conv1d/issues/24
 packaging
 setuptools
 wheel
--- a/backend/python/mamba/requirements.txt
+++ b/backend/python/mamba/requirements.txt
@@ -1,3 +0,0 @@
 grpcio==1.69.0
 protobuf
 certifi
--- a/backend/python/mamba/run.sh
+++ b/backend/python/mamba/run.sh
@@ -1,6 +0,0 @@
 #!/bin/bash
 LIMIT_TARGETS="cublas"
 source $(dirname $0)/../common/libbackend.sh
 startBackend $@
--- a/backend/python/mamba/test.py
+++ b/backend/python/mamba/test.py
@@ -1,76 +0,0 @@
 import unittest
 import subprocess
 import time
 import backend_pb2
 import backend_pb2_grpc
 import grpc
 import unittest
 import subprocess
 import time
 import grpc
 import backend_pb2_grpc
 import backend_pb2
 class TestBackendServicer(unittest.TestCase):
    """
    TestBackendServicer is the class that tests the gRPC service.
    Every test talks to a backend.py process listening on localhost:50051.
    unittest calls setUp before and tearDown after each test method, so the
    tests must not call them again: doing so would spawn a second server on
    the same port and lose the handle of the first one.
    """
    def setUp(self):
        """
        Starts the gRPC server in a subprocess and waits for it to come up.
        """
        self.service = subprocess.Popen(["python", "backend.py", "--addr", "localhost:50051"])
        # Fixed startup delay; nothing here probes the server for readiness.
        time.sleep(10)
    def tearDown(self) -> None:
        """
        Terminates the server subprocess and waits for it to exit.
        """
        self.service.terminate()
        self.service.wait()
    def test_server_startup(self):
        """
        This method tests if the server starts up and answers the Health call
        """
        try:
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.Health(backend_pb2.HealthMessage())
                self.assertEqual(response.message, b'OK')
        except Exception as err:
            print(err)
            self.fail("Server failed to start")
    def test_load_model(self):
        """
        This method tests if the model is loaded successfully
        """
        try:
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
                self.assertTrue(response.success)
                self.assertEqual(response.message, "Model loaded successfully")
        except Exception as err:
            print(err)
            self.fail("LoadModel service failed")
    def test_text(self):
        """
        This method tests if a Predict call returns a message after the model is loaded
        """
        try:
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
                self.assertTrue(response.success)
                req = backend_pb2.PredictOptions(Prompt="The capital of France is")
                resp = stub.Predict(req)
                self.assertIsNotNone(resp.message)
        except Exception as err:
            print(err)
            self.fail("text service failed")
--- a/backend/python/openvoice/backend.py
+++ b/backend/python/openvoice/backend.py
@@ -1,158 +0,0 @@
 #!/usr/bin/env python3
 """
 Extra gRPC server for OpenVoice models.
 """
 from concurrent import futures
 import argparse
 import signal
 import sys
 import os
 import torch
 from openvoice import se_extractor
 from openvoice.api import ToneColorConverter
 from melo.api import TTS
 import time
 import backend_pb2
 import backend_pb2_grpc
 import grpc
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
    """
    A gRPC servicer for the OpenVoice backend.
    This class implements the Health, LoadModel and TTS methods of the backend service.
    """
    def Health(self, request, context):
        """
        A gRPC method that returns the health status of the backend service.
        Args:
            request: The incoming health request; its fields are not read.
            context: A grpc.ServicerContext object that provides information about the RPC.
        Returns:
            A Reply object whose message is the bytes "OK".
        """
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
    def LoadModel(self, request, context):
        """
        A gRPC method that records the model settings used by later TTS calls.
        When request.AudioPath is set, voice cloning is enabled and the tone
        color converter is loaded from "<request.Model>/converter".
        Args:
            request: The load request; Model, ModelFile, Type and AudioPath are read.
            context: A grpc.ServicerContext object that provides information about the RPC.
        Returns:
            A Result with success=True when the settings were stored, or
            success=False carrying the error text if anything raised.
        """
        try:
            self.clonedVoice = False
            # A relative request.AudioPath is resolved against the directory of
            # request.ModelFile; absolute paths are left untouched.
            if request.AudioPath and request.ModelFile != "" and not os.path.isabs(request.AudioPath):
                # get base path of modelFile
                modelFileBase = os.path.dirname(request.ModelFile)
                request.AudioPath = os.path.join(modelFileBase, request.AudioPath)
            if request.AudioPath != "":
                self.clonedVoice = True
            self.modelpath = request.ModelFile
            self.speaker = request.Type
            self.ClonedVoicePath = request.AudioPath
            ckpt_converter = request.Model+'/converter'
            device = "cuda:0" if torch.cuda.is_available() else "cpu"
            self.device = device
            self.tone_color_converter = None
            # The converter is only needed (and only loaded) for voice cloning.
            if self.clonedVoice:
                self.tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
                self.tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)
    def TTS(self, request, context):
        """
        A gRPC method that synthesizes request.text into the file at request.dst.
        When voice cloning was enabled by LoadModel, the generated file is then
        rewritten in place by the tone color converter.
        Args:
            request: The TTS request; model, voice, text and dst are read.
            context: A grpc.ServicerContext object that provides information about the RPC.
        Returns:
            A Result with success=True on completion, or success=False with a
            message when request.model is empty or any step raised.
        """
        if request.model == "":
            return backend_pb2.Result(success=False, message="request.model is required")
        try:
            # Speed is adjustable
            speed = 1.0
            voice = "EN"
            if request.voice:
                voice = request.voice
            # A new melo TTS model is constructed on every call.
            model = TTS(language=voice, device=self.device)
            speaker_ids = model.hps.data.spk2id
            speaker_key = self.speaker
            modelpath = self.modelpath
            for s in speaker_ids.keys():
                print(f"Speaker: {s} - ID: {speaker_ids[s]}")
            speaker_id = speaker_ids[speaker_key]
            speaker_key = speaker_key.lower().replace('_', '-')
            # The source speaker embedding is only consumed by the cloning step,
            # so it is not read from disk for plain TTS requests.
            source_se = None
            if self.clonedVoice:
                source_se = torch.load(f'{modelpath}/base_speakers/ses/{speaker_key}.pth', map_location=self.device)
            model.tts_to_file(request.text, speaker_id, request.dst, speed=speed)
            if self.clonedVoice:
                reference_speaker = self.ClonedVoicePath
                target_se, audio_name = se_extractor.get_se(reference_speaker, self.tone_color_converter, vad=False)
                # Run the tone color converter
                # NOTE(review): "@MyShell" is passed as the converter's message argument,
                # presumably a tag embedded in the output; confirm against OpenVoice.
                encode_message = "@MyShell"
                self.tone_color_converter.convert(
                    audio_src_path=request.dst,
                    src_se=source_se,
                    tgt_se=target_se,
                    output_path=request.dst,
                    message=encode_message)
            print("[OpenVoice] TTS generated!", file=sys.stderr)
            print("[OpenVoice] TTS saved to", request.dst, file=sys.stderr)
            print(request, file=sys.stderr)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(success=True)
 def serve(address):
    """
    Starts the gRPC server on the given address and blocks until interrupted.
    Args:
        address: The address handed to add_insecure_port for the server to listen on.
    """
    pool = futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)
    server = grpc.server(pool)
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print("[OpenVoice] Server started. Listening on: " + address, file=sys.stderr)
    def shutdown(signum, stack):
        # Stop the server immediately and end the process.
        print("[OpenVoice] Received termination signal. Shutting down...")
        server.stop(0)
        sys.exit(0)
    # The same handler serves both SIGINT and SIGTERM.
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, shutdown)
    # Keep the main thread alive; the server handles requests on its worker pool.
    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)
 if __name__ == "__main__":
    # Command-line entry point: read the bind address, log the parsed options, then serve.
    cli = argparse.ArgumentParser(description="Run the gRPC server.")
    cli.add_argument("--addr", default="localhost:50051", help="The address to bind the server to.")
    args = cli.parse_args()
    print(f"[OpenVoice] startup: {args}", file=sys.stderr)
    serve(args.addr)
--- a/backend/python/openvoice/requirements-cpu.txt
+++ b/backend/python/openvoice/requirements-cpu.txt
@@ -1,7 +0,0 @@
 torch==2.4.1
 git+https://github.com/myshell-ai/MeloTTS.git
 git+https://github.com/myshell-ai/OpenVoice.git
 whisper-timestamped
 pydub==0.25.1
 wavmark==0.0.3
 eng_to_ipa==0.0.2
--- a/backend/python/openvoice/requirements-cublas11.txt
+++ b/backend/python/openvoice/requirements-cublas11.txt
@@ -1,8 +0,0 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch==2.4.1+cu118
 git+https://github.com/myshell-ai/MeloTTS.git
 git+https://github.com/myshell-ai/OpenVoice.git
 whisper-timestamped
 pydub==0.25.1
 wavmark==0.0.3
 eng_to_ipa==0.0.2
--- a/backend/python/openvoice/requirements-cublas12.txt
+++ b/backend/python/openvoice/requirements-cublas12.txt
@@ -1,7 +0,0 @@
 torch==2.4.1
 git+https://github.com/myshell-ai/MeloTTS.git
 git+https://github.com/myshell-ai/OpenVoice.git
 whisper-timestamped
 pydub==0.25.1
 wavmark==0.0.3
 eng_to_ipa==0.0.2
--- a/backend/python/openvoice/requirements-hipblas.txt
+++ b/backend/python/openvoice/requirements-hipblas.txt
@@ -1,8 +0,0 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 torch==2.4.1+rocm6.0
 git+https://github.com/myshell-ai/MeloTTS.git
 git+https://github.com/myshell-ai/OpenVoice.git
 whisper-timestamped
 pydub==0.25.1
 wavmark==0.0.3
 eng_to_ipa==0.0.2
--- a/backend/python/openvoice/requirements-intel.txt
+++ b/backend/python/openvoice/requirements-intel.txt
@@ -1,24 +0,0 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
 intel-extension-for-pytorch==2.3.110+xpu
 torch==2.3.1+cxx11.abi
 torchaudio==2.3.1+cxx11.abi
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
 grpcio==1.69.0
 protobuf
 librosa==0.9.1
 faster-whisper==0.9.0
 pydub==0.25.1
 wavmark==0.0.3
 eng_to_ipa==0.0.2
 inflect==7.0.0
 unidecode==1.3.7
 whisper-timestamped==1.14.2
 openai
 python-dotenv
 pypinyin==0.50.0
 cn2an==0.5.22
 jieba==0.42.1
 langid==1.1.6
 git+https://github.com/myshell-ai/MeloTTS.git
 git+https://github.com/myshell-ai/OpenVoice.git
--- a/backend/python/openvoice/requirements.txt
+++ b/backend/python/openvoice/requirements.txt
@@ -1,17 +0,0 @@
 grpcio==1.69.0
 protobuf
 librosa
 faster-whisper
 inflect
 unidecode
 openai
 python-dotenv
 pypinyin
 cn2an==0.5.22
 numpy==1.22.0
 networkx==2.8.8
 jieba==0.42.1
 gradio==5.9.1
 langid==1.1.6
 llvmlite==0.43.0
 setuptools
--- a/backend/python/openvoice/test.py
+++ b/backend/python/openvoice/test.py
@@ -1,82 +0,0 @@
 """
 A test script to test the gRPC service
 """
 import unittest
 import subprocess
 import time
 import backend_pb2
 import backend_pb2_grpc
 import grpc
 class TestBackendServicer(unittest.TestCase):
    """
    TestBackendServicer is the class that tests the gRPC service.
    Every test talks to a backend.py process listening on localhost:50051.
    unittest calls setUp before and tearDown after each test method, so the
    tests must not call them again: doing so would spawn a second server on
    the same port and lose the handle of the first one.
    """
    def setUp(self):
        """
        This method sets up the gRPC service by starting the server
        """
        self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
        # Fixed startup delay; nothing here probes the server for readiness.
        time.sleep(30)
    def tearDown(self) -> None:
        """
        This method tears down the gRPC service by terminating the server
        """
        self.service.terminate()
        self.service.wait()
    def test_server_startup(self):
        """
        This method tests if the server starts up successfully
        """
        try:
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.Health(backend_pb2.HealthMessage())
                self.assertEqual(response.message, b'OK')
        except Exception as err:
            print(err)
            self.fail("Server failed to start")
    def test_load_model(self):
        """
        This method tests if the model is loaded successfully
        """
        try:
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.LoadModel(backend_pb2.ModelOptions(Model="checkpoints_v2",
                                                                    Type="en-us"))
                self.assertTrue(response.success)
                self.assertEqual(response.message, "Model loaded successfully")
        except Exception as err:
            print(err)
            self.fail("LoadModel service failed")
    def test_tts(self):
        """
        This method tests if a TTS call returns a response after the model is loaded
        """
        try:
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.LoadModel(backend_pb2.ModelOptions(Model="dingzhen"))
                self.assertTrue(response.success)
                tts_request = backend_pb2.TTSRequest(text="80s TV news production music hit for tonight's biggest story", voice="EN")
                tts_response = stub.TTS(tts_request)
                self.assertIsNotNone(tts_response)
        except Exception as err:
            print(err)
            self.fail("TTS service failed")
--- a/backend/python/openvoice/test.sh
+++ b/backend/python/openvoice/test.sh
@@ -1,12 +0,0 @@
 #!/bin/bash
 # Runs the OpenVoice backend unit tests, fetching the model checkpoints first if needed.
 set -e
 # Quote the expansions so a checkout path containing spaces is not word-split.
 source "$(dirname "$0")/../common/libbackend.sh"
 # Download checkpoints if not present
 if [ ! -d "checkpoints_v2" ]; then
    wget https://myshell-public-repo-host.s3.amazonaws.com/openvoice/checkpoints_v2_0417.zip -O checkpoints_v2.zip
    unzip checkpoints_v2.zip
 fi
 runUnittests
--- a/backend/python/parler-tts/Makefile
+++ b/backend/python/parler-tts/Makefile
@@ -1,44 +0,0 @@
 # Build, run and test targets for the parler-tts backend.
 export CONDA_ENV_PATH = "parler.yml"
 SKIP_CONDA?=0
 # cublas builds use the NVIDIA-specific environment file.
 ifeq ($(BUILD_TYPE), cublas)
 export CONDA_ENV_PATH = "parler-nvidia.yml"
 endif
 # Intel GPU are supposed to have dependencies installed in the main python
 # environment, so we skip conda installation for SYCL builds.
 # https://github.com/intel/intel-extension-for-pytorch/issues/538
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 export SKIP_CONDA=1
 endif
 .PHONY: parler-tts
 parler-tts:
 	@echo "Installing $(CONDA_ENV_PATH)..."
 	bash install.sh $(CONDA_ENV_PATH)
 	$(MAKE) protogen
 .PHONY: run
 run: protogen
 	@echo "Running parler-tts..."
 	bash run.sh
 	@echo "parler-tts run."
 .PHONY: test
 test: protogen
 	@echo "Testing parler-tts..."
 	bash test.sh
 	@echo "parler-tts tested."
 # protogen depends on the generated gRPC stubs; they are produced by protogen.sh.
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 backend_pb2_grpc.py backend_pb2.py:
 	bash protogen.sh
 .PHONY: clean
 clean: protogen-clean
 	$(RM) -r venv __pycache__
--- a/backend/python/parler-tts/backend.py
+++ b/backend/python/parler-tts/backend.py
@@ -1,125 +0,0 @@
 #!/usr/bin/env python3
 """
 Extra gRPC server for Parler-TTS (ParlerTTSForConditionalGeneration) models.
 """
 from concurrent import futures
 import argparse
 import signal
 import sys
 import os
 import time
 import backend_pb2
 import backend_pb2_grpc
 import grpc
 from scipy.io.wavfile import write as write_wav
 from parler_tts import ParlerTTSForConditionalGeneration
 from transformers import AutoTokenizer
 import soundfile as sf  
 import torch
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
    """
    gRPC servicer exposing Parler-TTS through the backend service interface.
    Provides the Health, LoadModel and TTS calls.
    """
    def Health(self, request, context):
        """
        Reports that the backend is alive.
        Args:
            request: The incoming health request; its fields are not read.
            context: A grpc.ServicerContext object that provides information about the RPC.
        Returns:
            A Reply object whose message is the bytes "OK".
        """
        ok = "OK".encode('utf-8')
        return backend_pb2.Reply(message=ok)
    def LoadModel(self, request, context):
        """
        Loads the Parler-TTS model and its tokenizer named by request.Model.
        Args:
            request: The load request; only Model is read.
            context: A grpc.ServicerContext object that provides information about the RPC.
        Returns:
            A Result with success=True when both were loaded, or success=False
            carrying the error text if loading raised.
        """
        # Use the first CUDA device when torch reports one, otherwise the CPU.
        target = "cuda:0" if torch.cuda.is_available() else "cpu"
        checkpoint = request.Model
        try:
            self.model = ParlerTTSForConditionalGeneration.from_pretrained(checkpoint).to(target)
            self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)
    def TTS(self, request, context):
        """
        Generates speech for request.text and writes it to request.dst.
        request.voice is tokenized and passed to the model as input_ids; when
        it is empty a built-in description of a female speaker is used instead.
        Args:
            request: The TTS request; model, voice, text and dst are read.
            context: A grpc.ServicerContext object that provides information about the RPC.
        Returns:
            A Result with success=True on completion, or success=False with a
            message when request.model is empty or generation raised.
        """
        description = request.voice
        if not description:
            description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
        if request.model == "":
            return backend_pb2.Result(success=False, message="request.model is required")
        try:
            target = "cuda:0" if torch.cuda.is_available() else "cpu"
            # Tokenize the voice description first, then the text to be spoken.
            description_ids = self.tokenizer(description, return_tensors="pt").input_ids.to(target)
            text_ids = self.tokenizer(request.text, return_tensors="pt").input_ids.to(target)
            generated = self.model.generate(input_ids=description_ids, prompt_input_ids=text_ids)
            waveform = generated.cpu().numpy().squeeze()
            print("[parler-tts] TTS generated!", file=sys.stderr)
            sf.write(request.dst, waveform, self.model.config.sampling_rate)
            print("[parler-tts] TTS saved to", request.dst, file=sys.stderr)
            print("[parler-tts] TTS for", file=sys.stderr)
            print(request, file=sys.stderr)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(success=True)
 def serve(address):
    """
    Starts the gRPC server on the given address and blocks until interrupted.
    Args:
        address: The address handed to add_insecure_port for the server to listen on.
    """
    pool = futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)
    server = grpc.server(pool)
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print("[parler-tts] Server started. Listening on: " + address, file=sys.stderr)
    def shutdown(signum, stack):
        # Stop the server immediately and end the process.
        print("[parler-tts] Received termination signal. Shutting down...")
        server.stop(0)
        sys.exit(0)
    # The same handler serves both SIGINT and SIGTERM.
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, shutdown)
    # Keep the main thread alive; the server handles requests on its worker pool.
    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)
 if __name__ == "__main__":
    # Command-line entry point: read the bind address, log the parsed options, then serve.
    cli = argparse.ArgumentParser(description="Run the gRPC server.")
    cli.add_argument("--addr", default="localhost:50051", help="The address to bind the server to.")
    args = cli.parse_args()
    print(f"[parler-tts] startup: {args}", file=sys.stderr)
    serve(args.addr)
--- a/backend/python/parler-tts/install.sh
+++ b/backend/python/parler-tts/install.sh
@@ -1,28 +0,0 @@
 #!/bin/bash
 # Installs the parler-tts Python requirements, then overwrites protobuf's
 # internal builder.py inside the venv with the upstream copy.
 set -e
 # Quote the expansions so a checkout path containing spaces is not word-split.
 source "$(dirname "$0")/../common/libbackend.sh"
 # This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
 # This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
 # We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
 # the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
 if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi
 installRequirements
 # https://github.com/descriptinc/audiotools/issues/101
 # incompatible protobuf versions.
 PYDIR=python3.10
 # NOTE(review): MY_DIR is not set in this script; presumably libbackend.sh defines it. Confirm.
 pyenv="${MY_DIR}/venv/lib/${PYDIR}/site-packages/google/protobuf/internal/"
 if [ ! -d "${pyenv}" ]; then
    echo "(parler-tts/install.sh): Error: ${pyenv} does not exist"
    exit 1
 fi
 # -f makes curl exit non-zero on an HTTP error, so set -e aborts the install
 # instead of leaving an error page saved as builder.py.
 curl -fL https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o "${pyenv}/builder.py"
--- a/backend/python/parler-tts/requirements-after.txt
+++ b/backend/python/parler-tts/requirements-after.txt
@@ -1,4 +0,0 @@
 git+https://github.com/huggingface/parler-tts.git@8e465f1b5fcd223478e07175cb40494d19ffbe17
 llvmlite==0.43.0
 numba==0.60.0
 grpcio-tools==1.42.0
--- a/backend/python/parler-tts/requirements-cpu.txt
+++ b/backend/python/parler-tts/requirements-cpu.txt
@@ -1,3 +0,0 @@
 transformers
 accelerate
 torch==2.4.1
--- a/backend/python/parler-tts/requirements-cublas11.txt
+++ b/backend/python/parler-tts/requirements-cublas11.txt
@@ -1,5 +0,0 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch==2.4.1+cu118
 torchaudio==2.4.1+cu118
 transformers
 accelerate
--- a/backend/python/parler-tts/requirements-cublas12.txt
+++ b/backend/python/parler-tts/requirements-cublas12.txt
@@ -1,4 +0,0 @@
 torch==2.4.1
 torchaudio==2.4.1
 transformers
 accelerate
--- a/backend/python/parler-tts/requirements-hipblas.txt
+++ b/backend/python/parler-tts/requirements-hipblas.txt
@@ -1,5 +0,0 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 torch==2.3.0+rocm6.0
 torchaudio==2.3.0+rocm6.0
 transformers
 accelerate
--- a/backend/python/parler-tts/requirements.txt
+++ b/backend/python/parler-tts/requirements.txt
@@ -1,4 +0,0 @@
 grpcio==1.69.0
 certifi
 llvmlite==0.43.0
 setuptools
--- a/backend/python/parler-tts/run.sh
+++ b/backend/python/parler-tts/run.sh
@@ -1,4 +0,0 @@
 #!/bin/bash
 # Starts the parler-tts backend, forwarding every command-line argument unchanged.
 # Quote the expansions so a checkout path containing spaces is not word-split.
 source "$(dirname "$0")/../common/libbackend.sh"
 # "$@" keeps each argument intact; a bare $@ would re-split arguments containing spaces.
 startBackend "$@"
--- a/backend/python/parler-tts/test.py
+++ b/backend/python/parler-tts/test.py
@@ -1,81 +0,0 @@
 """
 A test script to test the gRPC service
 """
 import unittest
 import subprocess
 import time
 import backend_pb2
 import backend_pb2_grpc
 import grpc
 class TestBackendServicer(unittest.TestCase):
    """
    TestBackendServicer is the class that tests the gRPC service.
    Every test talks to a backend.py process listening on localhost:50051.
    unittest calls setUp before and tearDown after each test method, so the
    tests must not call them again: doing so would spawn a second server on
    the same port and lose the handle of the first one.
    """
    def setUp(self):
        """
        This method sets up the gRPC service by starting the server
        """
        self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
        # Fixed startup delay; nothing here probes the server for readiness.
        time.sleep(10)
    def tearDown(self) -> None:
        """
        This method tears down the gRPC service by terminating the server
        """
        self.service.terminate()
        self.service.wait()
    def test_server_startup(self):
        """
        This method tests if the server starts up successfully
        """
        try:
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.Health(backend_pb2.HealthMessage())
                self.assertEqual(response.message, b'OK')
        except Exception as err:
            print(err)
            self.fail("Server failed to start")
    def test_load_model(self):
        """
        This method tests if the model is loaded successfully
        """
        try:
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.LoadModel(backend_pb2.ModelOptions(Model="parler-tts/parler_tts_mini_v0.1"))
                self.assertTrue(response.success)
                self.assertEqual(response.message, "Model loaded successfully")
        except Exception as err:
            print(err)
            self.fail("LoadModel service failed")
    def test_tts(self):
        """
        This method tests if a TTS call returns a response after the model is loaded
        """
        try:
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.LoadModel(backend_pb2.ModelOptions(Model="parler-tts/parler_tts_mini_v0.1"))
                self.assertTrue(response.success)
                tts_request = backend_pb2.TTSRequest(text="Hey, how are you doing today?")
                tts_response = stub.TTS(tts_request)
                self.assertIsNotNone(tts_response)
        except Exception as err:
            print(err)
            self.fail("TTS service failed")
--- a/backend/python/parler-tts/test.sh
+++ b/backend/python/parler-tts/test.sh
@@ -1,6 +0,0 @@
 #!/bin/bash
 # Runs the parler-tts backend unit tests.
 set -e
 # Quote the expansions so a checkout path containing spaces is not word-split.
 source "$(dirname "$0")/../common/libbackend.sh"
 runUnittests
--- a/backend/python/rerankers/requirements-l4t.txt
+++ b/backend/python/rerankers/requirements-l4t.txt
@@ -0,0 +1,5 @@
 --index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
 transformers
 accelerate
 torch
 rerankers[transformers]
--- a/backend/python/rerankers/requirements.txt
+++ b/backend/python/rerankers/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 certifi
--- a/backend/python/transformers/backend.py
+++ b/backend/python/transformers/backend.py
@@ -21,7 +21,7 @@ import torch.cuda
 XPU=os.environ.get("XPU", "0") == "1"
-from transformers import AutoTokenizer, AutoModel, set_seed, TextIteratorStreamer, StoppingCriteriaList, StopStringCriteria
+from transformers import AutoTokenizer, AutoModel, set_seed, TextIteratorStreamer, StoppingCriteriaList, StopStringCriteria, MambaConfig, MambaForCausalLM
 from transformers import AutoProcessor, MusicgenForConditionalGeneration
 from scipy.io import wavfile
 import outetts
@@ -245,6 +245,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                autoTokenizer = False
                self.model = SentenceTransformer(model_name, trust_remote_code=request.TrustRemoteCode)
                self.SentenceTransformer = True
            elif request.Type == "Mamba":
                autoTokenizer = False
                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                self.model = MambaForCausalLM.from_pretrained(model_name)
            else:
                print("Automodel", file=sys.stderr)
                self.model = AutoModel.from_pretrained(model_name, 
--- a/backend/python/transformers/requirements-cpu.txt
+++ b/backend/python/transformers/requirements-cpu.txt
@@ -5,4 +5,4 @@ accelerate
 transformers
 bitsandbytes
 outetts
-sentence-transformers==3.3.1
+sentence-transformers==3.4.1
--- a/backend/python/transformers/requirements-cublas11.txt
+++ b/backend/python/transformers/requirements-cublas11.txt
@@ -6,4 +6,4 @@ accelerate
 transformers
 bitsandbytes
 outetts
-sentence-transformers==3.3.1
+sentence-transformers==3.4.1
--- a/backend/python/transformers/requirements-cublas12.txt
+++ b/backend/python/transformers/requirements-cublas12.txt
@@ -5,4 +5,4 @@ numba==0.60.0
 transformers
 bitsandbytes
 outetts
-sentence-transformers==3.3.1
+sentence-transformers==3.4.1
--- a/backend/python/transformers/requirements-hipblas.txt
+++ b/backend/python/transformers/requirements-hipblas.txt
@@ -7,4 +7,4 @@ numba==0.60.0
 bitsandbytes
 outetts
 bitsandbytes
-sentence-transformers==3.3.1
+sentence-transformers==3.4.1
--- a/backend/python/transformers/requirements-intel.txt
+++ b/backend/python/transformers/requirements-intel.txt
@@ -8,4 +8,4 @@ numba==0.60.0
 intel-extension-for-transformers
 bitsandbytes
 outetts
-sentence-transformers==3.3.1
+sentence-transformers==3.4.1
--- a/backend/python/transformers/requirements-l4t.txt
+++ b/backend/python/transformers/requirements-l4t.txt
@@ -0,0 +1,9 @@
 --index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
 torch
 accelerate
 llvmlite==0.43.0
 numba==0.60.0
 transformers
 bitsandbytes
 outetts
 sentence-transformers==3.4.1
--- a/backend/python/transformers/requirements.txt
+++ b/backend/python/transformers/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 certifi
 setuptools
--- a/backend/python/vllm/requirements-after.txt
+++ b/backend/python/vllm/requirements-after.txt
@@ -1 +0,0 @@
 vllm
--- a/backend/python/vllm/requirements-cpu.txt
+++ b/backend/python/vllm/requirements-cpu.txt
@@ -1,3 +1,4 @@
 accelerate
 torch==2.4.1
-transformers
+transformers
 vllm
--- a/backend/python/vllm/requirements-cublas11.txt
+++ b/backend/python/vllm/requirements-cublas11.txt
@@ -2,4 +2,5 @@
 accelerate
 torch==2.4.1+cu118
 transformers
-bitsandbytes
+bitsandbytes
 vllm
--- a/backend/python/vllm/requirements-cublas12.txt
+++ b/backend/python/vllm/requirements-cublas12.txt
@@ -1,4 +1,5 @@
 accelerate
 torch==2.4.1
 transformers
-bitsandbytes
+bitsandbytes
 vllm
--- a/backend/python/vllm/requirements-hipblas.txt
+++ b/backend/python/vllm/requirements-hipblas.txt
@@ -2,4 +2,5 @@
 accelerate
 torch==2.4.1+rocm6.0
 transformers
-bitsandbytes
+bitsandbytes
 vllm
--- a/backend/python/vllm/requirements-intel.txt
+++ b/backend/python/vllm/requirements-intel.txt
@@ -6,4 +6,5 @@ transformers
 optimum[openvino]
 setuptools
 bitsandbytes
-oneccl_bind_pt==2.3.100+xpu
+oneccl_bind_pt==2.3.100+xpu
 vllm
--- a/backend/python/vllm/requirements-l4t.txt
+++ b/backend/python/vllm/requirements-l4t.txt
@@ -0,0 +1,7 @@
 --index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
 accelerate
 torch
 vllm
 transformers
 bitsandbytes
 flash-attn
--- a/backend/python/vllm/requirements.txt
+++ b/backend/python/vllm/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 certifi
 setuptools
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -62,7 +62,7 @@ func New(opts ...config.AppOption) (*Application, error) {
 		}
 	}
-	if err := pkgStartup.InstallModels(options.Galleries, options.ModelLibraryURL, options.ModelPath, options.EnforcePredownloadScans, nil, options.ModelsURL...); err != nil {
+	if err := pkgStartup.InstallModels(options.Galleries, options.ModelPath, options.EnforcePredownloadScans, nil, options.ModelsURL...); err != nil {
 		log.Error().Err(err).Msg("error installing models")
 	}
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -118,9 +118,19 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		nGPULayers = *c.NGPULayers
 	}
+	triggers := make([]*pb.GrammarTrigger, 0)
+	for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers {
+		triggers = append(triggers, &pb.GrammarTrigger{
+			Word:    t.Word,
+			AtStart: t.AtStart,
+		})
+	}
 	return &pb.ModelOptions{
 		CUDA:                 c.CUDA || c.Diffusers.CUDA,
 		SchedulerType:        c.Diffusers.SchedulerType,
+		GrammarTriggers:      triggers,
 		PipelineType:         c.Diffusers.PipelineType,
 		CFGScale:             c.CFGScale,
 		LoraAdapter:          c.LoraAdapter,
--- a/core/backend/tokenize.go
+++ b/core/backend/tokenize.go
@@ -16,12 +16,7 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac
 	opts := ModelOptions(backendConfig, appConfig, model.WithModel(modelFile))
-	if backendConfig.Backend == "" {
+	inferenceModel, err = loader.Load(opts...)
-		inferenceModel, err = loader.Load(opts...)
-	} else {
-		opts = append(opts, model.WithBackendString(backendConfig.Backend))
-		inferenceModel, err = loader.Load(opts...)
-	}
 	if err != nil {
 		return schema.TokenizeResponse{}, err
 	}
@@ -35,6 +30,10 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac
 		return schema.TokenizeResponse{}, err
 	}
+	if resp.Tokens == nil {
+		resp.Tokens = make([]int32, 0)
+	}
 	return schema.TokenizeResponse{
 		Tokens: resp.Tokens,
 	}, nil
--- a/core/cli/models.go
+++ b/core/cli/models.go
@@ -100,7 +100,7 @@ func (mi *ModelsInstall) Run(ctx *cliContext.Context) error {
 			log.Info().Str("model", modelName).Str("license", model.License).Msg("installing model")
 		}
-		err = startup.InstallModels(galleries, "", mi.ModelsPath, !mi.DisablePredownloadScan, progressCallback, modelName)
+		err = startup.InstallModels(galleries, mi.ModelsPath, !mi.DisablePredownloadScan, progressCallback, modelName)
 		if err != nil {
 			return err
 		}
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -32,7 +32,6 @@ type RunCMD struct {
 	Galleries           string   `env:"LOCALAI_GALLERIES,GALLERIES" help:"JSON list of galleries" group:"models" default:"${galleries}"`
 	AutoloadGalleries   bool     `env:"LOCALAI_AUTOLOAD_GALLERIES,AUTOLOAD_GALLERIES" group:"models"`
-	RemoteLibrary       string   `env:"LOCALAI_REMOTE_LIBRARY,REMOTE_LIBRARY" default:"${remoteLibraryURL}" help:"A LocalAI remote library URL" group:"models"`
 	PreloadModels       string   `env:"LOCALAI_PRELOAD_MODELS,PRELOAD_MODELS" help:"A List of models to apply in JSON at start" group:"models"`
 	Models              []string `env:"LOCALAI_MODELS,MODELS" help:"A List of model configuration URLs to load" group:"models"`
 	PreloadModelsConfig string   `env:"LOCALAI_PRELOAD_MODELS_CONFIG,PRELOAD_MODELS_CONFIG" help:"A List of models to apply at startup. Path to a YAML config file" group:"models"`
@@ -90,7 +89,6 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 		config.WithDynamicConfigDirPollInterval(r.LocalaiConfigDirPollInterval),
 		config.WithF16(r.F16),
 		config.WithStringGalleries(r.Galleries),
-		config.WithModelLibraryURL(r.RemoteLibrary),
 		config.WithCors(r.CORS),
 		config.WithCorsAllowOrigins(r.CORSAllowOrigins),
 		config.WithCsrf(r.CSRF),
--- a/Show More
+++ b/Show More
		`@@ -0,0 +1,2 @@`
							`--index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/`
							`torch`