ci: do not tag latest on AIO automatically

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Update quickstart.md
2026-02-03 19:22:39 -05:00 · 2024-05-24 09:41:13 +02:00 · 2024-05-24 09:02:59 +02:00 · 2024-05-23 23:51:34 +02:00 · 2024-05-23 21:02:13 +00:00 · 2024-05-23 22:48:12 +02:00
304 changed files with 27603 additions and 3009 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -6,6 +6,11 @@ examples/chatbot-ui/models
 examples/rwkv/models
 examples/**/models
 Dockerfile*
+__pycache__

 # SonarQube
-.scannerwork
+.scannerwork
+
+# backend virtual environments
+**/venv
+backend/python/**/source
--- a/.env
+++ b/.env
@@ -71,6 +71,11 @@
 ### Define the number of parallel LLAMA.cpp workers (Defaults to 1)
 # LLAMACPP_PARALLEL=1

+### Define a list of GRPC Servers for llama-cpp workers to distribute the load
+# https://github.com/ggerganov/llama.cpp/pull/6829
+# https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md
+# LLAMACPP_GRPC_SERVERS=""
+
 ### Enable to run parallel requests
 # LOCALAI_PARALLEL_REQUESTS=true

--- a/.github/checksum_checker.sh
+++ b/.github/checksum_checker.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+# This scripts needs yq and huggingface_hub to be installed
+# to install hugingface_hub run pip install huggingface_hub
+
+# Path to the input YAML file
+input_yaml=$1
+
+# Function to download file and check checksum using Python
+function check_and_update_checksum() {
+    model_name="$1"
+    file_name="$2"
+    uri="$3"
+    old_checksum="$4"
+    idx="$5"
+
+    # Download the file and calculate new checksum using Python
+    new_checksum=$(python3 -c "
+import hashlib
+from huggingface_hub import hf_hub_download, get_paths_info
+import requests
+import sys
+import os
+
+uri = '$uri'
+file_name = uri.split('/')[-1]
+
+# Function to parse the URI and determine download method
+# Function to parse the URI and determine download method
+def parse_uri(uri):
+    if uri.startswith('huggingface://'):
+        repo_id = uri.split('://')[1]
+        return 'huggingface', repo_id.rsplit('/', 1)[0]
+    elif 'huggingface.co' in uri:
+        parts = uri.split('/resolve/')
+        if len(parts) > 1:
+            repo_path = parts[0].split('https://huggingface.co/')[-1]
+            return 'huggingface', repo_path
+    return 'direct', uri
+
+def calculate_sha256(file_path):
+    sha256_hash = hashlib.sha256()
+    with open(file_path, 'rb') as f:
+        for byte_block in iter(lambda: f.read(4096), b''):
+            sha256_hash.update(byte_block)
+    return sha256_hash.hexdigest()
+
+download_type, repo_id_or_url = parse_uri(uri)
+
+new_checksum =  None
+
+# Decide download method based on URI type
+if download_type == 'huggingface':
+    # Use HF API to pull sha
+    for file in get_paths_info(repo_id_or_url, [file_name], repo_type='model'):
+        try:
+            new_checksum = file.lfs.sha256
+            break
+        except Exception as e:
+            print(f'Error from Hugging Face Hub: {str(e)}', file=sys.stderr)
+            sys.exit(2)
+    if new_checksum is None:
+        try:
+            file_path = hf_hub_download(repo_id=repo_id_or_url, filename=file_name)
+        except Exception as e:
+            print(f'Error from Hugging Face Hub: {str(e)}', file=sys.stderr)
+            sys.exit(2)
+else:
+    response = requests.get(repo_id_or_url)
+    if response.status_code == 200:
+        with open(file_name, 'wb') as f:
+            f.write(response.content)
+        file_path = file_name
+    elif response.status_code == 404:
+        print(f'File not found: {response.status_code}', file=sys.stderr)
+        sys.exit(2)
+    else:
+        print(f'Error downloading file: {response.status_code}', file=sys.stderr)
+        sys.exit(1)
+
+if new_checksum is None:
+    new_checksum = calculate_sha256(file_path)
+    print(new_checksum)
+    os.remove(file_path)
+else:
+    print(new_checksum)
+
+")
+
+    if [[ "$new_checksum" == "" ]]; then
+        echo "Error calculating checksum for $file_name. Skipping..."
+        return
+    fi
+
+    echo "Checksum for $file_name: $new_checksum"
+
+    # Compare and update the YAML file if checksums do not match
+    result=$?
+    if [[ $result -eq 2 ]]; then
+        echo "File not found, deleting entry for $file_name..."
+        # yq eval -i "del(.[$idx].files[] | select(.filename == \"$file_name\"))" "$input_yaml"
+    elif [[ "$old_checksum" != "$new_checksum" ]]; then
+        echo "Checksum mismatch for $file_name. Updating..."
+        yq eval -i "del(.[$idx].files[] | select(.filename == \"$file_name\").sha256)" "$input_yaml"
+        yq eval -i "(.[$idx].files[] | select(.filename == \"$file_name\")).sha256 = \"$new_checksum\"" "$input_yaml"
+    elif [[ $result -ne 0 ]]; then
+        echo "Error downloading file $file_name. Skipping..."
+    else
+        echo "Checksum match for $file_name. No update needed."
+    fi
+}
+
+# Read the YAML and process each file
+len=$(yq eval '. | length' "$input_yaml")
+for ((i=0; i<$len; i++))
+do
+    name=$(yq eval ".[$i].name" "$input_yaml")
+    files_len=$(yq eval ".[$i].files | length" "$input_yaml")
+    for ((j=0; j<$files_len; j++))
+    do
+        filename=$(yq eval ".[$i].files[$j].filename" "$input_yaml")
+        uri=$(yq eval ".[$i].files[$j].uri" "$input_yaml")
+        checksum=$(yq eval ".[$i].files[$j].sha256" "$input_yaml")
+        echo "Checking model $name, file $filename. URI = $uri, Checksum = $checksum"
+        check_and_update_checksum "$name" "$filename" "$uri" "$checksum" "$i"
+    done
+done
--- a/.github/workflows/checksum_checker.yaml
+++ b/.github/workflows/checksum_checker.yaml
@@ -0,0 +1,47 @@
+name: Check if checksums are up-to-date
+on:
+  schedule:
+    - cron: 0 20 * * *
+  workflow_dispatch:
+jobs:
+  checksum_check:
+    runs-on: arc-runner-set
+    steps:
+      - name: Force Install GIT latest
+        run: |
+          sudo apt-get update \
+          && sudo apt-get install -y software-properties-common \
+          && sudo apt-get update \
+          && sudo add-apt-repository -y ppa:git-core/ppa \
+          && sudo apt-get update \
+          && sudo apt-get install -y git
+      - uses: actions/checkout@v4
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y pip wget
+          sudo pip install --upgrade pip 
+          pip install huggingface_hub
+      - name: 'Setup yq'
+        uses: dcarbone/install-yq-action@v1.1.1
+        with:
+          version: 'v4.43.1'
+          download-compressed: true
+          force: true
+
+      - name: Checksum checker 🔧
+        run: |
+          export HF_HOME=/hf_cache
+          sudo mkdir /hf_cache
+          sudo chmod 777 /hf_cache
+          bash .github/checksum_checker.sh gallery/index.yaml
+      - name: Create Pull Request
+        uses: peter-evans/create-pull-request@v6
+        with:
+          token: ${{ secrets.UPDATE_BOT_TOKEN }}
+          push-to-fork: ci-forks/LocalAI
+          commit-message: ':arrow_up: Checksum updates in gallery/index.yaml'
+          title: 'models(gallery): :arrow_up: update checksum'
+          branch: "update/checksum"
+          body: Updating checksums in gallery/index.yaml
+          signoff: true
--- a/.github/workflows/generate_grpc_cache.yaml
+++ b/.github/workflows/generate_grpc_cache.yaml
@@ -84,7 +84,7 @@ jobs:
          build-args: |
            GRPC_BASE_IMAGE=${{ matrix.grpc-base-image }}
            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
-            GRPC_VERSION=v1.63.0
+            GRPC_VERSION=v1.64.0
          context: .
          file: ./Dockerfile
          cache-to: type=gha,ignore-error=true
--- a/.github/workflows/generate_intel_image.yaml
+++ b/.github/workflows/generate_intel_image.yaml
@@ -0,0 +1,59 @@
+name: 'generate and publish intel docker caches'
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - master
+
+concurrency:
+  group: intel-cache-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
+jobs:
+  generate_caches:
+    strategy:
+      matrix:
+        include:
+          - base-image: intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04
+            runs-on: 'ubuntu-latest'
+            platforms: 'linux/amd64'
+    runs-on: ${{matrix.runs-on}}
+    steps:
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@master
+        with:
+          platforms: all
+      - name: Login to DockerHub
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+
+      - name: Login to quay
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          registry: quay.io
+          username: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+          password: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@master
+
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Cache Intel images
+        uses: docker/build-push-action@v5
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          build-args: |
+            BASE_IMAGE=${{ matrix.base-image }}
+          context: .
+          file: ./Dockerfile
+          tags: quay.io/go-skynet/intel-oneapi-base:latest
+          push: true
+          target: intel
+          platforms: ${{ matrix.platforms }}
--- a/.github/workflows/image-pr.yml
+++ b/.github/workflows/image-pr.yml
@@ -61,14 +61,14 @@ jobs:
            tag-suffix: '-hipblas'
            ffmpeg: 'false'
            image-type: 'extras'
-            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            grpc-base-image: "ubuntu:22.04"
            tag-suffix: 'sycl-f16-ffmpeg'
            ffmpeg: 'true'
@@ -110,7 +110,7 @@ jobs:
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            grpc-base-image: "ubuntu:22.04"
            tag-suffix: 'sycl-f16-ffmpeg-core'
            ffmpeg: 'true'
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -129,7 +129,7 @@ jobs:
            ffmpeg: 'true'
            image-type: 'extras'
            aio: "-aio-gpu-hipblas"
-            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            latest-image: 'latest-gpu-hipblas'
            latest-image-aio: 'latest-aio-gpu-hipblas'
@@ -141,14 +141,14 @@ jobs:
            tag-suffix: '-hipblas'
            ffmpeg: 'false'
            image-type: 'extras'
-            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
-            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f16-ffmpeg'
            ffmpeg: 'true'
@@ -161,7 +161,7 @@ jobs:
          - build-type: 'sycl_f32'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
-            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f32-ffmpeg'
            ffmpeg: 'true'
@@ -175,7 +175,7 @@ jobs:
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f16-core'
            ffmpeg: 'false'
@@ -185,7 +185,7 @@ jobs:
          - build-type: 'sycl_f32'
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f32-core'
            ffmpeg: 'false'
@@ -195,7 +195,7 @@ jobs:
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f16-ffmpeg-core'
            ffmpeg: 'true'
@@ -205,7 +205,7 @@ jobs:
          - build-type: 'sycl_f32'
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f32-ffmpeg-core'
            ffmpeg: 'true'
@@ -218,7 +218,7 @@ jobs:
            tag-suffix: '-hipblas-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
-            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
@@ -228,7 +228,7 @@ jobs:
            tag-suffix: '-hipblas-core'
            ffmpeg: 'false'
            image-type: 'core'
-            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
--- a/.github/workflows/image_build.yml
+++ b/.github/workflows/image_build.yml
@@ -174,7 +174,6 @@ jobs:
            type=ref,event=branch
            type=semver,pattern={{raw}}
          flavor: |
-            latest=${{ inputs.tag-latest }}
            suffix=${{ inputs.aio }}

      - name: Set up QEMU
@@ -218,7 +217,7 @@ jobs:
            BASE_IMAGE=${{ inputs.base-image }}
            GRPC_BASE_IMAGE=${{ inputs.grpc-base-image || inputs.base-image }}
            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
-            GRPC_VERSION=v1.63.0
+            GRPC_VERSION=v1.64.0
            MAKEFLAGS=${{ inputs.makeflags }}
          context: .
          file: ./Dockerfile
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -1,11 +1,11 @@
 name: Build and Release

-on: 
+on:
 - push
 - pull_request

 env:
-  GRPC_VERSION: v1.63.0
+  GRPC_VERSION: v1.64.0

 permissions:
  contents: write
@@ -16,19 +16,6 @@ concurrency:

 jobs:
  build-linux:
-    strategy:
-      matrix:
-        include:
-          - build: 'avx2'
-            defines: ''
-          - build: 'avx'
-            defines: '-DLLAMA_AVX2=OFF'
-          - build: 'avx512'
-            defines: '-DLLAMA_AVX512=ON'
-          - build: 'cuda12'
-            defines: ''
-          - build: 'cuda11'
-            defines: ''
    runs-on: ubuntu-latest
    steps:
      - name: Clone
@@ -42,19 +29,15 @@ jobs:
      - name: Dependencies
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg protobuf-compiler
+          sudo apt-get install build-essential ffmpeg protobuf-compiler ccache
      - name: Install CUDA Dependencies
-        if: ${{ matrix.build == 'cuda12' || matrix.build == 'cuda11' }}
        run: |
-          if [ "${{ matrix.build }}" == "cuda12" ]; then
-            export CUDA_VERSION=12-3
-          else
-            export CUDA_VERSION=11-7
-          fi
          curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
          sudo dpkg -i cuda-keyring_1.1-1_all.deb
          sudo apt-get update
          sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
+        env:
+          CUDA_VERSION: 12-3
      - name: Cache grpc
        id: cache-grpc
        uses: actions/cache@v4
@@ -73,23 +56,15 @@ jobs:
          cd grpc && cd cmake/build && sudo make --jobs 5 --output-sync=target install
      - name: Build
        id: build
-        env:
-          CMAKE_ARGS: "${{ matrix.defines }}"
-          BUILD_ID: "${{ matrix.build }}"
        run: |
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
          export PATH=$PATH:$GOPATH/bin
-          if [ "${{ matrix.build }}" == "cuda12" ] || [ "${{ matrix.build }}" == "cuda11" ]; then
-            export BUILD_TYPE=cublas
-            export PATH=/usr/local/cuda/bin:$PATH
-            make dist
-          else
-            STATIC=true make dist
-          fi
+          export PATH=/usr/local/cuda/bin:$PATH
+          GO_TAGS=p2p make dist
      - uses: actions/upload-artifact@v4
        with:
-          name: LocalAI-linux-${{ matrix.build }}
+          name: LocalAI-linux
          path: release/
      - name: Release
        uses: softprops/action-gh-release@v2
@@ -111,7 +86,7 @@ jobs:
          cache: false
      - name: Dependencies
        run: |
-          sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler
+          sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
      - name: Build stablediffusion
@@ -119,68 +94,14 @@ jobs:
          export PATH=$PATH:$GOPATH/bin
          make backend-assets/grpc/stablediffusion
          mkdir -p release && cp backend-assets/grpc/stablediffusion release
+        env:
+          GO_TAGS: stablediffusion
      - uses: actions/upload-artifact@v4
        with:
          name: stablediffusion
          path: release/

-  build-macOS:
-    strategy:
-      matrix:
-        include:
-          - build: 'avx2'
-            defines: ''
-          - build: 'avx'
-            defines: '-DLLAMA_AVX2=OFF'
-          - build: 'avx512'
-            defines: '-DLLAMA_AVX512=ON'
-    runs-on: macOS-latest
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-        with:
-          submodules: true
-      - uses: actions/setup-go@v5
-        with:
-          go-version: '1.21.x'
-          cache: false
-      - name: Dependencies
-        run: |
-          brew install protobuf grpc
-          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
-          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
-      - name: Build
-        id: build
-        env:
-          CMAKE_ARGS: "${{ matrix.defines }}"
-          BUILD_ID: "${{ matrix.build }}"
-        run: |
-          export C_INCLUDE_PATH=/usr/local/include
-          export CPLUS_INCLUDE_PATH=/usr/local/include
-          export PATH=$PATH:$GOPATH/bin
-          make dist
-      - uses: actions/upload-artifact@v4
-        with:
-          name: LocalAI-MacOS-${{ matrix.build }}
-          path: release/
-      - name: Release
-        uses: softprops/action-gh-release@v2
-        if: startsWith(github.ref, 'refs/tags/')
-        with:
-          files: |
-            release/*
-
-
  build-macOS-arm64:
-    strategy:
-      matrix:
-        include:
-          - build: 'avx2'
-            defines: ''
-          - build: 'avx'
-            defines: '-DLLAMA_AVX2=OFF'
-          - build: 'avx512'
-            defines: '-DLLAMA_AVX512=ON'
    runs-on: macos-14
    steps:
      - name: Clone
@@ -198,17 +119,14 @@ jobs:
          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
      - name: Build
        id: build
-        env:
-          CMAKE_ARGS: "${{ matrix.defines }}"
-          BUILD_ID: "${{ matrix.build }}"
        run: |
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
          export PATH=$PATH:$GOPATH/bin
-          make dist
+          GO_TAGS=p2p make dist
      - uses: actions/upload-artifact@v4
        with:
-          name: LocalAI-MacOS-arm64-${{ matrix.build }}
+          name: LocalAI-MacOS-arm64
          path: release/
      - name: Release
        uses: softprops/action-gh-release@v2
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -25,22 +25,14 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
-          pip install --user grpcio-tools==1.63.0
+          pip install --user grpcio-tools==1.64.0
          
-          sudo rm -rfv /usr/bin/conda || true
-
      - name: Test transformers
        run: |
-           export PATH=$PATH:/opt/conda/bin
           make --jobs=5 --output-sync=target -C backend/python/transformers
           make --jobs=5 --output-sync=target -C backend/python/transformers test

@@ -55,22 +47,14 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
-          pip install --user grpcio-tools==1.63.0
+          pip install --user grpcio-tools==1.64.0
          
-          sudo rm -rfv /usr/bin/conda || true
-
      - name: Test sentencetransformers
        run: |
-           export PATH=$PATH:/opt/conda/bin
           make --jobs=5 --output-sync=target -C backend/python/sentencetransformers
           make --jobs=5 --output-sync=target -C backend/python/sentencetransformers test

@@ -86,22 +70,14 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
-          pip install --user grpcio-tools==1.63.0
-          
-          sudo rm -rfv /usr/bin/conda || true
+          pip install --user grpcio-tools==1.64.0

      - name: Test rerankers
        run: |
-           export PATH=$PATH:/opt/conda/bin
           make --jobs=5 --output-sync=target -C backend/python/rerankers
           make --jobs=5 --output-sync=target -C backend/python/rerankers test

@@ -115,25 +91,16 @@ jobs:
      - name: Dependencies
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
+          sudo apt-get install -y build-essential ffmpeg
          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
-          pip install --user grpcio-tools==1.63.0
-          
-          sudo rm -rfv /usr/bin/conda || true
-
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          pip install --user grpcio-tools==1.64.0
      - name: Test diffusers
        run: |
-           export PATH=$PATH:/opt/conda/bin
-           make --jobs=5 --output-sync=target -C backend/python/diffusers
-           make --jobs=5 --output-sync=target -C backend/python/diffusers test
+          make --jobs=5 --output-sync=target -C backend/python/diffusers
+          make --jobs=5 --output-sync=target -C backend/python/diffusers test

  tests-parler-tts:
    runs-on: ubuntu-latest
@@ -146,24 +113,38 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
-          pip install --user grpcio-tools==1.63.0
-          
-          sudo rm -rfv /usr/bin/conda || true
+          pip install --user grpcio-tools==1.64.0

      - name: Test parler-tts
        run: |
-           export PATH=$PATH:/opt/conda/bin
           make --jobs=5 --output-sync=target -C backend/python/parler-tts
           make --jobs=5 --output-sync=target -C backend/python/parler-tts test
+  
+  tests-openvoice:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools==1.64.0
+
+      - name: Test openvoice
+        run: |
+           make --jobs=5 --output-sync=target -C backend/python/openvoice
+           make --jobs=5 --output-sync=target -C backend/python/openvoice test

  tests-transformers-musicgen:
    runs-on: ubuntu-latest
@@ -176,22 +157,14 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
-          pip install --user grpcio-tools==1.63.0
-          
-          sudo rm -rfv /usr/bin/conda || true
+          pip install --user grpcio-tools==1.64.0

      - name: Test transformers-musicgen
        run: |
-           export PATH=$PATH:/opt/conda/bin
           make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen
           make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test

@@ -208,22 +181,14 @@ jobs:
  #       run: |
  #         sudo apt-get update
  #         sudo apt-get install build-essential ffmpeg
-  #         curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-  #            sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-  #             gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-  #            sudo apt-get update && \
-  #            sudo apt-get install -y conda
+  #         # Install UV
+  #         curl -LsSf https://astral.sh/uv/install.sh | sh
  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip
  #         sudo apt-get install -y libopencv-dev
-  #         pip install --user grpcio-tools==1.63.0
-          
-  #         sudo rm -rfv /usr/bin/conda || true
+  #         pip install --user grpcio-tools==1.64.0

  #     - name: Test petals
  #       run: |
-  #          export PATH=$PATH:/opt/conda/bin
  #          make --jobs=5 --output-sync=target -C backend/python/petals
  #          make --jobs=5 --output-sync=target -C backend/python/petals test

@@ -280,22 +245,14 @@ jobs:
  #       run: |
  #         sudo apt-get update
  #         sudo apt-get install build-essential ffmpeg
-  #         curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-  #            sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-  #             gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-  #            sudo apt-get update && \
-  #            sudo apt-get install -y conda
+  #         # Install UV
+  #         curl -LsSf https://astral.sh/uv/install.sh | sh
  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip
  #         sudo apt-get install -y libopencv-dev
-  #         pip install --user grpcio-tools==1.63.0
-          
-  #         sudo rm -rfv /usr/bin/conda || true
+  #         pip install --user grpcio-tools==1.64.0

  #     - name: Test bark
  #       run: |
-  #          export PATH=$PATH:/opt/conda/bin
  #          make --jobs=5 --output-sync=target -C backend/python/bark
  #          make --jobs=5 --output-sync=target -C backend/python/bark test

@@ -313,20 +270,13 @@ jobs:
  #       run: |
  #         sudo apt-get update
  #         sudo apt-get install build-essential ffmpeg
-  #         curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-  #            sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-  #             gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-  #            sudo apt-get update && \
-  #            sudo apt-get install -y conda
+  #         # Install UV
+  #         curl -LsSf https://astral.sh/uv/install.sh | sh
  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip
  #         sudo apt-get install -y libopencv-dev
-  #         pip install --user grpcio-tools==1.63.0
-  #         sudo rm -rfv /usr/bin/conda || true
+  #         pip install --user grpcio-tools==1.64.0
  #     - name: Test vllm
  #       run: |
-  #          export PATH=$PATH:/opt/conda/bin
  #          make --jobs=5 --output-sync=target -C backend/python/vllm
  #          make --jobs=5 --output-sync=target -C backend/python/vllm test
  tests-vallex:
@@ -340,20 +290,13 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
-          pip install --user grpcio-tools==1.63.0
-          sudo rm -rfv /usr/bin/conda || true
+          pip install --user grpcio-tools==1.64.0
      - name: Test vall-e-x
        run: |
-           export PATH=$PATH:/opt/conda/bin
           make --jobs=5 --output-sync=target -C backend/python/vall-e-x
           make --jobs=5 --output-sync=target -C backend/python/vall-e-x test

@@ -368,19 +311,11 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
          sudo apt-get install -y ca-certificates cmake curl patch espeak espeak-ng python3-pip
-          pip install --user grpcio-tools==1.63.0
-          sudo rm -rfv /usr/bin/conda || true
-
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          pip install --user grpcio-tools==1.64.0
      - name: Test coqui
        run: |
-           export PATH=$PATH:/opt/conda/bin
-           make --jobs=5 --output-sync=target -C backend/python/coqui
-           make --jobs=5 --output-sync=target -C backend/python/coqui test
+          make --jobs=5 --output-sync=target -C backend/python/coqui
+          make --jobs=5 --output-sync=target -C backend/python/coqui test
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -10,7 +10,7 @@ on:
      - '*'

 env:
-  GRPC_VERSION: v1.63.0
+  GRPC_VERSION: v1.64.0

 concurrency:
  group: ci-tests-${{ github.head_ref || github.ref }}-${{ github.repository }}
@@ -57,7 +57,7 @@ jobs:
          df -h
      - name: Clone
        uses: actions/checkout@v4
-        with: 
+        with:
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
        uses: actions/setup-go@v5
@@ -78,6 +78,8 @@ jobs:
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
          sudo apt-get install -y ca-certificates cmake patch python3-pip unzip
          sudo apt-get install -y libopencv-dev

@@ -85,6 +87,12 @@ jobs:
          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
          rm protoc.zip

+          curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+          sudo dpkg -i cuda-keyring_1.1-1_all.deb
+          sudo apt-get update
+          sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
+          export CUDACXX=/usr/local/cuda/bin/nvcc
+
          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest

@@ -100,6 +108,8 @@ jobs:
          sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
          # Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
          PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
+        env:
+          CUDA_VERSION: 12-3
      - name: Cache grpc
        id: cache-grpc
        uses: actions/cache@v4
@@ -164,7 +174,7 @@ jobs:
          df -h
      - name: Clone
        uses: actions/checkout@v4
-        with: 
+        with:
          submodules: true
      - name: Build images
        run: |
@@ -190,7 +200,7 @@ jobs:
    steps:
      - name: Clone
        uses: actions/checkout@v4
-        with: 
+        with:
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
        uses: actions/setup-go@v5
@@ -203,7 +213,7 @@ jobs:
      - name: Dependencies
        run: |
          brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc
-          pip install --user grpcio-tools==1.63.0
+          pip install --user grpcio-tools==1.64.0
      - name: Test
        run: |
          export C_INCLUDE_PATH=/usr/local/include
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,9 @@ get-sources
 prepare-sources
 /backend/cpp/llama/grpc-server
 /backend/cpp/llama/llama.cpp
+/backend/cpp/llama-*
+
+*.log

 go-ggml-transformers
 go-gpt2
@@ -46,4 +49,7 @@ prepare
 *pb2_grpc.py

 # SonarQube
-.scannerwork
+.scannerwork
+
+# backend virtual environments
+**/venv
--- a/146
+++ b/146
@@ -1,6 +1,7 @@
 ARG IMAGE_TYPE=extras
 ARG BASE_IMAGE=ubuntu:22.04
 ARG GRPC_BASE_IMAGE=${BASE_IMAGE}
+ARG INTEL_BASE_IMAGE=${BASE_IMAGE}

 # The requirements-core target is common to all images.  It should not be placed in requirements-core unless every single build will use it.
 FROM ${BASE_IMAGE} AS requirements-core
@@ -12,13 +13,13 @@ ARG TARGETARCH
 ARG TARGETVARIANT

 ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"

-ARG GO_TAGS="stablediffusion tinydream tts"

 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
+        ccache \
        ca-certificates \
        cmake \
        curl \
@@ -76,26 +77,16 @@ RUN test -n "$TARGETARCH" \
 # The requirements-extras target is for any builds with IMAGE_TYPE=extras. It should not be placed in this target unless every IMAGE_TYPE=extras build will use it
 FROM requirements-core AS requirements-extras

-RUN apt-get update && \
-    apt-get install -y --no-install-recommends gpg && \
-    curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-    install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-    gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list && \
-    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list && \
-    apt-get update && \
-    apt-get install -y --no-install-recommends \
-        conda && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh
 ENV PATH="/root/.cargo/bin:${PATH}"

 RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        espeak-ng \
-        espeak && \
+        espeak \
+        python3-dev \
+        python3-venv && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

@@ -140,6 +131,29 @@ RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
        rm -rf /var/lib/apt/lists/* \
    ; fi

+RUN if [ "${BUILD_TYPE}" = "hipblas" ]; then \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            hipblas-dev \
+            rocblas-dev && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* && \
+        # I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
+        # to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
+        ldconfig \
+    ; fi
+
+###################################
+###################################
+
+# Temporary workaround for Intel's repository to work correctly
+# https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/APT-Repository-not-working-signatures-invalid/m-p/1599436/highlight/true#M36143
+# This is a temporary workaround until Intel fixes their repository
+FROM ${INTEL_BASE_IMAGE} AS intel
+RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | \
+gpg --yes --dearmor --output /usr/share/keyrings/intel-graphics.gpg
+RUN echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" > /etc/apt/sources.list.d/intel-graphics.list
+
 ###################################
 ###################################

@@ -182,7 +196,7 @@ RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shall
 # Adjustments to the build process should likely be made here.
 FROM requirements-drivers AS builder

-ARG GO_TAGS="stablediffusion tts"
+ARG GO_TAGS="stablediffusion tts p2p"
 ARG GRPC_BACKENDS
 ARG MAKEFLAGS

@@ -234,6 +248,7 @@ ARG FFMPEG
 ARG BUILD_TYPE
 ARG TARGETARCH
 ARG IMAGE_TYPE=extras
+ARG EXTRA_BACKENDS
 ARG MAKEFLAGS

 ENV BUILD_TYPE=${BUILD_TYPE}
@@ -245,7 +260,6 @@ ARG CUDA_MAJOR_VERSION=11
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
 ENV NVIDIA_VISIBLE_DEVICES=all
-ENV PIP_CACHE_PURGE=true

 # Add FFmpeg
 RUN if [ "${FFMPEG}" = "true" ]; then \
@@ -278,51 +292,61 @@ COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/
 # do not let stablediffusion rebuild (requires an older version of absl)
 COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion

-## Duplicated from Makefile to avoid having a big layer that's hard to push
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-    make -C backend/python/autogptq \
+# Change the shell to bash so we can use [[ tests below
+SHELL ["/bin/bash", "-c"]
+# We try to strike a balance between individual layer size (as that affects total push time) and total image size
+# Splitting the backends into more groups with fewer items results in a larger image, but a smaller size for the largest layer
+# Splitting the backends into fewer groups with more items results in a smaller image, but a larger size for the largest layer
+
+RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/coqui \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "parler-tts" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/parler-tts \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "diffusers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/diffusers \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/transformers-musicgen \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "exllama1" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/exllama \
    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-    make -C backend/python/bark \
+
+RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/vall-e-x \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "openvoice" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/openvoice \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "petals" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/petals \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "sentencetransformers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/sentencetransformers \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "exllama2" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/exllama2 \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "transformers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/transformers \
    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-    make -C backend/python/diffusers \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-    make -C backend/python/vllm \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-    make -C backend/python/mamba \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-    make -C backend/python/sentencetransformers \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-    make -C backend/python/rerankers \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-    make -C backend/python/transformers \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-    make -C backend/python/vall-e-x \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-    make -C backend/python/exllama \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-    make -C backend/python/exllama2 \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-    make -C backend/python/petals \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-    make -C backend/python/transformers-musicgen \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-    make -C backend/python/parler-tts \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-    make -C backend/python/coqui \
+
+RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/vllm \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "autogptq" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/autogptq \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "bark" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/bark \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "rerankers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/rerankers \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "mamba" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/mamba \
    ; fi

 # Make sure the models directory exists
--- a/108
+++ b/108
@@ -5,7 +5,7 @@ BINARY_NAME=local-ai

 # llama.cpp versions
 GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=6ecf3189e00a1e8e737a78b6d10e1d7006e050a2
+CPPLLAMA_VERSION?=74f33adf5f8b20b08fc5a6aa17ce081abe86ef2f

 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -16,10 +16,10 @@ RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
 RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6

 # whisper.cpp version
-WHISPER_CPP_VERSION?=8fac6455ffeb0a0950a84e790ddb74f7290d33c4
+WHISPER_CPP_VERSION?=22d46b7ba4620e2db1281e210d0186863cffcec0

 # bert.cpp version
-BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
+BERT_VERSION?=710044b124545415f555e4260d16b146c725a6e4

 # go-piper version
 PIPER_VERSION?=9d0100873a7dbb0824dfea40e8cec70a1b110759
@@ -38,7 +38,7 @@ CGO_LDFLAGS?=
 CGO_LDFLAGS_WHISPER?=
 CUDA_LIBPATH?=/usr/local/cuda/lib64/
 GO_TAGS?=
-BUILD_ID?=git
+BUILD_ID?=

 TEST_DIR=/tmp/test

@@ -70,7 +70,7 @@ UNAME_S := $(shell uname -s)
 endif

 ifeq ($(OS),Darwin)
-	
+
 	ifeq ($(OSX_SIGNING_IDENTITY),)
 		OSX_SIGNING_IDENTITY := $(shell security find-identity -v -p codesigning | grep '"' | head -n 1 | sed -E 's/.*"(.*)"/\1/')
 	endif
@@ -152,10 +152,14 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts)
 	OPTIONAL_GRPC+=backend-assets/grpc/piper
 endif

-ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface
+ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface
 ALL_GRPC_BACKENDS+=backend-assets/grpc/bert-embeddings
-ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp
+ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
+ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
+ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
+ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
+ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
 ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all
 ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
 ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
@@ -293,6 +297,7 @@ clean: ## Remove build related file
 	rm -rf backend-assets/*
 	$(MAKE) -C backend/cpp/grpc clean
 	$(MAKE) -C backend/cpp/llama clean
+	rm -rf backend/cpp/llama-* || true
 	$(MAKE) dropreplace
 	$(MAKE) protogen-clean
 	rmdir pkg/grpc/proto || true
@@ -311,14 +316,28 @@ build: prepare backend-assets grpcs ## Build the project
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./

 build-minimal:
-	BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS=backend-assets/grpc/llama-cpp GO_TAGS=none $(MAKE) build
+	BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=none $(MAKE) build

 build-api:
 	BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=none $(MAKE) build

-dist: build
+dist:
+	STATIC=true $(MAKE) backend-assets/grpc/llama-cpp-avx2
+ifeq ($(OS),Darwin)
+	$(info ${GREEN}I Skip CUDA build on MacOS${RESET})
+else
+	$(MAKE) backend-assets/grpc/llama-cpp-cuda
+endif
+	$(MAKE) build
 	mkdir -p release
+# if BUILD_ID is empty, then we don't append it to the binary name
+ifeq ($(BUILD_ID),)
+	cp $(BINARY_NAME) release/$(BINARY_NAME)-$(OS)-$(ARCH)
+	shasum -a 256 release/$(BINARY_NAME)-$(OS)-$(ARCH) > release/$(BINARY_NAME)-$(OS)-$(ARCH).sha256
+else
 	cp $(BINARY_NAME) release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-$(ARCH)
+	shasum -a 256 release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-$(ARCH) > release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-$(ARCH).sha256
+endif

 osx-signed: build
 	codesign --deep --force --sign "$(OSX_SIGNING_IDENTITY)" --entitlements "./Entitlements.plist" "./$(BINARY_NAME)"
@@ -437,10 +456,10 @@ protogen-go-clean:
 	$(RM) bin/*

 .PHONY: protogen-python
-protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen petals-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen
+protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen petals-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen

 .PHONY: protogen-python-clean
-protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean petals-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean
+protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean petals-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean

 .PHONY: autogptq-protogen
 autogptq-protogen:
@@ -554,6 +573,14 @@ vall-e-x-protogen:
 vall-e-x-protogen-clean:
 	$(MAKE) -C backend/python/vall-e-x protogen-clean

+.PHONY: openvoice-protogen
+openvoice-protogen:
+	$(MAKE) -C backend/python/openvoice protogen
+
+.PHONY: openvoice-protogen-clean
+openvoice-protogen-clean:
+	$(MAKE) -C backend/python/openvoice protogen-clean
+
 .PHONY: vllm-protogen
 vllm-protogen:
 	$(MAKE) -C backend/python/vllm protogen
@@ -577,6 +604,7 @@ prepare-extra-conda-environments: protogen-python
 	$(MAKE) -C backend/python/transformers-musicgen
 	$(MAKE) -C backend/python/parler-tts
 	$(MAKE) -C backend/python/vall-e-x
+	$(MAKE) -C backend/python/openvoice
 	$(MAKE) -C backend/python/exllama
 	$(MAKE) -C backend/python/petals
 	$(MAKE) -C backend/python/exllama2
@@ -616,8 +644,8 @@ backend-assets/grpc/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/go
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./backend/go/llm/gpt4all/

-backend-assets/grpc/langchain-huggingface: backend-assets/grpc
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/langchain-huggingface ./backend/go/llm/langchain/
+backend-assets/grpc/huggingface: backend-assets/grpc
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/huggingface ./backend/go/llm/langchain/

 backend/cpp/llama/llama.cpp:
 	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama llama.cpp
@@ -629,7 +657,7 @@ ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
 				 -Dutf8_range_DIR=${INSTALLED_LIB_CMAKE}/utf8_range \
 				 -DgRPC_DIR=${INSTALLED_LIB_CMAKE}/grpc \
 				 -DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=${INSTALLED_PACKAGES}/include
-backend/cpp/llama/grpc-server:
+build-llama-cpp-grpc-server:
 # Conditionally build grpc for the llama backend to use if needed
 ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
 	$(MAKE) -C backend/cpp/grpc build
@@ -638,19 +666,55 @@ ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
 	PATH="${INSTALLED_PACKAGES}/bin:${PATH}" \
 	CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" \
 	LLAMA_VERSION=$(CPPLLAMA_VERSION) \
-	$(MAKE) -C backend/cpp/llama grpc-server
+	$(MAKE) -C backend/cpp/${VARIANT} grpc-server
 else
 	echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
-	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
+	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/${VARIANT} grpc-server
 endif

-backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/grpc-server
-	cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp
+backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc
+	cp -rf backend/cpp/llama backend/cpp/llama-avx2
+	$(MAKE) -C backend/cpp/llama-avx2 purge
+	$(info ${GREEN}I llama-cpp build info:avx2${RESET})
+	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2
+
+backend-assets/grpc/llama-cpp-avx: backend-assets/grpc
+	cp -rf backend/cpp/llama backend/cpp/llama-avx
+	$(MAKE) -C backend/cpp/llama-avx purge
+	$(info ${GREEN}I llama-cpp build info:avx${RESET})
+	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-avx" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-avx/grpc-server backend-assets/grpc/llama-cpp-avx
+
+backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc
+	cp -rf backend/cpp/llama backend/cpp/llama-fallback
+	$(MAKE) -C backend/cpp/llama-fallback purge
+	$(info ${GREEN}I llama-cpp build info:fallback${RESET})
+	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-fallback/grpc-server backend-assets/grpc/llama-cpp-fallback
 # TODO: every binary should have its own folder instead, so can have different metal implementations
 ifeq ($(BUILD_TYPE),metal)
-	cp backend/cpp/llama/llama.cpp/build/bin/default.metallib backend-assets/grpc/
+	cp backend/cpp/llama-fallback/llama.cpp/build/bin/default.metallib backend-assets/grpc/
 endif

+backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc
+	cp -rf backend/cpp/llama backend/cpp/llama-cuda
+	$(MAKE) -C backend/cpp/llama-cuda purge
+	$(info ${GREEN}I llama-cpp build info:cuda${RESET})
+	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-cuda/grpc-server backend-assets/grpc/llama-cpp-cuda
+
+backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc
+	cp -rf backend/cpp/llama backend/cpp/llama-grpc
+	$(MAKE) -C backend/cpp/llama-grpc purge
+	$(info ${GREEN}I llama-cpp build info:grpc${RESET})
+	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_RPC=ON -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-grpc/grpc-server backend-assets/grpc/llama-cpp-grpc
+
+backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
+	mkdir -p backend-assets/util/
+	cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
+
 backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
@@ -693,7 +757,7 @@ docker:
 		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
 		--build-arg BUILD_TYPE=$(BUILD_TYPE) \
 		-t $(DOCKER_IMAGE) .
-	
+
 docker-aio:
 	@echo "Building AIO image with base $(BASE_IMAGE) as $(DOCKER_AIO_IMAGE)"
 	docker build \
@@ -724,3 +788,7 @@ docker-image-intel-xpu:
 .PHONY: swagger
 swagger:
 	swag init -g core/http/app.go --output swagger
+
+.PHONY: gen-assets
+gen-assets:
+	$(GOCMD) run core/dependencies_manager/manager.go embedded/webui_static.yaml core/http/static/assets
--- a/README.md
+++ b/README.md
@@ -46,17 +46,31 @@

 **LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that’s compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).

+![screen](https://github.com/mudler/LocalAI/assets/2420543/20b5ccd2-8393-44f0-aaf6-87a23806381e)
+
+```bash
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
+# Alternative images:
+# - if you have an Nvidia GPU:
+# docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12
+# - without preconfigured models
+# docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
+# - without preconfigured models for Nvidia GPUs
+# docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12 
+```
+
+[💻 Getting started](https://localai.io/basics/getting_started/index.html)
+
 ## 🔥🔥 Hot topics / Roadmap

 [Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

+- 🔥🔥 Decentralized llama.cpp:  https://github.com/mudler/LocalAI/pull/2343 (peer2peer llama.cpp!)
+- 🔥🔥 Openvoice: https://github.com/mudler/LocalAI/pull/2334
+- 🆕 Function calls without grammars and mixed mode: https://github.com/mudler/LocalAI/pull/2328
+- 🔥🔥 Distributed inferencing: https://github.com/mudler/LocalAI/pull/2324
+- Chat, TTS, and Image generation in the WebUI: https://github.com/mudler/LocalAI/pull/2222
 - Reranker API: https://github.com/mudler/LocalAI/pull/2121
- Gallery WebUI: https://github.com/mudler/LocalAI/pull/2104
- llama3: https://github.com/mudler/LocalAI/discussions/2076
- Parler-TTS: https://github.com/mudler/LocalAI/pull/2027
- Openvino support: https://github.com/mudler/LocalAI/pull/1892
- Vector store: https://github.com/mudler/LocalAI/pull/1795
- All-in-one container image: https://github.com/mudler/LocalAI/issues/1855

 Hot topics (looking for contributors):

@@ -69,18 +83,6 @@ Hot topics (looking for contributors):

 If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22

-## 💻 [Getting started](https://localai.io/basics/getting_started/index.html)
-
-For a detailed step-by-step introduction, refer to the [Getting Started](https://localai.io/basics/getting_started/index.html) guide. 
-
-For those in a hurry, here's a straightforward one-liner to launch a LocalAI AIO(All-in-one) Image using `docker`:
-
-```bash
-docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
-# or, if you have an Nvidia GPU:
-# docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12
-```
-
 ## 🚀 [Features](https://localai.io/features/)

 - 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
@@ -113,8 +115,9 @@ Model galleries
 Other:
 - Helm chart https://github.com/go-skynet/helm-charts
 - VSCode extension https://github.com/badgooooor/localai-vscode-plugin
+- Terminal utility https://github.com/djcopley/ShellOracle
 - Local Smart assistant https://github.com/mudler/LocalAGI
- Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation
+- Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation / https://github.com/valentinfrlch/ha-gpt4vision
 - Discord bot https://github.com/mudler/LocalAGI/tree/main/examples/discord
 - Slack bot https://github.com/mudler/LocalAGI/tree/main/examples/slack
 - Telegram bot https://github.com/mudler/LocalAI/tree/master/examples/telegram-bot
@@ -131,7 +134,7 @@ Other:

 ## :book: 🎥 [Media, Blogs, Social](https://localai.io/basics/news/#media-blogs-social)

- [Run LocalAI on AWS EKS with Pulumi](https://www.pulumi.com/ai/answers/tiZMDoZzZV6TLxgDXNBnFE/deploying-helm-charts-on-aws-eks)
+- [Run LocalAI on AWS EKS with Pulumi](https://www.pulumi.com/blog/low-code-llm-apps-with-local-ai-flowise-and-pulumi/)
 - [Run LocalAI on AWS](https://staleks.hashnode.dev/installing-localai-on-aws-ec2-instance)
 - [Create a slackbot for teams and OSS projects that answer to documentation](https://mudler.pm/posts/smart-slackbot-for-teams/)
 - [LocalAI meets k8sgpt](https://www.youtube.com/watch?v=PKrDNuJ_dfE)
--- a/aio/cpu/text-to-text.yaml
+++ b/aio/cpu/text-to-text.yaml
@@ -2,8 +2,63 @@ name: gpt-4
 mmap: true
 parameters:
  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
+context_size: 8192
+
+stopwords:
+- "<|im_end|>"
+- "<dummy32000>"
+- "</tool_call>"
+- "<|eot_id|>"
+- "<|end_of_text|>"
+
+function:
+  # disable injecting the "answer" tool
+  disable_no_action: true
+
+  grammar:
+    # This allows the grammar to also return messages
+    mixed_mode: true
+    # Suffix to add to the grammar
+    #prefix: '<tool_call>\n'
+    # Force parallel calls in the grammar
+    # parallel_calls: true
+
+  return_name_in_function_response: true
+  # Without grammar uncomment the lines below
+  # Warning: this is relying only on the capability of the
+  # LLM model to generate the correct function call.
+  json_regex_match: 
+   - "(?s)<tool_call>(.*?)</tool_call>"
+   - "(?s)<tool_call>(.*?)"
+  replace_llm_results:
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""
+  replace_function_results: 
+  # Replace everything that is not JSON array or object
+  # 
+  - key: '(?s)^[^{\[]*'
+    value: ""
+  - key: '(?s)[^}\]]*$'
+    value: ""
+  - key: "'([^']*?)'"
+    value: "_DQUOTE_${1}_DQUOTE_"
+  - key: '\\"'
+    value: "__TEMP_QUOTE__"
+  - key: "\'"
+    value: "'"
+  - key: "_DQUOTE_"
+    value: '"'
+  - key: "__TEMP_QUOTE__"
+    value: '"'
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""

 template:
+  chat: |
+    {{.Input -}}
+    <|im_start|>assistant
  chat_message: |
    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
    {{- if .FunctionCall }}
@@ -22,38 +77,25 @@ template:
    {{- else if eq .RoleName "tool" }}
    </tool_response>
    {{- end }}<|im_end|>
-  # https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
-  function: |
+  completion: |
+    {{.Input}}
+  function: |-
    <|im_start|>system
-    You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
+    You are a function calling AI model.
+    Here are the available tools:
    <tools>
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
    </tools>
-    Use the following pydantic model json schema for each tool call you will make:
-    {'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}
-    For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
+    You should call the tools provided to you sequentially
+    Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
+    <scratchpad>
+    {step-by-step reasoning and plan in bullet points}
+    </scratchpad>
+    For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
    <tool_call>
-    {'arguments': <args-dict>, 'name': <function-name>}
+    {"arguments": <args-dict>, "name": <function-name>}
    </tool_call><|im_end|>
    {{.Input -}}
    <|im_start|>assistant
-    <tool_call>
-  chat: |
-    {{.Input -}}
-    <|im_start|>assistant
-  completion: |
-    {{.Input}}
-context_size: 4096
-f16: true
-stopwords:
- <|im_end|>
- <dummy32000>
- "\n</tool_call>"
- "\n\n\n"
-usage: |
-      curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
-          "model": "gpt-4",
-          "messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
-      }'
--- a/aio/gpu-8g/text-to-text.yaml
+++ b/aio/gpu-8g/text-to-text.yaml
@@ -2,8 +2,63 @@ name: gpt-4
 mmap: true
 parameters:
  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
+context_size: 8192
+
+stopwords:
+- "<|im_end|>"
+- "<dummy32000>"
+- "</tool_call>"
+- "<|eot_id|>"
+- "<|end_of_text|>"
+
+function:
+  # disable injecting the "answer" tool
+  disable_no_action: true
+
+  grammar:
+    # This allows the grammar to also return messages
+    mixed_mode: true
+    # Suffix to add to the grammar
+    #prefix: '<tool_call>\n'
+    # Force parallel calls in the grammar
+    # parallel_calls: true
+
+  return_name_in_function_response: true
+  # Without grammar uncomment the lines below
+  # Warning: this is relying only on the capability of the
+  # LLM model to generate the correct function call.
+  json_regex_match: 
+   - "(?s)<tool_call>(.*?)</tool_call>"
+   - "(?s)<tool_call>(.*?)"
+  replace_llm_results:
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""
+  replace_function_results: 
+  # Replace everything that is not JSON array or object
+  # 
+  - key: '(?s)^[^{\[]*'
+    value: ""
+  - key: '(?s)[^}\]]*$'
+    value: ""
+  - key: "'([^']*?)'"
+    value: "_DQUOTE_${1}_DQUOTE_"
+  - key: '\\"'
+    value: "__TEMP_QUOTE__"
+  - key: "\'"
+    value: "'"
+  - key: "_DQUOTE_"
+    value: '"'
+  - key: "__TEMP_QUOTE__"
+    value: '"'
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""

 template:
+  chat: |
+    {{.Input -}}
+    <|im_start|>assistant
  chat_message: |
    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
    {{- if .FunctionCall }}
@@ -22,38 +77,25 @@ template:
    {{- else if eq .RoleName "tool" }}
    </tool_response>
    {{- end }}<|im_end|>
-  # https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
-  function: |
+  completion: |
+    {{.Input}}
+  function: |-
    <|im_start|>system
-    You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
+    You are a function calling AI model.
+    Here are the available tools:
    <tools>
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
    </tools>
-    Use the following pydantic model json schema for each tool call you will make:
-    {'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}
-    For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
+    You should call the tools provided to you sequentially
+    Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
+    <scratchpad>
+    {step-by-step reasoning and plan in bullet points}
+    </scratchpad>
+    For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
    <tool_call>
-    {'arguments': <args-dict>, 'name': <function-name>}
+    {"arguments": <args-dict>, "name": <function-name>}
    </tool_call><|im_end|>
    {{.Input -}}
-    <|im_start|>assistant
-    <tool_call>
-  chat: |
-    {{.Input -}}
-    <|im_start|>assistant
-  completion: |
-    {{.Input}}
-context_size: 4096
-f16: true
-stopwords:
- <|im_end|>
- <dummy32000>
- "\n</tool_call>"
- "\n\n\n"
-usage: |
-      curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
-          "model": "gpt-4",
-          "messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
-      }'
+    <|im_start|>assistant
--- a/aio/intel/text-to-text.yaml
+++ b/aio/intel/text-to-text.yaml
@@ -1,10 +1,66 @@
 name: gpt-4
 mmap: false
+context_size: 8192
+
 f16: false
 parameters:
  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf

+stopwords:
+- "<|im_end|>"
+- "<dummy32000>"
+- "</tool_call>"
+- "<|eot_id|>"
+- "<|end_of_text|>"
+
+function:
+  # disable injecting the "answer" tool
+  disable_no_action: true
+
+  grammar:
+    # This allows the grammar to also return messages
+    mixed_mode: true
+    # Suffix to add to the grammar
+    #prefix: '<tool_call>\n'
+    # Force parallel calls in the grammar
+    # parallel_calls: true
+
+  return_name_in_function_response: true
+  # Without grammar uncomment the lines below
+  # Warning: this is relying only on the capability of the
+  # LLM model to generate the correct function call.
+  json_regex_match: 
+   - "(?s)<tool_call>(.*?)</tool_call>"
+   - "(?s)<tool_call>(.*?)"
+  replace_llm_results:
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""
+  replace_function_results: 
+  # Replace everything that is not JSON array or object
+  # 
+  - key: '(?s)^[^{\[]*'
+    value: ""
+  - key: '(?s)[^}\]]*$'
+    value: ""
+  - key: "'([^']*?)'"
+    value: "_DQUOTE_${1}_DQUOTE_"
+  - key: '\\"'
+    value: "__TEMP_QUOTE__"
+  - key: "\'"
+    value: "'"
+  - key: "_DQUOTE_"
+    value: '"'
+  - key: "__TEMP_QUOTE__"
+    value: '"'
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""
+
 template:
+  chat: |
+    {{.Input -}}
+    <|im_start|>assistant
  chat_message: |
    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
    {{- if .FunctionCall }}
@@ -23,37 +79,25 @@ template:
    {{- else if eq .RoleName "tool" }}
    </tool_response>
    {{- end }}<|im_end|>
-  # https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
-  function: |
+  completion: |
+    {{.Input}}
+  function: |-
    <|im_start|>system
-    You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
+    You are a function calling AI model.
+    Here are the available tools:
    <tools>
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
    </tools>
-    Use the following pydantic model json schema for each tool call you will make:
-    {'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}
-    For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
+    You should call the tools provided to you sequentially
+    Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
+    <scratchpad>
+    {step-by-step reasoning and plan in bullet points}
+    </scratchpad>
+    For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
    <tool_call>
-    {'arguments': <args-dict>, 'name': <function-name>}
+    {"arguments": <args-dict>, "name": <function-name>}
    </tool_call><|im_end|>
    {{.Input -}}
    <|im_start|>assistant
-    <tool_call>
-  chat: |
-    {{.Input -}}
-    <|im_start|>assistant
-  completion: |
-    {{.Input}}
-context_size: 4096
-stopwords:
- <|im_end|>
- "\n</tool_call>"
- <dummy32000>
- "\n\n\n"
-usage: |
-      curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
-          "model": "gpt-4",
-          "messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
-      }'
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -212,6 +212,9 @@ message ModelOptions {
  float YarnBetaSlow = 47;

  string Type = 49;
+
+  bool FlashAttention = 56;
+  bool NoKVOffload = 57;
 }

 message Result {
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -43,35 +43,27 @@ llama.cpp:

 llama.cpp/examples/grpc-server: llama.cpp
 	mkdir -p llama.cpp/examples/grpc-server
-	cp -r $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
-	cp -r $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/
-	cp -rfv $(abspath ./)/json.hpp llama.cpp/examples/grpc-server/
-	cp -rfv $(abspath ./)/utils.hpp llama.cpp/examples/grpc-server/
-	echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
-## XXX: In some versions of CMake clip wasn't being built before llama.
-## This is an hack for now, but it should be fixed in the future.
-	cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
-	cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
-	echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
-	cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
-	cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
+	bash prepare.sh

 rebuild:
-	cp -rfv $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
-	cp -rfv $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/
-	cp -rfv $(abspath ./)/json.hpp llama.cpp/examples/grpc-server/
+	bash prepare.sh
 	rm -rf grpc-server
 	$(MAKE) grpc-server

-clean:
-	rm -rf llama.cpp
+purge:
+	rm -rf llama.cpp/build
+	rm -rf llama.cpp/examples/grpc-server
 	rm -rf grpc-server

+clean: purge
+	rm -rf llama.cpp
+
 grpc-server: llama.cpp llama.cpp/examples/grpc-server
+	@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 	bash -c "source $(ONEAPI_VARS); \
-	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release"	
+	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && $(MAKE)"
 else
-	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release
+	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && $(MAKE)
 endif
 	cp llama.cpp/build/bin/grpc-server .
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -791,7 +791,7 @@ struct llama_server_context
                    sampler_names.emplace_back(sampler_name);
                }
            }
-            slot->sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
+            slot->sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
        }
        else
        {
@@ -1146,7 +1146,7 @@ struct llama_server_context
        std::vector<std::string> samplers_sequence;
        for (const auto &sampler_type : slot.sparams.samplers_sequence)
        {
-            samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
+            samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
        }

        return json {
@@ -2217,6 +2217,12 @@ static void params_parse(const backend::ModelOptions* request,
    } else {
        params.n_parallel = 1;
    }
+
+    const char *llama_grpc_servers = std::getenv("LLAMACPP_GRPC_SERVERS");
+    if (llama_grpc_servers != NULL) {
+        params.rpc_servers = std::string(llama_grpc_servers);
+    }
+    
    // TODO: Add yarn

    if (!request->tensorsplit().empty()) {
@@ -2254,6 +2260,9 @@ static void params_parse(const backend::ModelOptions* request,
    }
    params.use_mlock = request->mlock();
    params.use_mmap = request->mmap();
+    params.flash_attn = request->flashattention();
+    params.no_kv_offload = request->nokvoffload();
+
    params.embedding = request->embeddings();

    if (request->ropescaling() == "none")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
--- a/backend/cpp/llama/prepare.sh
+++ b/backend/cpp/llama/prepare.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
+cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
+cp -rfv json.hpp llama.cpp/examples/grpc-server/
+cp -rfv utils.hpp llama.cpp/examples/grpc-server/
+    
+if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then
+    echo "grpc-server already added"
+else
+    echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
+fi
+
+## XXX: In some versions of CMake clip wasn't being built before llama.
+## This is an hack for now, but it should be fixed in the future.
+cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
+cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
+echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
+cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
+cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
--- a/backend/go/llm/langchain/langchain.go
+++ b/backend/go/llm/langchain/langchain.go
@@ -4,6 +4,7 @@ package main
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"fmt"
+	"os"

 	"github.com/go-skynet/LocalAI/pkg/grpc/base"
 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
@@ -18,9 +19,14 @@ type LLM struct {
 }

 func (llm *LLM) Load(opts *pb.ModelOptions) error {
-	llm.langchain, _ = langchain.NewHuggingFace(opts.Model)
+	var err error
+	hfToken := os.Getenv("HUGGINGFACEHUB_API_TOKEN")
+	if hfToken == "" {
+		return fmt.Errorf("no huggingface token provided")
+	}
+	llm.langchain, err = langchain.NewHuggingFace(opts.Model, hfToken)
 	llm.model = opts.Model
-	return nil
+	return err
 }

 func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
--- a/backend/python/autogptq/Makefile
+++ b/backend/python/autogptq/Makefile
@@ -1,6 +1,6 @@
 .PHONY: autogptq
 autogptq: protogen
-	$(MAKE) -C ../common-env/transformers
+	bash install.sh

 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
@@ -10,4 +10,8 @@ protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py

 backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
+
+.PHONY: clean
+clean: protogen-clean
+	rm -rf venv __pycache__
--- a/backend/python/autogptq/autogptq.yml
+++ b/backend/python/autogptq/autogptq.yml
@@ -1,93 +0,0 @@
-####
-# Attention! This file is abandoned. 
-# Please use the ../common-env/transformers/transformers.yml file to manage dependencies.
-###
-name: autogptq
-channels:
-  - defaults
-dependencies:
-  - _libgcc_mutex=0.1=main
-  - _openmp_mutex=5.1=1_gnu
-  - bzip2=1.0.8=h7b6447c_0
-  - ca-certificates=2023.08.22=h06a4308_0
-  - ld_impl_linux-64=2.38=h1181459_1
-  - libffi=3.4.4=h6a678d5_0
-  - libgcc-ng=11.2.0=h1234567_1
-  - libgomp=11.2.0=h1234567_1
-  - libstdcxx-ng=11.2.0=h1234567_1
-  - libuuid=1.41.5=h5eee18b_0
-  - ncurses=6.4=h6a678d5_0
-  - openssl=3.0.11=h7f8727e_2
-  - pip=23.2.1=py311h06a4308_0
-  - python=3.11.5=h955ad1f_0
-  - readline=8.2=h5eee18b_0
-  - setuptools=68.0.0=py311h06a4308_0
-  - sqlite=3.41.2=h5eee18b_0
-  - tk=8.6.12=h1ccaba5_0
-  - wheel=0.41.2=py311h06a4308_0
-  - xz=5.4.2=h5eee18b_0
-  - zlib=1.2.13=h5eee18b_0
-  - pip:
-      - accelerate==0.27.0
-      - aiohttp==3.8.5
-      - aiosignal==1.3.1
-      - async-timeout==4.0.3
-      - attrs==23.1.0
-      - auto-gptq==0.7.1
-      - certifi==2023.7.22
-      - charset-normalizer==3.3.0
-      - datasets==2.14.5
-      - dill==0.3.7
-      - filelock==3.12.4
-      - frozenlist==1.4.0
-      - fsspec==2023.6.0
-      - grpcio==1.63.0
-      - huggingface-hub==0.16.4
-      - idna==3.4
-      - jinja2==3.1.2
-      - markupsafe==2.1.3
-      - mpmath==1.3.0
-      - multidict==6.0.4
-      - multiprocess==0.70.15
-      - networkx==3.1
-      - numpy==1.26.0
-      - nvidia-cublas-cu12==12.1.3.1
-      - nvidia-cuda-cupti-cu12==12.1.105
-      - nvidia-cuda-nvrtc-cu12==12.1.105
-      - nvidia-cuda-runtime-cu12==12.1.105
-      - nvidia-cudnn-cu12==8.9.2.26
-      - nvidia-cufft-cu12==11.0.2.54
-      - nvidia-curand-cu12==10.3.2.106
-      - nvidia-cusolver-cu12==11.4.5.107
-      - nvidia-cusparse-cu12==12.1.0.106
-      - nvidia-nccl-cu12==2.18.1
-      - nvidia-nvjitlink-cu12==12.2.140
-      - nvidia-nvtx-cu12==12.1.105
-      - optimum==1.17.1
-      - packaging==23.2
-      - pandas==2.1.1
-      - peft==0.5.0
-      - protobuf==4.24.4
-      - psutil==5.9.5
-      - pyarrow==13.0.0
-      - python-dateutil==2.8.2
-      - pytz==2023.3.post1
-      - pyyaml==6.0.1
-      - regex==2023.10.3
-      - requests==2.31.0
-      - rouge==1.0.1
-      - safetensors>=0.3.3
-      - six==1.16.0
-      - sympy==1.12
-      - tokenizers==0.14.0
-      - tqdm==4.66.1
-      - torch==2.2.1
-      - torchvision==0.17.1
-      - transformers==4.34.0
-      - transformers_stream_generator==0.0.5
-      - triton==2.1.0
-      - typing-extensions==4.8.0
-      - tzdata==2023.3
-      - urllib3==2.0.6
-      - xxhash==3.4.1
-      - yarl==1.9.2
--- a/backend/python/autogptq/autogptq.py
+++ b/backend/python/autogptq/autogptq.py
--- a/backend/python/autogptq/install.sh
+++ b/backend/python/autogptq/install.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -e
+
+source $(dirname $0)/../common/libbackend.sh
+
+# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
+# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
+# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
+# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
+if [ "x${BUILD_PROFILE}" == "xintel" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
+fi
+
+installRequirements
--- a/backend/python/autogptq/requirements-intel.txt
+++ b/backend/python/autogptq/requirements-intel.txt
@@ -0,0 +1,5 @@
+--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+intel-extension-for-pytorch
+torch
+optimum[openvino]
+setuptools
--- a/backend/python/autogptq/requirements.txt
+++ b/backend/python/autogptq/requirements.txt
@@ -0,0 +1,7 @@
+accelerate
+auto-gptq==0.7.1
+grpcio==1.64.0
+protobuf
+torch
+certifi
+transformers
--- a/backend/python/autogptq/run.sh
+++ b/backend/python/autogptq/run.sh
@@ -1,14 +1,4 @@
 #!/bin/bash
+source $(dirname $0)/../common/libbackend.sh

-##
-## A bash script wrapper that runs the autogptq server with conda
-
-export PATH=$PATH:/opt/conda/bin
-
-# Activate conda environment
-source activate transformers
-
-# get the directory where the bash script is located
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-python $DIR/autogptq.py $@
+startBackend $@
--- a/backend/python/autogptq/test.sh
+++ b/backend/python/autogptq/test.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+set -e
+
+source $(dirname $0)/../common/libbackend.sh
+
+runUnittests
--- a/backend/python/bark/Makefile
+++ b/backend/python/bark/Makefile
@@ -1,6 +1,6 @@
 .PHONY: ttsbark
 ttsbark: protogen
-	$(MAKE) -C ../common-env/transformers
+	bash install.sh

 .PHONY: run
 run: protogen
@@ -22,4 +22,8 @@ protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py

 backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
+
+.PHONY: clean
+clean: protogen-clean
+	rm -rf venv __pycache__
--- a/backend/python/bark/backend.py
+++ b/backend/python/bark/backend.py
--- a/backend/python/bark/install.sh
+++ b/backend/python/bark/install.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -e
+
+source $(dirname $0)/../common/libbackend.sh
+
+# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
+# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
+# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
+# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
+if [ "x${BUILD_PROFILE}" == "xintel" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
+fi
+
+installRequirements
--- a/backend/python/bark/requirements-intel.txt
+++ b/backend/python/bark/requirements-intel.txt
@@ -0,0 +1,6 @@
+--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+intel-extension-for-pytorch
+torch
+torchaudio
+optimum[openvino]
+setuptools
--- a/backend/python/bark/requirements.txt
+++ b/backend/python/bark/requirements.txt
@@ -0,0 +1,6 @@
+accelerate
+bark==0.1.5
+grpcio==1.64.0
+protobuf
+certifi
+transformers
--- a/backend/python/bark/run.sh
+++ b/backend/python/bark/run.sh
@@ -1,14 +1,4 @@
 #!/bin/bash
+source $(dirname $0)/../common/libbackend.sh

-##
-## A bash script wrapper that runs the ttsbark server with conda
-
-export PATH=$PATH:/opt/conda/bin
-
-# Activate conda environment
-source activate transformers
-
-# get the directory where the bash script is located
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-python $DIR/ttsbark.py $@
+startBackend $@
--- a/backend/python/bark/test.py
+++ b/backend/python/bark/test.py
@@ -18,7 +18,7 @@ class TestBackendServicer(unittest.TestCase):
        """
        This method sets up the gRPC service by starting the server
        """
-        self.service = subprocess.Popen(["python3", "ttsbark.py", "--addr", "localhost:50051"])
+        self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
        time.sleep(10)

    def tearDown(self) -> None:
--- a/backend/python/bark/test.sh
+++ b/backend/python/bark/test.sh
@@ -1,11 +1,6 @@
 #!/bin/bash
-##
-## A bash script wrapper that runs the bark server with conda
+set -e

-# Activate conda environment
-source activate transformers
+source $(dirname $0)/../common/libbackend.sh

-# get the directory where the bash script is located
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-python -m unittest $DIR/test.py
+runUnittests
--- a/backend/python/common-env/transformers/Makefile
+++ b/backend/python/common-env/transformers/Makefile
@@ -1,21 +0,0 @@
-CONDA_ENV_PATH = "transformers.yml"
-
-ifeq ($(BUILD_TYPE), cublas)
-	CONDA_ENV_PATH = "transformers-nvidia.yml"
-endif
-
-ifeq ($(BUILD_TYPE), hipblas)
-	CONDA_ENV_PATH = "transformers-rocm.yml"
-endif
-
-# Intel GPU are supposed to have dependencies installed in the main python
-# environment, so we skip conda installation for SYCL builds.
-# https://github.com/intel/intel-extension-for-pytorch/issues/538
-ifneq (,$(findstring sycl,$(BUILD_TYPE)))
-export SKIP_CONDA=1
-endif
-
-.PHONY: transformers
-transformers:
-	@echo "Installing $(CONDA_ENV_PATH)..."
-	bash install.sh $(CONDA_ENV_PATH)
--- a/backend/python/common-env/transformers/install.sh
+++ b/backend/python/common-env/transformers/install.sh
@@ -1,44 +0,0 @@
-#!/bin/bash
-set -ex
-
-SKIP_CONDA=${SKIP_CONDA:-0}
-REQUIREMENTS_FILE=$1
-
-# Check if environment exist
-conda_env_exists(){
-    ! conda list --name "${@}" >/dev/null 2>/dev/null
-}
-
-if [ $SKIP_CONDA -eq 1 ]; then
-    echo "Skipping conda environment installation"
-else
-    export PATH=$PATH:/opt/conda/bin
-    if conda_env_exists "transformers" ; then
-        echo "Creating virtual environment..."
-        conda env create --name transformers --file $REQUIREMENTS_FILE
-        echo "Virtual environment created."
-    else 
-        echo "Virtual environment already exists."
-    fi
-fi
-
-if [ -d "/opt/intel" ]; then
-    # Intel GPU: If the directory exists, we assume we are using the intel image
-    # (no conda env)
-    # https://github.com/intel/intel-extension-for-pytorch/issues/538
-    pip install torch==2.1.0.post0 torchvision==0.16.0.post0 torchaudio==2.1.0.post0 intel-extension-for-pytorch==2.1.20+xpu oneccl_bind_pt==2.1.200+xpu intel-extension-for-transformers datasets sentencepiece tiktoken neural_speed optimum[openvino] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-fi
-
-# If we didn't skip conda, activate the environment
-# to install FlashAttention
-if [ $SKIP_CONDA -eq 0 ]; then
-    source activate transformers
-fi
-if [[ $REQUIREMENTS_FILE =~ -nvidia.yml$ ]]; then
-    #TODO: FlashAttention is supported on nvidia and ROCm, but ROCm install can't be done this easily
-    pip install flash-attn --no-build-isolation
-fi
-
-if [ "$PIP_CACHE_PURGE" = true ] ; then
-    pip cache purge
-fi
--- a/backend/python/common-env/transformers/transformers-nvidia.yml
+++ b/backend/python/common-env/transformers/transformers-nvidia.yml
@@ -1,125 +0,0 @@
-name: transformers
-channels:
-  - defaults
-dependencies:
-  - _libgcc_mutex=0.1=main
-  - _openmp_mutex=5.1=1_gnu
-  - bzip2=1.0.8=h7b6447c_0
-  - ca-certificates=2023.08.22=h06a4308_0
-  - ld_impl_linux-64=2.38=h1181459_1
-  - libffi=3.4.4=h6a678d5_0
-  - libgcc-ng=11.2.0=h1234567_1
-  - libgomp=11.2.0=h1234567_1
-  - libstdcxx-ng=11.2.0=h1234567_1
-  - libuuid=1.41.5=h5eee18b_0
-  - ncurses=6.4=h6a678d5_0
-  - openssl=3.0.11=h7f8727e_2
-  - pip=23.2.1=py311h06a4308_0
-  - python=3.11.5=h955ad1f_0
-  - readline=8.2=h5eee18b_0
-  - setuptools=68.0.0=py311h06a4308_0
-  - sqlite=3.41.2=h5eee18b_0
-  - tk=8.6.12=h1ccaba5_0
-  - wheel=0.41.2=py311h06a4308_0
-  - xz=5.4.2=h5eee18b_0
-  - zlib=1.2.13=h5eee18b_0
-  - pip:
-      - accelerate==0.27.0
-      - aiohttp==3.8.5
-      - aiosignal==1.3.1
-      - async-timeout==4.0.3
-      - auto-gptq==0.7.1
-      - attrs==23.1.0
-      - bark==0.1.5
-      - bitsandbytes==0.43.0
-      - boto3==1.28.61
-      - botocore==1.31.61
-      - certifi==2023.7.22
-      - TTS==0.22.0
-      - charset-normalizer==3.3.0
-      - datasets==2.14.5
-      - sentence-transformers==2.5.1 # Updated Version
-      - sentencepiece==0.1.99
-      - dill==0.3.7
-      - einops==0.7.0
-      - encodec==0.1.1
-      - filelock==3.12.4
-      - frozenlist==1.4.0
-      - fsspec==2023.6.0
-      - funcy==2.0
-      - grpcio==1.63.0
-      - huggingface-hub
-      - idna==3.4
-      - jinja2==3.1.2
-      - jmespath==1.0.1
-      - markupsafe==2.1.3
-      - mpmath==1.3.0
-      - multidict==6.0.4
-      - multiprocess==0.70.15
-      - networkx
-      - numpy==1.26.0
-      - nvidia-cublas-cu12==12.1.3.1
-      - nvidia-cuda-cupti-cu12==12.1.105
-      - nvidia-cuda-nvrtc-cu12==12.1.105
-      - nvidia-cuda-runtime-cu12==12.1.105
-      - nvidia-cudnn-cu12==8.9.2.26
-      - nvidia-cufft-cu12==11.0.2.54
-      - nvidia-curand-cu12==10.3.2.106
-      - nvidia-cusolver-cu12==11.4.5.107
-      - nvidia-cusparse-cu12==12.1.0.106
-      - nvidia-nccl-cu12==2.18.1
-      - nvidia-nvjitlink-cu12==12.2.140
-      - nvidia-nvtx-cu12==12.1.105
-      - optimum==1.17.1
-      - packaging==23.2
-      - pandas
-      - peft==0.5.0
-      - protobuf==4.24.4
-      - psutil==5.9.5
-      - pyarrow==13.0.0
-      - python-dateutil==2.8.2
-      - pytz==2023.3.post1
-      - pyyaml==6.0.1
-      - regex==2023.10.3
-      - requests==2.31.0
-      - rouge==1.0.1
-      - s3transfer==0.7.0
-      - safetensors>=0.4.1
-      - scipy==1.12.0 # Updated Version
-      - six==1.16.0
-      - sympy==1.12
-      - tokenizers
-      - torch==2.1.2
-      - torchvision==0.16.2
-      - torchaudio==2.1.2
-      - tqdm==4.66.1
-      - triton==2.1.0
-      - typing-extensions==4.8.0
-      - tzdata==2023.3
-      - urllib3==1.26.17
-      - xxhash==3.4.1
-      - yarl==1.9.2
-      - soundfile
-      - langid
-      - wget
-      - unidecode
-      - pyopenjtalk-prebuilt
-      - pypinyin
-      - inflect
-      - cn2an
-      - jieba
-      - eng_to_ipa
-      - openai-whisper
-      - matplotlib
-      - gradio==3.41.2
-      - nltk
-      - sudachipy
-      - sudachidict_core
-      - vocos
-      - vllm>=0.4.0
-      - transformers>=4.38.2  # Updated Version
-      - transformers_stream_generator==0.0.5
-      - xformers==0.0.23.post1  
-      - rerankers[transformers]
-      - pydantic
-prefix: /opt/conda/envs/transformers
--- a/backend/python/common-env/transformers/transformers-rocm.yml
+++ b/backend/python/common-env/transformers/transformers-rocm.yml
@@ -1,113 +0,0 @@
-name: transformers
-channels:
-  - defaults
-dependencies:
-  - _libgcc_mutex=0.1=main
-  - _openmp_mutex=5.1=1_gnu
-  - bzip2=1.0.8=h7b6447c_0
-  - ca-certificates=2023.08.22=h06a4308_0
-  - ld_impl_linux-64=2.38=h1181459_1
-  - libffi=3.4.4=h6a678d5_0
-  - libgcc-ng=11.2.0=h1234567_1
-  - libgomp=11.2.0=h1234567_1
-  - libstdcxx-ng=11.2.0=h1234567_1
-  - libuuid=1.41.5=h5eee18b_0
-  - ncurses=6.4=h6a678d5_0
-  - openssl=3.0.11=h7f8727e_2
-  - pip=23.2.1=py311h06a4308_0
-  - python=3.11.5=h955ad1f_0
-  - readline=8.2=h5eee18b_0
-  - setuptools=68.0.0=py311h06a4308_0
-  - sqlite=3.41.2=h5eee18b_0
-  - tk=8.6.12=h1ccaba5_0
-  - wheel=0.41.2=py311h06a4308_0
-  - xz=5.4.2=h5eee18b_0
-  - zlib=1.2.13=h5eee18b_0
-  - pip:
-      - --pre
-      - --extra-index-url https://download.pytorch.org/whl/nightly/
-      - accelerate==0.27.0
-      - auto-gptq==0.7.1
-      - aiohttp==3.8.5
-      - aiosignal==1.3.1
-      - async-timeout==4.0.3
-      - attrs==23.1.0
-      - bark==0.1.5
-      - boto3==1.28.61
-      - botocore==1.31.61
-      - certifi==2023.7.22
-      - TTS==0.22.0
-      - charset-normalizer==3.3.0
-      - datasets==2.14.5
-      - sentence-transformers==2.5.1 # Updated Version
-      - sentencepiece==0.1.99
-      - dill==0.3.7
-      - einops==0.7.0
-      - encodec==0.1.1
-      - filelock==3.12.4
-      - frozenlist==1.4.0
-      - fsspec==2023.6.0
-      - funcy==2.0
-      - grpcio==1.63.0
-      - huggingface-hub
-      - idna==3.4
-      - jinja2==3.1.2
-      - jmespath==1.0.1
-      - markupsafe==2.1.3
-      - mpmath==1.3.0
-      - multidict==6.0.4
-      - multiprocess==0.70.15
-      - networkx
-      - numpy==1.26.0
-      - packaging==23.2
-      - pandas
-      - peft==0.5.0
-      - protobuf==4.24.4
-      - psutil==5.9.5
-      - pyarrow==13.0.0
-      - python-dateutil==2.8.2
-      - pytz==2023.3.post1
-      - pyyaml==6.0.1
-      - regex==2023.10.3
-      - requests==2.31.0
-      - rouge==1.0.1
-      - s3transfer==0.7.0
-      - safetensors>=0.4.1
-      - scipy==1.12.0 # Updated Version
-      - six==1.16.0
-      - sympy==1.12
-      - tokenizers
-      - torch
-      - torchaudio
-      - tqdm==4.66.1
-      - triton==2.1.0
-      - typing-extensions==4.8.0
-      - tzdata==2023.3
-      - urllib3==1.26.17
-      - xxhash==3.4.1
-      - yarl==1.9.2
-      - soundfile
-      - langid
-      - wget
-      - unidecode
-      - optimum==1.17.1
-      - pyopenjtalk-prebuilt
-      - pypinyin
-      - inflect
-      - cn2an
-      - jieba
-      - eng_to_ipa
-      - openai-whisper
-      - matplotlib
-      - gradio==3.41.2
-      - nltk
-      - sudachipy
-      - sudachidict_core
-      - vocos
-      - vllm>=0.4.0
-      - transformers>=4.38.2  # Updated Version
-      - transformers_stream_generator==0.0.5
-      - xformers==0.0.23.post1
-      - rerankers[transformers]
-      - pydantic
-prefix: /opt/conda/envs/transformers
--- a/backend/python/common-env/transformers/transformers.yml
+++ b/backend/python/common-env/transformers/transformers.yml
@@ -1,118 +0,0 @@
-name: transformers
-channels:
-  - defaults
-dependencies:
-  - _libgcc_mutex=0.1=main
-  - _openmp_mutex=5.1=1_gnu
-  - bzip2=1.0.8=h7b6447c_0
-  - ca-certificates=2023.08.22=h06a4308_0
-  - ld_impl_linux-64=2.38=h1181459_1
-  - libffi=3.4.4=h6a678d5_0
-  - libgcc-ng=11.2.0=h1234567_1
-  - libgomp=11.2.0=h1234567_1
-  - libstdcxx-ng=11.2.0=h1234567_1
-  - libuuid=1.41.5=h5eee18b_0
-  - ncurses=6.4=h6a678d5_0
-  - openssl=3.0.11=h7f8727e_2
-  - pip=23.2.1=py311h06a4308_0
-  - python=3.11.5=h955ad1f_0
-  - readline=8.2=h5eee18b_0
-  - setuptools=68.0.0=py311h06a4308_0
-  - sqlite=3.41.2=h5eee18b_0
-  - tk=8.6.12=h1ccaba5_0
-  - wheel=0.41.2=py311h06a4308_0
-  - xz=5.4.2=h5eee18b_0
-  - zlib=1.2.13=h5eee18b_0
-  - pip:
-      - accelerate==0.27.0
-      - aiohttp==3.8.5
-      - aiosignal==1.3.1
-      - auto-gptq==0.7.1
-      - async-timeout==4.0.3
-      - attrs==23.1.0
-      - bark==0.1.5
-      - boto3==1.28.61
-      - botocore==1.31.61
-      - certifi==2023.7.22
-      - coloredlogs==15.0.1
-      - TTS==0.22.0
-      - charset-normalizer==3.3.0
-      - datasets==2.14.5
-      - sentence-transformers==2.5.1 # Updated Version
-      - sentencepiece==0.1.99
-      - dill==0.3.7
-      - einops==0.7.0
-      - encodec==0.1.1
-      - filelock==3.12.4
-      - frozenlist==1.4.0
-      - fsspec==2023.6.0
-      - funcy==2.0
-      - grpcio==1.63.0
-      - huggingface-hub
-      - humanfriendly==10.0
-      - idna==3.4
-      - jinja2==3.1.2
-      - jmespath==1.0.1
-      - markupsafe==2.1.3
-      - mpmath==1.3.0
-      - multidict==6.0.4
-      - multiprocess==0.70.15
-      - networkx
-      - numpy==1.26.0
-      - onnx==1.15.0
-      - openvino==2024.1.0
-      - openvino-telemetry==2024.1.0
-      - optimum[openvino]==1.19.1
-      - optimum-intel==1.16.1
-      - packaging==23.2
-      - pandas
-      - peft==0.5.0
-      - protobuf==4.24.4
-      - psutil==5.9.5
-      - pyarrow==13.0.0
-      - python-dateutil==2.8.2
-      - pytz==2023.3.post1
-      - pyyaml==6.0.1
-      - regex==2023.10.3
-      - requests==2.31.0
-      - rouge==1.0.1
-      - s3transfer==0.7.0
-      - safetensors>=0.4.1
-      - scipy==1.12.0 # Updated Version
-      - six==1.16.0
-      - sympy==1.12
-      - tokenizers
-      - torch==2.1.2
-      - torchvision==0.16.2
-      - torchaudio==2.1.2
-      - tqdm==4.66.1
-      - triton==2.1.0
-      - typing-extensions==4.8.0
-      - tzdata==2023.3
-      - urllib3==1.26.17
-      - xxhash==3.4.1
-      - yarl==1.9.2
-      - soundfile
-      - langid
-      - wget
-      - unidecode
-      - pyopenjtalk-prebuilt
-      - pypinyin
-      - inflect
-      - cn2an
-      - jieba
-      - eng_to_ipa
-      - openai-whisper
-      - matplotlib
-      - gradio==3.41.2
-      - nltk
-      - sudachipy
-      - sudachidict_core
-      - vocos
-      - vllm>=0.4.0
-      - transformers>=4.38.2  # Updated Version
-      - transformers_stream_generator==0.0.5
-      - xformers==0.0.23.post1
-      - rerankers[transformers]
-      - pydantic
-prefix: /opt/conda/envs/transformers
--- a/backend/python/common/libbackend.sh
+++ b/backend/python/common/libbackend.sh
@@ -0,0 +1,213 @@
+
+
+# init handles the setup of the library
+# 
+# use the library by adding the following line to a script:
+# source $(dirname $0)/../common/libbackend.sh
+#
+# If you want to limit what targets a backend can be used on, set the variable LIMIT_TARGETS to a
+# space separated list of valid targets BEFORE sourcing the library, for example to only allow a backend
+# to be used on CUDA and CPU backends:
+#
+# LIMIT_TARGETS="cublas cpu"
+# source $(dirname $0)/../common/libbackend.sh
+#
+# You can use any valid BUILD_TYPE or BUILD_PROFILE, if you need to limit a backend to CUDA 12 only:
+#
+# LIMIT_TARGETS="cublas12"
+# source $(dirname $0)/../common/libbackend.sh
+#
+function init() {
+    BACKEND_NAME=${PWD##*/}
+    MY_DIR=$(realpath `dirname $0`)
+    BUILD_PROFILE=$(getBuildProfile)
+
+    # If a backend has defined a list of valid build profiles...
+    if [ ! -z "${LIMIT_TARGETS}" ]; then
+        isValidTarget=$(checkTargets ${LIMIT_TARGETS})
+        if [ ${isValidTarget} != true ]; then
+            echo "${BACKEND_NAME} can only be used on the following targets: ${LIMIT_TARGETS}"
+            exit 0
+        fi
+    fi
+
+    echo "Initializing libbackend for ${BACKEND_NAME}"
+}
+
+# getBuildProfile will inspect the system to determine which build profile is appropriate:
+# returns one of the following:
+# - cublas11
+# - cublas12
+# - hipblas
+# - intel
+function getBuildProfile() {
+    # First check if we are a cublas build, and if so report the correct build profile
+    if [ x"${BUILD_TYPE}" == "xcublas" ]; then
+        if [ ! -z ${CUDA_MAJOR_VERSION} ]; then
+            # If we have been given a CUDA version, we trust it
+            echo ${BUILD_TYPE}${CUDA_MAJOR_VERSION}
+        else
+            # We don't know what version of cuda we are, so we report ourselves as a generic cublas
+            echo ${BUILD_TYPE}
+        fi
+        return 0
+    fi
+
+    # If /opt/intel exists, then we are doing an intel/ARC build
+    if [ -d "/opt/intel" ]; then
+        echo "intel"
+        return 0
+    fi
+
+    # If for any other values of BUILD_TYPE, we don't need any special handling/discovery
+    if [ ! -z ${BUILD_TYPE} ]; then
+        echo ${BUILD_TYPE}
+        return 0
+    fi
+
+    # If there is no BUILD_TYPE set at all, set a build-profile value of CPU, we aren't building for any GPU targets
+    echo "cpu"
+}
+
+# ensureVenv makes sure that the venv for the backend both exists, and is activated.
+#
+# This function is idempotent, so you can call it as many times as you want and it will
+# always result in an activated virtual environment
+function ensureVenv() {
+    if [ ! -d "${MY_DIR}/venv" ]; then
+        uv venv ${MY_DIR}/venv
+        echo "virtualenv created"
+    fi
+    
+    if [ "x${VIRTUAL_ENV}" != "x${MY_DIR}/venv" ]; then
+        source ${MY_DIR}/venv/bin/activate
+        echo "virtualenv activated"
+    fi
+
+    echo "activated virtualenv has been ensured"
+}
+
+# installRequirements looks for several requirements files and if they exist runs the install for them in order
+#
+#  - requirements-install.txt
+#  - requirements.txt
+#  - requirements-${BUILD_TYPE}.txt
+#  - requirements-${BUILD_PROFILE}.txt
+#
+# BUILD_PROFILE is a pore specific version of BUILD_TYPE, ex: cuda11 or cuda12
+# it can also include some options that we do not have BUILD_TYPES for, ex: intel
+#
+# NOTE: for BUILD_PROFILE==intel, this function does NOT automatically use the Intel python package index.
+# you may want to add the following line to a requirements-intel.txt if you use one:
+#
+# --index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+#
+# If you need to add extra flags into the pip install command you can do so by setting the variable EXTRA_PIP_INSTALL_FLAGS
+# before calling installRequirements.  For example:
+#
+# source $(dirname $0)/../common/libbackend.sh
+# EXTRA_PIP_INSTALL_FLAGS="--no-build-isolation"
+# installRequirements
+function installRequirements() {
+    ensureVenv
+
+    # These are the requirements files we will attempt to install, in order
+    declare -a requirementFiles=(
+        "${MY_DIR}/requirements-install.txt"
+        "${MY_DIR}/requirements.txt"
+        "${MY_DIR}/requirements-${BUILD_TYPE}.txt"
+    )
+
+    if [ "x${BUILD_TYPE}" != "x${BUILD_PROFILE}" ]; then
+        requirementFiles+=("${MY_DIR}/requirements-${BUILD_PROFILE}.txt")
+    fi
+
+    for reqFile in ${requirementFiles[@]}; do
+        if [ -f ${reqFile} ]; then
+            echo "starting requirements install for ${reqFile}"
+            uv pip install ${EXTRA_PIP_INSTALL_FLAGS} --requirement ${reqFile}
+            echo "finished requirements install for ${reqFile}"
+        fi
+    done
+}
+
+# startBackend discovers and runs the backend GRPC server
+#
+# You can specify a specific backend file to execute by setting BACKEND_FILE before calling startBackend.
+# example:
+#
+# source ../common/libbackend.sh
+# BACKEND_FILE="${MY_DIR}/source/backend.py"
+# startBackend $@
+#
+# valid filenames for autodiscovered backend servers are:
+#  - server.py
+#  - backend.py
+#  - ${BACKEND_NAME}.py
+function startBackend() {
+    ensureVenv
+
+    if [ ! -z ${BACKEND_FILE} ]; then
+        python ${BACKEND_FILE} $@
+    elif [ -e "${MY_DIR}/server.py" ]; then
+        python ${MY_DIR}/server.py $@
+    elif [ -e "${MY_DIR}/backend.py" ]; then
+        python ${MY_DIR}/backend.py $@
+    elif [ -e "${MY_DIR}/${BACKEND_NAME}.py" ]; then
+        python ${MY_DIR}/${BACKEND_NAME}.py $@
+    fi
+}
+
+# runUnittests discovers and runs python unittests
+#
+# You can specify a specific test file to use by setting TEST_FILE before calling runUnittests.
+# example:
+#
+# source ../common/libbackend.sh
+# TEST_FILE="${MY_DIR}/source/test.py"
+# runUnittests $@
+#
+# be default a file named test.py in the backends directory will be used
+function runUnittests() {
+    ensureVenv
+
+    if [ ! -z ${TEST_FILE} ]; then
+        testDir=$(dirname `realpath ${TEST_FILE}`)
+        testFile=$(basename ${TEST_FILE})
+        pushd ${testDir}
+        python -m unittest ${testFile}
+        popd
+    elif [ -f "${MY_DIR}/test.py" ]; then
+        pushd ${MY_DIR}
+        python -m unittest test.py
+        popd
+    else
+        echo "no tests defined for ${BACKEND_NAME}"
+    fi
+}
+
+##################################################################################
+# Below here are helper functions not intended to be used outside of the library #
+##################################################################################
+
+# checkTargets determines if the current BUILD_TYPE or BUILD_PROFILE is in a list of valid targets
+function checkTargets() {
+    # Collect all provided targets into a variable and...
+    targets=$@
+    # ...convert it into an array
+    declare -a targets=($targets)
+
+    for target in ${targets[@]}; do
+        if [ "x${BUILD_TYPE}" == "x${target}" ]; then
+            echo true
+            return 0
+        fi
+        if [ "x${BUILD_PROFILE}" == "x${target}" ]; then
+            echo true
+            return 0
+        fi
+    done
+    echo false
+}
+
+init
--- a/backend/python/common/template/Makefile
+++ b/backend/python/common/template/Makefile
@@ -0,0 +1,19 @@
+.DEFAULT_GOAL := install
+
+.PHONY: install
+install: protogen
+	bash install.sh
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
+
+.PHONY: clean
+clean: protogen-clean
+	rm -rf venv __pycache__
--- a/backend/python/common/template/backend.py
+++ b/backend/python/common/template/backend.py
@@ -0,0 +1,4 @@
+#!/usr/bin/env python3
+import grpc
+import backend_pb2
+import backend_pb2_grpc
--- a/backend/python/common/template/install.sh
+++ b/backend/python/common/template/install.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -e
+
+source $(dirname $0)/../common/libbackend.sh
+
+# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
+# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
+# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
+# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
+if [ "x${BUILD_PROFILE}" == "xintel" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
+fi
+
+installRequirements
--- a/backend/python/common/template/requirements-intel.txt
+++ b/backend/python/common/template/requirements-intel.txt
@@ -0,0 +1,4 @@
+--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+intel-extension-for-pytorch
+torch
+optimum[openvino]
--- a/backend/python/common/template/requirements.txt
+++ b/backend/python/common/template/requirements.txt
@@ -0,0 +1,2 @@
+grpcio==1.64.0
+protobuf
--- a/backend/python/common/template/run.sh
+++ b/backend/python/common/template/run.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+source $(dirname $0)/../common/libbackend.sh
+
+startBackend $@
--- a/backend/python/common/template/test.sh
+++ b/backend/python/common/template/test.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+set -e
+
+source $(dirname $0)/../common/libbackend.sh
+
+runUnittests
--- a/backend/python/coqui/Makefile
+++ b/backend/python/coqui/Makefile
@@ -1,6 +1,6 @@
 .PHONY: coqui
 coqui: protogen
-	$(MAKE) -C ../common-env/transformers
+	bash install.sh

 .PHONY: run
 run: protogen
@@ -22,4 +22,8 @@ protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py

 backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
+
+.PHONY: clean
+clean: protogen-clean
+	rm -rf venv __pycache__
--- a/backend/python/coqui/coqui_server.py
+++ b/backend/python/coqui/coqui_server.py
--- a/backend/python/coqui/install.sh
+++ b/backend/python/coqui/install.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -e
+
+source $(dirname $0)/../common/libbackend.sh
+
+# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
+# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
+# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
+# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
+if [ "x${BUILD_PROFILE}" == "xintel" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
+fi
+
+installRequirements
--- a/backend/python/coqui/requirements-intel.txt
+++ b/backend/python/coqui/requirements-intel.txt
@@ -0,0 +1,6 @@
+--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+intel-extension-for-pytorch
+torch
+torchaudio
+optimum[openvino]
+setuptools
--- a/backend/python/coqui/requirements.txt
+++ b/backend/python/coqui/requirements.txt
@@ -0,0 +1,6 @@
+accelerate
+TTS==0.22.0
+grpcio==1.64.0
+protobuf
+certifi
+transformers
--- a/backend/python/coqui/run.sh
+++ b/backend/python/coqui/run.sh
@@ -1,14 +1,4 @@
 #!/bin/bash
+source $(dirname $0)/../common/libbackend.sh

-##
-## A bash script wrapper that runs the ttsbark server with conda
-
-export PATH=$PATH:/opt/conda/bin
-
-# Activate conda environment
-source activate transformers
-
-# get the directory where the bash script is located
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-python $DIR/coqui_server.py $@
+startBackend $@
--- a/backend/python/coqui/test.py
+++ b/backend/python/coqui/test.py
@@ -18,7 +18,7 @@ class TestBackendServicer(unittest.TestCase):
        """
        This method sets up the gRPC service by starting the server
        """
-        self.service = subprocess.Popen(["python3", "coqui_server.py", "--addr", "localhost:50051"])
+        self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
        time.sleep(10)

    def tearDown(self) -> None:
--- a/backend/python/coqui/test.sh
+++ b/backend/python/coqui/test.sh
@@ -1,11 +1,6 @@
 #!/bin/bash
-##
-## A bash script wrapper that runs the bark server with conda
+set -e

-# Activate conda environment
-source activate transformers
+source $(dirname $0)/../common/libbackend.sh

-# get the directory where the bash script is located
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-python -m unittest $DIR/test.py
+runUnittests
--- a/backend/python/diffusers/Makefile
+++ b/backend/python/diffusers/Makefile
@@ -13,8 +13,7 @@ endif

 .PHONY: diffusers
 diffusers: protogen
-	@echo "Installing $(CONDA_ENV_PATH)..."
-	bash install.sh $(CONDA_ENV_PATH)
+	bash install.sh

 .PHONY: run
 run: protogen
@@ -33,4 +32,8 @@ protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py

 backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
+
+.PHONY: clean
+clean: protogen-clean
+	rm -rf venv __pycache__
--- a/backend/python/diffusers/backend_diffusers.py
+++ b/backend/python/diffusers/backend_diffusers.py
--- a/backend/python/diffusers/diffusers-rocm.yml
+++ b/backend/python/diffusers/diffusers-rocm.yml
@@ -1,65 +0,0 @@
-name: diffusers
-channels:
-  - defaults
-dependencies:
-  - _libgcc_mutex=0.1=main
-  - _openmp_mutex=5.1=1_gnu
-  - bzip2=1.0.8=h7b6447c_0
-  - ca-certificates=2023.08.22=h06a4308_0
-  - ld_impl_linux-64=2.38=h1181459_1
-  - libffi=3.4.4=h6a678d5_0
-  - libgcc-ng=11.2.0=h1234567_1
-  - libgomp=11.2.0=h1234567_1
-  - libstdcxx-ng=11.2.0=h1234567_1
-  - libuuid=1.41.5=h5eee18b_0
-  - ncurses=6.4=h6a678d5_0
-  - openssl=3.0.11=h7f8727e_2
-  - pip=23.2.1=py311h06a4308_0
-  - python=3.11.5=h955ad1f_0
-  - readline=8.2=h5eee18b_0
-  - setuptools=68.0.0=py311h06a4308_0
-  - sqlite=3.41.2=h5eee18b_0
-  - tk=8.6.12=h1ccaba5_0
-  - tzdata=2023c=h04d1e81_0
-  - wheel=0.41.2=py311h06a4308_0
-  - xz=5.4.2=h5eee18b_0
-  - zlib=1.2.13=h5eee18b_0
-  - pip:
-      - --pre
-      - --extra-index-url https://download.pytorch.org/whl/nightly/
-      - accelerate>=0.11.0
-      - certifi==2023.7.22
-      - charset-normalizer==3.3.0
-      - compel==2.0.2
-      - diffusers==0.24.0
-      - filelock==3.12.4
-      - fsspec==2023.9.2
-      - grpcio==1.63.0
-      - huggingface-hub>=0.19.4
-      - idna==3.4
-      - importlib-metadata==6.8.0
-      - jinja2==3.1.2
-      - markupsafe==2.1.3
-      - mpmath==1.3.0
-      - networkx==3.1
-      - numpy==1.26.0
-      - omegaconf
-      - packaging==23.2
-      - pillow==10.0.1
-      - protobuf==4.24.4
-      - psutil==5.9.5
-      - pyparsing==3.1.1
-      - pyyaml==6.0.1
-      - regex==2023.10.3
-      - requests==2.31.0
-      - safetensors==0.4.0
-      - sympy==1.12
-      - tqdm==4.66.1
-      - transformers>=4.25.1
-      - triton==2.1.0
-      - typing-extensions==4.8.0
-      - urllib3==2.0.6
-      - zipp==3.17.0
-      - torch
-      - opencv-python
-prefix: /opt/conda/envs/diffusers
--- a/backend/python/diffusers/diffusers.yml
+++ b/backend/python/diffusers/diffusers.yml
@@ -1,75 +0,0 @@
-name: diffusers
-channels:
-  - defaults
-dependencies:
-  - _libgcc_mutex=0.1=main
-  - _openmp_mutex=5.1=1_gnu
-  - bzip2=1.0.8=h7b6447c_0
-  - ca-certificates=2023.08.22=h06a4308_0
-  - ld_impl_linux-64=2.38=h1181459_1
-  - libffi=3.4.4=h6a678d5_0
-  - libgcc-ng=11.2.0=h1234567_1
-  - libgomp=11.2.0=h1234567_1
-  - libstdcxx-ng=11.2.0=h1234567_1
-  - libuuid=1.41.5=h5eee18b_0
-  - ncurses=6.4=h6a678d5_0
-  - openssl=3.0.11=h7f8727e_2
-  - pip=23.2.1=py311h06a4308_0
-  - python=3.11.5=h955ad1f_0
-  - readline=8.2=h5eee18b_0
-  - setuptools=68.0.0=py311h06a4308_0
-  - sqlite=3.41.2=h5eee18b_0
-  - tk=8.6.12=h1ccaba5_0
-  - tzdata=2023c=h04d1e81_0
-  - wheel=0.41.2=py311h06a4308_0
-  - xz=5.4.2=h5eee18b_0
-  - zlib=1.2.13=h5eee18b_0
-  - pip:
-      - accelerate>=0.11.0
-      - certifi==2023.7.22
-      - charset-normalizer==3.3.0
-      - compel==2.0.2
-      - diffusers==0.24.0
-      - filelock==3.12.4
-      - fsspec==2023.9.2
-      - grpcio==1.63.0
-      - huggingface-hub>=0.19.4
-      - idna==3.4
-      - importlib-metadata==6.8.0
-      - jinja2==3.1.2
-      - markupsafe==2.1.3
-      - mpmath==1.3.0
-      - networkx==3.1
-      - numpy==1.26.0
-      - nvidia-cublas-cu12==12.1.3.1
-      - nvidia-cuda-cupti-cu12==12.1.105
-      - nvidia-cuda-nvrtc-cu12==12.1.105
-      - nvidia-cuda-runtime-cu12==12.1.105
-      - nvidia-cudnn-cu12==8.9.2.26
-      - nvidia-cufft-cu12==11.0.2.54
-      - nvidia-curand-cu12==10.3.2.106
-      - nvidia-cusolver-cu12==11.4.5.107
-      - nvidia-cusparse-cu12==12.1.0.106
-      - nvidia-nccl-cu12==2.18.1
-      - nvidia-nvjitlink-cu12==12.2.140
-      - nvidia-nvtx-cu12==12.1.105
-      - omegaconf
-      - packaging==23.2
-      - pillow==10.0.1
-      - protobuf==4.24.4
-      - psutil==5.9.5
-      - pyparsing==3.1.1
-      - pyyaml==6.0.1
-      - regex==2023.10.3
-      - requests==2.31.0
-      - safetensors==0.4.0
-      - sympy==1.12
-      - torch==2.1.0
-      - tqdm==4.66.1
-      - transformers>=4.25.1
-      - triton==2.1.0
-      - typing-extensions==4.8.0
-      - urllib3==2.0.6
-      - zipp==3.17.0
-      - opencv-python
-prefix: /opt/conda/envs/diffusers
--- a/backend/python/diffusers/install.sh
+++ b/backend/python/diffusers/install.sh
@@ -1,50 +1,14 @@
 #!/bin/bash
-set -ex
+set -e

-SKIP_CONDA=${SKIP_CONDA:-0}
+source $(dirname $0)/../common/libbackend.sh

-# Check if environment exist
-conda_env_exists(){
-    ! conda list --name "${@}" >/dev/null 2>/dev/null
-}
-
-if [ $SKIP_CONDA -eq 1 ]; then
-    echo "Skipping conda environment installation"
-else
-    export PATH=$PATH:/opt/conda/bin
-    if conda_env_exists "diffusers" ; then
-        echo "Creating virtual environment..."
-        conda env create --name diffusers --file $1
-        echo "Virtual environment created."
-    else 
-        echo "Virtual environment already exists."
-    fi
+# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
+# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
+# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
+# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
+if [ "x${BUILD_PROFILE}" == "xintel" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi

-if [ -d "/opt/intel" ]; then
-    # Intel GPU: If the directory exists, we assume we are using the Intel image
-    # https://github.com/intel/intel-extension-for-pytorch/issues/538
-    pip install torch==2.1.0a0 \
-                torchvision==0.16.0a0 \
-                torchaudio==2.1.0a0 \
-                intel-extension-for-pytorch==2.1.10+xpu \
-                --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-    
-    pip install google-api-python-client \
-                grpcio==1.63.0 \
-                grpcio-tools==1.63.0 \
-                diffusers==0.24.0 \
-                transformers>=4.25.1 \
-                accelerate \
-                compel==2.0.2 \
-                Pillow
-fi
-
-if [ "$PIP_CACHE_PURGE" = true ] ; then
-    if [ $SKIP_CONDA -ne 1 ]; then
-        # Activate conda environment
-        source activate diffusers
-    fi
-
-    pip cache purge
-fi
+installRequirements
--- a/backend/python/diffusers/requirements-intel.txt
+++ b/backend/python/diffusers/requirements-intel.txt
@@ -0,0 +1,6 @@
+--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+intel-extension-for-pytorch
+torch
+torchvision
+optimum[openvino]
+setuptools
--- a/backend/python/diffusers/requirements.txt
+++ b/backend/python/diffusers/requirements.txt
@@ -0,0 +1,10 @@
+accelerate
+compel
+diffusers
+grpcio==1.64.0
+opencv-python
+pillow
+protobuf
+torch
+transformers
+certifi
--- a/backend/python/diffusers/run.sh
+++ b/backend/python/diffusers/run.sh
@@ -1,19 +1,4 @@
 #!/bin/bash
+source $(dirname $0)/../common/libbackend.sh

-##
-## A bash script wrapper that runs the diffusers server with conda
-
-if [ -d "/opt/intel" ]; then
-    # Assumes we are using the Intel oneAPI container image
-    # https://github.com/intel/intel-extension-for-pytorch/issues/538
-    export XPU=1
-else
-    export PATH=$PATH:/opt/conda/bin
-    # Activate conda environment
-    source activate diffusers
-fi
-
-# get the directory where the bash script is located
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-python $DIR/backend_diffusers.py $@
+startBackend $@
--- a/backend/python/diffusers/test.py
+++ b/backend/python/diffusers/test.py
@@ -18,7 +18,7 @@ class TestBackendServicer(unittest.TestCase):
        """
        This method sets up the gRPC service by starting the server
        """
-        self.service = subprocess.Popen(["python3", "backend_diffusers.py", "--addr", "localhost:50051"])
+        self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])

    def tearDown(self) -> None:
        """
--- a/backend/python/diffusers/test.sh
+++ b/backend/python/diffusers/test.sh
@@ -1,14 +1,6 @@
 #!/bin/bash
+set -e

-##
-## A bash script wrapper that runs the diffusers server with conda
+source $(dirname $0)/../common/libbackend.sh

-export PATH=$PATH:/opt/conda/bin
-
-# Activate conda environment
-source activate diffusers
-
-# get the directory where the bash script is located
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-python -m unittest $DIR/test.py
+runUnittests
--- a/backend/python/exllama/.gitignore
+++ b/backend/python/exllama/.gitignore
@@ -0,0 +1 @@
+source
--- a/backend/python/exllama/Makefile
+++ b/backend/python/exllama/Makefile
@@ -18,4 +18,8 @@ protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py

 backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
+
+.PHONY: clean
+clean: protogen-clean
+	$(RM) -r venv source __pycache__
--- a/backend/python/exllama/backend.py
+++ b/backend/python/exllama/backend.py
@@ -14,9 +14,9 @@ import torch
 import torch.nn.functional as F
 from torch import version as torch_version

-from tokenizer import ExLlamaTokenizer
-from generator import ExLlamaGenerator
-from model import ExLlama, ExLlamaCache, ExLlamaConfig
+from source.tokenizer import ExLlamaTokenizer
+from source.generator import ExLlamaGenerator
+from source.model import ExLlama, ExLlamaCache, ExLlamaConfig

 _ONE_DAY_IN_SECONDS = 60 * 60 * 24

--- a/backend/python/exllama/exllama.yml
+++ b/backend/python/exllama/exllama.yml
@@ -1,56 +0,0 @@
-name: exllama
-channels:
-  - defaults
-dependencies:
-  - _libgcc_mutex=0.1=main
-  - _openmp_mutex=5.1=1_gnu
-  - bzip2=1.0.8=h7b6447c_0
-  - ca-certificates=2023.08.22=h06a4308_0
-  - ld_impl_linux-64=2.38=h1181459_1
-  - libffi=3.4.4=h6a678d5_0
-  - libgcc-ng=11.2.0=h1234567_1
-  - libgomp=11.2.0=h1234567_1
-  - libstdcxx-ng=11.2.0=h1234567_1
-  - libuuid=1.41.5=h5eee18b_0
-  - ncurses=6.4=h6a678d5_0
-  - openssl=3.0.11=h7f8727e_2
-  - pip=23.2.1=py311h06a4308_0
-  - python=3.11.5=h955ad1f_0
-  - readline=8.2=h5eee18b_0
-  - setuptools=68.0.0=py311h06a4308_0
-  - sqlite=3.41.2=h5eee18b_0
-  - tk=8.6.12=h1ccaba5_0
-  - tzdata=2023c=h04d1e81_0
-  - wheel=0.41.2=py311h06a4308_0
-  - xz=5.4.2=h5eee18b_0
-  - zlib=1.2.13=h5eee18b_0
-  - pip:
-      - filelock==3.12.4
-      - fsspec==2023.9.2
-      - grpcio==1.63.0
-      - jinja2==3.1.2
-      - markupsafe==2.1.3
-      - mpmath==1.3.0
-      - networkx==3.1
-      - ninja==1.11.1
-      - protobuf==4.24.4
-      - nvidia-cublas-cu12==12.1.3.1
-      - nvidia-cuda-cupti-cu12==12.1.105
-      - nvidia-cuda-nvrtc-cu12==12.1.105
-      - nvidia-cuda-runtime-cu12==12.1.105
-      - nvidia-cudnn-cu12==8.9.2.26
-      - nvidia-cufft-cu12==11.0.2.54
-      - nvidia-curand-cu12==10.3.2.106
-      - nvidia-cusolver-cu12==11.4.5.107
-      - nvidia-cusparse-cu12==12.1.0.106
-      - nvidia-nccl-cu12==2.18.1
-      - nvidia-nvjitlink-cu12==12.2.140
-      - nvidia-nvtx-cu12==12.1.105
-      - safetensors==0.3.2
-      - sentencepiece==0.1.99
-      - sympy==1.12
-      - torch==2.1.0
-      - triton==2.1.0
-      - typing-extensions==4.8.0
-      - numpy
-prefix: /opt/conda/envs/exllama
--- a/backend/python/exllama/install.sh
+++ b/backend/python/exllama/install.sh
@@ -1,32 +1,13 @@
 #!/bin/bash
-set -ex
+set -e

-export PATH=$PATH:/opt/conda/bin
+LIMIT_TARGETS="cublas"

-if [ "$BUILD_TYPE" != "cublas" ]; then
-    echo "[exllama] Attention!!! Nvidia GPU is required - skipping installation"
-    exit 0
-fi
+source $(dirname $0)/../common/libbackend.sh

-# Check if environment exist
-conda_env_exists(){
-    ! conda list --name "${@}" >/dev/null 2>/dev/null
-}
+installRequirements

-if conda_env_exists "exllama" ; then
-    echo "Creating virtual environment..."
-    conda env create --name exllama --file $1
-    echo "Virtual environment created."
-else
-    echo "Virtual environment already exists."
-fi
+git clone https://github.com/turboderp/exllama $MY_DIR/source
+uv pip install ${BUILD_ISOLATION_FLAG} --requirement ${MY_DIR}/source/requirements.txt

-source activate exllama
-
-git clone https://github.com/turboderp/exllama $CONDA_PREFIX/exllama && pushd $CONDA_PREFIX/exllama && pip install -r requirements.txt && popd
-
-cp -rfv $CONDA_PREFIX/exllama/* ./
-
-if [ "$PIP_CACHE_PURGE" = true ] ; then
-    pip cache purge
-fi
+cp -v ./*py $MY_DIR/source/
--- a/backend/python/exllama/requirements.txt
+++ b/backend/python/exllama/requirements.txt
@@ -0,0 +1,6 @@
+grpcio==1.64.0
+protobuf
+torch
+transformers
+certifi
+setuptools
--- a/backend/python/exllama/run.sh
+++ b/backend/python/exllama/run.sh
@@ -1,15 +1,7 @@
 #!/bin/bash
+LIMIT_TARGETS="cublas"
+BACKEND_FILE="${MY_DIR}/source/backend.py"

-##
-## A bash script wrapper that runs the exllama server with conda
-export PATH=$PATH:/opt/conda/bin
+source $(dirname $0)/../common/libbackend.sh

-# Activate conda environment
-source activate exllama
-
-# get the directory where the bash script is located
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-cd $DIR
-
-python $DIR/exllama.py $@
+startBackend $@
--- a/backend/python/exllama/test.sh
+++ b/backend/python/exllama/test.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+set -e
+
+source $(dirname $0)/../common/libbackend.sh
+
+runUnittests
--- a/backend/python/exllama2/.gitignore
+++ b/backend/python/exllama2/.gitignore
@@ -0,0 +1 @@
+source
--- a/backend/python/exllama2/Makefile
+++ b/backend/python/exllama2/Makefile
@@ -1,6 +1,5 @@
 .PHONY: exllama2
 exllama2: protogen
-	$(MAKE) -C ../common-env/transformers
 	bash install.sh

 .PHONY: run
@@ -17,4 +16,8 @@ protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py

 backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
+
+.PHONY: clean
+clean: protogen-clean
+	$(RM) -r venv source __pycache__
--- a/backend/python/exllama2/exllama2_backend.py
+++ b/backend/python/exllama2/exllama2_backend.py
--- a/backend/python/exllama2/exllama2.yml
+++ b/backend/python/exllama2/exllama2.yml
@@ -1,57 +0,0 @@
-name: exllama2
-channels:
-  - defaults
-dependencies:
-  - _libgcc_mutex=0.1=main
-  - _openmp_mutex=5.1=1_gnu
-  - bzip2=1.0.8=h7b6447c_0
-  - ca-certificates=2023.08.22=h06a4308_0
-  - ld_impl_linux-64=2.38=h1181459_1
-  - libffi=3.4.4=h6a678d5_0
-  - libgcc-ng=11.2.0=h1234567_1
-  - libgomp=11.2.0=h1234567_1
-  - libstdcxx-ng=11.2.0=h1234567_1
-  - libuuid=1.41.5=h5eee18b_0
-  - ncurses=6.4=h6a678d5_0
-  - openssl=3.0.11=h7f8727e_2
-  - pip=23.2.1=py311h06a4308_0
-  - python=3.11.5=h955ad1f_0
-  - readline=8.2=h5eee18b_0
-  - setuptools=68.0.0=py311h06a4308_0
-  - sqlite=3.41.2=h5eee18b_0
-  - tk=8.6.12=h1ccaba5_0
-  - tzdata=2023c=h04d1e81_0
-  - wheel=0.41.2=py311h06a4308_0
-  - xz=5.4.2=h5eee18b_0
-  - zlib=1.2.13=h5eee18b_0
-  - pip:
-      - filelock==3.12.4
-      - fsspec==2023.9.2
-      - grpcio==1.63.0
-      - markupsafe==2.1.3
-      - mpmath==1.3.0
-      - networkx==3.1
-      - protobuf==4.24.4
-      - nvidia-cublas-cu12==12.1.3.1
-      - nvidia-cuda-cupti-cu12==12.1.105
-      - nvidia-cuda-nvrtc-cu12==12.1.105
-      - nvidia-cuda-runtime-cu12==12.1.105
-      - nvidia-cudnn-cu12==8.9.2.26
-      - nvidia-cufft-cu12==11.0.2.54
-      - nvidia-curand-cu12==10.3.2.106
-      - nvidia-cusolver-cu12==11.4.5.107
-      - nvidia-cusparse-cu12==12.1.0.106
-      - nvidia-nccl-cu12==2.18.1
-      - nvidia-nvjitlink-cu12==12.2.140
-      - nvidia-nvtx-cu12==12.1.105
-      - pandas
-      - numpy
-      - ninja
-      - fastparquet
-      - torch>=2.1.0
-      - safetensors>=0.3.2
-      - sentencepiece>=0.1.97
-      - pygments
-      - websockets
-      - regex
-prefix: /opt/conda/envs/exllama2
--- a/backend/python/exllama2/install.sh
+++ b/backend/python/exllama2/install.sh
@@ -1,32 +1,16 @@
 #!/bin/bash
 set -e
-##
-## A bash script installs the required dependencies of VALL-E-X and prepares the environment
-export SHA=c0ddebaaaf8ffd1b3529c2bb654e650bce2f790f

-if [ "$BUILD_TYPE" != "cublas" ]; then
-    echo "[exllamav2] Attention!!! Nvidia GPU is required - skipping installation"
-    exit 0
-fi
+LIMIT_TARGETS="cublas"
+EXTRA_PIP_INSTALL_FLAGS="--no-build-isolation"
+EXLLAMA2_VERSION=c0ddebaaaf8ffd1b3529c2bb654e650bce2f790f

-export PATH=$PATH:/opt/conda/bin
-source activate transformers
+source $(dirname $0)/../common/libbackend.sh

-echo $CONDA_PREFIX
+installRequirements

-git clone https://github.com/turboderp/exllamav2 $CONDA_PREFIX/exllamav2
+git clone https://github.com/turboderp/exllamav2 $MY_DIR/source
+pushd ${MY_DIR}/source && git checkout -b build ${EXLLAMA2_VERSION} && popd

-pushd $CONDA_PREFIX/exllamav2
-
-git checkout -b build $SHA
-
-# TODO: this needs to be pinned within the conda environments
-pip install -r requirements.txt
-
-popd
-
-cp -rfv $CONDA_PREFIX/exllamav2/* ./  
-
-if [ "$PIP_CACHE_PURGE" = true ] ; then
-    pip cache purge
-fi
+# This installs exllamav2 in JIT mode so it will compile the appropriate torch extension at runtime
+EXLLAMA_NOCOMPILE= uv pip install ${EXTRA_PIP_INSTALL_FLAGS} ${MY_DIR}/source/
--- a/backend/python/exllama2/requirements-install.txt
+++ b/backend/python/exllama2/requirements-install.txt
@@ -0,0 +1,4 @@
+# This is here to trigger the install script to add --no-build-isolation to the uv pip install commands
+# exllama2 does not specify it's build requirements per PEP517, so we need to provide some things ourselves
+wheel
+setuptools
--- a/backend/python/exllama2/requirements.txt
+++ b/backend/python/exllama2/requirements.txt
@@ -0,0 +1,7 @@
+accelerate
+grpcio==1.64.0
+protobuf
+certifi
+torch
+wheel
+setuptools
--- a/backend/python/exllama2/run.sh
+++ b/backend/python/exllama2/run.sh
@@ -1,16 +1,6 @@
 #!/bin/bash
+LIMIT_TARGETS="cublas"

-##
-## A bash script wrapper that runs the exllama server with conda
+source $(dirname $0)/../common/libbackend.sh

-export PATH=$PATH:/opt/conda/bin
-
-# Activate conda environment
-source activate transformers
-
-# get the directory where the bash script is located
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-cd $DIR
-
-python $DIR/exllama2_backend.py $@
+startBackend $@
--- a/backend/python/exllama2/test.sh
+++ b/backend/python/exllama2/test.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+set -e
+
+source $(dirname $0)/../common/libbackend.sh
+
+runUnittests
--- a/backend/python/mamba/Makefile
+++ b/backend/python/mamba/Makefile
@@ -1,7 +1,6 @@
 .PHONY: mamba
 mamba: protogen
-	$(MAKE) -C ../common-env/transformers
-	bash install.sh
+	bash install.sh 

 .PHONY: run
 run: protogen
@@ -23,4 +22,8 @@ protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py

 backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
+
+.PHONY: clean
+clean: protogen-clean
+	$(RM) -r venv __pycache__
--- a/backend/python/mamba/backend_mamba.py
+++ b/backend/python/mamba/backend_mamba.py
--- a/backend/python/mamba/install.sh
+++ b/backend/python/mamba/install.sh
@@ -1,22 +1,9 @@
 #!/bin/bash
 set -e
-##
-## A bash script installs the required dependencies of VALL-E-X and prepares the environment

-if [ "$BUILD_TYPE" != "cublas" ]; then
-    echo "[mamba] Attention!!! nvcc is required - skipping installation"
-    exit 0
-fi
+LIMIT_TARGETS="cublas"
+EXTRA_PIP_INSTALL_FLAGS="--no-build-isolation"

-export PATH=$PATH:/opt/conda/bin
+source $(dirname $0)/../common/libbackend.sh

-# Activate conda environment
-source activate transformers
-
-echo $CONDA_PREFIX
-
-pip install causal-conv1d==1.0.0 mamba-ssm==1.0.1
-
-if [ "$PIP_CACHE_PURGE" = true ] ; then
-    pip cache purge
-fi
+installRequirements
--- a/backend/python/mamba/requirements-install.txt
+++ b/backend/python/mamba/requirements-install.txt
@@ -0,0 +1,7 @@
+# mabma does not specify it's build dependencies per PEP517, so we need to disable build isolation
+# this also means that we need to install the basic build dependencies into the venv ourselves
+# https://github.com/Dao-AILab/causal-conv1d/issues/24
+packaging
+setuptools
+wheel
+torch==2.2.0
--- a/backend/python/mamba/requirements.txt
+++ b/backend/python/mamba/requirements.txt
@@ -0,0 +1,6 @@
+causal-conv1d==1.2.0.post2
+mamba-ssm==1.2.0.post1
+grpcio==1.64.0
+protobuf
+certifi
+transformers
--- a/backend/python/mamba/run.sh
+++ b/backend/python/mamba/run.sh
@@ -1,14 +1,6 @@
 #!/bin/bash
+LIMIT_TARGETS="cublas"

-##
-## A bash script wrapper that runs the diffusers server with conda
+source $(dirname $0)/../common/libbackend.sh

-export PATH=$PATH:/opt/conda/bin
-
-# Activate conda environment
-source activate transformers
-
-# get the directory where the bash script is located
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-python $DIR/backend_mamba.py $@
+startBackend $@
--- a/backend/python/mamba/test_backend_mamba.py
+++ b/backend/python/mamba/test_backend_mamba.py
@@ -20,7 +20,7 @@ class TestBackendServicer(unittest.TestCase):
    This class contains methods to test the startup and shutdown of the gRPC service.
    """
    def setUp(self):
-        self.service = subprocess.Popen(["python", "backend_vllm.py", "--addr", "localhost:50051"])
+        self.service = subprocess.Popen(["python", "backend.py", "--addr", "localhost:50051"])
        time.sleep(10)

    def tearDown(self) -> None:
--- a/backend/python/mamba/test.sh
+++ b/backend/python/mamba/test.sh
@@ -1,11 +1,6 @@
 #!/bin/bash
-##
-## A bash script wrapper that runs the transformers server with conda
+set -e

-# Activate conda environment
-source activate transformers
+source $(dirname $0)/../common/libbackend.sh

-# get the directory where the bash script is located
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-python -m unittest $DIR/test_backend_mamba.py
+runUnittests
--- a/backend/python/openvoice/Makefile
+++ b/backend/python/openvoice/Makefile
@@ -0,0 +1,25 @@
+.DEFAULT_GOAL := install
+
+.PHONY: install
+install: protogen
+	bash install.sh
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
+
+.PHONY: clean
+clean: protogen-clean
+	rm -rf venv __pycache__
+
+.PHONY: test
+test: protogen
+	@echo "Testing openvoice..."
+	bash test.sh
+	@echo "openvoice tested."
--- a/backend/python/openvoice/backend.py
+++ b/backend/python/openvoice/backend.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python3
+"""
+Extra gRPC server for OpenVoice models.
+"""
+from concurrent import futures
+
+import argparse
+import signal
+import sys
+import os
+import torch
+from openvoice import se_extractor
+from openvoice.api import ToneColorConverter
+from melo.api import TTS
+
+import time
+import backend_pb2
+import backend_pb2_grpc
+
+import grpc
+
+
+_ONE_DAY_IN_SECONDS = 60 * 60 * 24
+
+# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
+MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
+
+# Implement the BackendServicer class with the service methods
+class BackendServicer(backend_pb2_grpc.BackendServicer):
+    """
+    A gRPC servicer for the backend service.
+
+    This class implements the gRPC methods for the backend service, including Health, LoadModel, and Embedding.
+    """
+    def Health(self, request, context):
+        """
+        A gRPC method that returns the health status of the backend service.
+
+        Args:
+            request: A HealthRequest object that contains the request parameters.
+            context: A grpc.ServicerContext object that provides information about the RPC.
+
+        Returns:
+            A Reply object that contains the health status of the backend service.
+        """
+        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
+
+    def LoadModel(self, request, context):
+        """
+        A gRPC method that loads a model into memory.
+
+        Args:
+            request: A LoadModelRequest object that contains the request parameters.
+            context: A grpc.ServicerContext object that provides information about the RPC.
+
+        Returns:
+            A Result object that contains the result of the LoadModel operation.
+        """
+        model_name = request.Model
+        try:
+
+            self.clonedVoice = False
+            # Assume directory from request.ModelFile.
+            # Only if request.LoraAdapter it's not an absolute path
+            if request.AudioPath and request.ModelFile != "" and not os.path.isabs(request.AudioPath):
+                # get base path of modelFile
+                modelFileBase = os.path.dirname(request.ModelFile)
+                request.AudioPath = os.path.join(modelFileBase, request.AudioPath)
+            if request.AudioPath != "":
+                self.clonedVoice = True
+
+            self.modelpath = request.ModelFile
+            self.speaker = request.Type
+            self.ClonedVoicePath = request.AudioPath
+            
+            ckpt_converter = request.Model+'/converter'
+            device = "cuda:0" if torch.cuda.is_available() else "cpu"
+            self.device = device
+            self.tone_color_converter = None
+            if self.clonedVoice:
+                self.tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
+                self.tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
+       
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+
+        return backend_pb2.Result(message="Model loaded successfully", success=True)
+
+    def TTS(self, request, context):
+        model_name = request.model
+        if model_name == "":
+            return backend_pb2.Result(success=False, message="request.model is required")
+        try:
+            # Speed is adjustable
+            speed = 1.0
+            voice = "EN"
+            if request.voice:
+                voice = request.voice
+            model = TTS(language=voice, device=self.device)
+            speaker_ids = model.hps.data.spk2id
+            speaker_key = self.speaker
+            modelpath = self.modelpath
+            for s in speaker_ids.keys():
+                print(f"Speaker: {s} - ID: {speaker_ids[s]}")
+            speaker_id = speaker_ids[speaker_key]
+            speaker_key = speaker_key.lower().replace('_', '-')
+            source_se = torch.load(f'{modelpath}/base_speakers/ses/{speaker_key}.pth', map_location=self.device)
+            model.tts_to_file(request.text, speaker_id, request.dst, speed=speed)
+            if self.clonedVoice:
+                reference_speaker = self.ClonedVoicePath
+                target_se, audio_name = se_extractor.get_se(reference_speaker, self.tone_color_converter, vad=False)
+                # Run the tone color converter
+                encode_message = "@MyShell"
+                self.tone_color_converter.convert(
+                    audio_src_path=request.dst, 
+                    src_se=source_se, 
+                    tgt_se=target_se, 
+                    output_path=request.dst,
+                    message=encode_message)
+           
+            print("[OpenVoice] TTS generated!", file=sys.stderr)
+            print("[OpenVoice] TTS saved to", request.dst, file=sys.stderr)
+            print(request, file=sys.stderr)
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+        return backend_pb2.Result(success=True)
+
+def serve(address):
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
+    server.add_insecure_port(address)
+    server.start()
+    print("[OpenVoice] Server started. Listening on: " + address, file=sys.stderr)
+
+    # Define the signal handler function
+    def signal_handler(sig, frame):
+        print("[OpenVoice] Received termination signal. Shutting down...")
+        server.stop(0)
+        sys.exit(0)
+
+    # Set the signal handlers for SIGINT and SIGTERM
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    try:
+        while True:
+            time.sleep(_ONE_DAY_IN_SECONDS)
+    except KeyboardInterrupt:
+        server.stop(0)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run the gRPC server.")
+    parser.add_argument(
+        "--addr", default="localhost:50051", help="The address to bind the server to."
+    )
+    args = parser.parse_args()
+    print(f"[OpenVoice] startup: {args}", file=sys.stderr)
+    serve(args.addr)
--- a/backend/python/openvoice/install.sh
+++ b/backend/python/openvoice/install.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+set -e
+
+source $(dirname $0)/../common/libbackend.sh
+
+# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
+# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
+# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
+# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
+if [ "x${BUILD_PROFILE}" == "xintel" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
+fi
+
+installRequirements
+
+python -m unidic download
--- a/backend/python/openvoice/requirements-intel.txt
+++ b/backend/python/openvoice/requirements-intel.txt
@@ -0,0 +1,23 @@
+--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+intel-extension-for-pytorch
+torch
+optimum[openvino]
+grpcio==1.64.0
+protobuf
+librosa==0.9.1
+faster-whisper==0.9.0
+pydub==0.25.1
+wavmark==0.0.3
+numpy==1.22.0
+eng_to_ipa==0.0.2
+inflect==7.0.0
+unidecode==1.3.7
+whisper-timestamped==1.14.2
+openai
+python-dotenv
+pypinyin==0.50.0
+cn2an==0.5.22
+jieba==0.42.1
+gradio==3.48.0
+langid==1.1.6
+git+https://github.com/myshell-ai/MeloTTS.git
--- a/Show More
+++ b/Show More