fixup: create piper libdir also when not built

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Merge branch 'master' into enable_gpu
2026-07-06 14:28:04 -04:00 · 2023-11-12 22:17:11 +01:00 · 2023-11-11 19:20:36 +01:00 · 2023-11-06 19:10:35 +01:00
310 changed files with 2154 additions and 17024 deletions
--- a/.env
+++ b/.env
@@ -69,21 +69,4 @@ MODELS_PATH=/models
 # PYTHON_GRPC_MAX_WORKERS=1

 ### Define the number of parallel LLAMA.cpp workers (Defaults to 1)
-# LLAMACPP_PARALLEL=1
-
-### Enable to run parallel requests
-# PARALLEL_REQUESTS=true
-
-### Watchdog settings
-###
-# Enables watchdog to kill backends that are inactive for too much time
-# WATCHDOG_IDLE=true
-#
-# Enables watchdog to kill backends that are busy for too much time
-# WATCHDOG_BUSY=true
-#
-# Time in duration format (e.g. 1h30m) after which a backend is considered idle
-# WATCHDOG_IDLE_TIMEOUT=5m
-#
-# Time in duration format (e.g. 1h30m) after which a backend is considered busy
-# WATCHDOG_BUSY_TIMEOUT=5m
+# LLAMACPP_PARALLEL=1
--- a/.github/bump_docs.sh
+++ b/.github/bump_docs.sh
@@ -1,7 +0,0 @@
-#!/bin/bash
-set -xe
-REPO=$1
-
-LATEST_TAG=$(curl -s "https://api.github.com/repos/$REPO/releases/latest" | jq -r '.name')
-
-cat <<< $(jq ".version = \"$LATEST_TAG\"" docs/data/version.json) > docs/data/version.json
--- a/.github/workflows/bump_docs.yaml
+++ b/.github/workflows/bump_docs.yaml
@@ -1,31 +0,0 @@
-name: Bump dependencies
-on:
-  schedule:
-    - cron: 0 20 * * *
-  workflow_dispatch:
-jobs:
-  bump:
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - repository: "mudler/LocalAI"
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - name: Bump dependencies 🔧
-        run: |
-          bash .github/bump_docs.sh ${{ matrix.repository }}
-      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v5
-        with:
-          token: ${{ secrets.UPDATE_BOT_TOKEN }}
-          push-to-fork: ci-forks/LocalAI
-          commit-message: ':arrow_up: Update docs version ${{ matrix.repository }}'
-          title: ':arrow_up: Update docs version ${{ matrix.repository }}'
-          branch: "update/docs"
-          body: Bump of ${{ matrix.repository }} version inside docs
-          signoff: true
-
-
-
--- a/.github/workflows/image-pr.yml
+++ b/.github/workflows/image-pr.yml
@@ -1,86 +0,0 @@
---
-name: 'build container images tests'
-
-on:
-  pull_request:
-
-concurrency:
-  group: ci-${{ github.head_ref || github.ref }}-${{ github.repository }}
-  cancel-in-progress: true
-
-jobs:
-  extras-image-build:
-    uses: ./.github/workflows/image_build.yml
-    with:
-      tag-latest: ${{ matrix.tag-latest }}
-      tag-suffix: ${{ matrix.tag-suffix }}
-      ffmpeg: ${{ matrix.ffmpeg }}
-      image-type: ${{ matrix.image-type }}
-      build-type: ${{ matrix.build-type }}
-      cuda-major-version: ${{ matrix.cuda-major-version }}
-      cuda-minor-version: ${{ matrix.cuda-minor-version }}
-      platforms: ${{ matrix.platforms }}
-      runs-on: ${{ matrix.runs-on }}
-    secrets:
-      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
-      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
-      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
-      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
-    strategy:
-      # Pushing with all jobs in parallel
-      # eats the bandwidth of all the nodes
-      max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
-      matrix:
-        include:
-          - build-type: ''
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-ffmpeg'
-            ffmpeg: 'true'
-            image-type: 'extras'
-            runs-on: 'arc-runner-set'
-          - build-type: 'cublas'
-            cuda-major-version: "12"
-            cuda-minor-version: "1"
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda12-ffmpeg'
-            ffmpeg: 'true'
-            image-type: 'extras'
-            runs-on: 'arc-runner-set'
-  core-image-build:
-    uses: ./.github/workflows/image_build.yml
-    with:
-      tag-latest: ${{ matrix.tag-latest }}
-      tag-suffix: ${{ matrix.tag-suffix }}
-      ffmpeg: ${{ matrix.ffmpeg }}
-      image-type: ${{ matrix.image-type }}
-      build-type: ${{ matrix.build-type }}
-      cuda-major-version: ${{ matrix.cuda-major-version }}
-      cuda-minor-version: ${{ matrix.cuda-minor-version }}
-      platforms: ${{ matrix.platforms }}
-      runs-on: ${{ matrix.runs-on }}
-    secrets:
-      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
-      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
-      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
-      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
-    strategy:
-      matrix:
-        include:
-          - build-type: ''
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-ffmpeg-core'
-            ffmpeg: 'true'
-            image-type: 'core'
-            runs-on: 'ubuntu-latest'
-          - build-type: 'cublas'
-            cuda-major-version: "12"
-            cuda-minor-version: "1"
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda12-ffmpeg-core'
-            ffmpeg: 'true'
-            image-type: 'core'
-            runs-on: 'ubuntu-latest'
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -2,6 +2,7 @@
 name: 'build container images'

 on:
+  pull_request:
  push:
    branches:
      - master
@@ -13,27 +14,8 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  extras-image-build:
-    uses: ./.github/workflows/image_build.yml
-    with:
-      tag-latest: ${{ matrix.tag-latest }}
-      tag-suffix: ${{ matrix.tag-suffix }}
-      ffmpeg: ${{ matrix.ffmpeg }}
-      image-type: ${{ matrix.image-type }}
-      build-type: ${{ matrix.build-type }}
-      cuda-major-version: ${{ matrix.cuda-major-version }}
-      cuda-minor-version: ${{ matrix.cuda-minor-version }}
-      platforms: ${{ matrix.platforms }}
-      runs-on: ${{ matrix.runs-on }}
-    secrets:
-      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
-      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
-      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
-      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+  image-build:
    strategy:
-      # Pushing with all jobs in parallel
-      # eats the bandwidth of all the nodes
-      max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
      matrix:
        include:
          - build-type: ''
@@ -42,119 +24,130 @@ jobs:
            tag-latest: 'auto'
            tag-suffix: ''
            ffmpeg: ''
-            image-type: 'extras'
-            runs-on: 'arc-runner-set'
          - build-type: ''
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-ffmpeg'
            ffmpeg: 'true'
-            image-type: 'extras'
-            runs-on: 'arc-runner-set'
          - build-type: 'cublas'
-            cuda-major-version: "11"
-            cuda-minor-version: "7"
+            cuda-major-version: 11
+            cuda-minor-version: 7
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda11'
            ffmpeg: ''
-            image-type: 'extras'
-            runs-on: 'arc-runner-set'
          - build-type: 'cublas'
-            cuda-major-version: "12"
-            cuda-minor-version: "1"
+            cuda-major-version: 12
+            cuda-minor-version: 1
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda12'
            ffmpeg: ''
-            image-type: 'extras'
-            runs-on: 'arc-runner-set'
          - build-type: 'cublas'
-            cuda-major-version: "11"
-            cuda-minor-version: "7"
+            cuda-major-version: 11
+            cuda-minor-version: 7
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda11-ffmpeg'
            ffmpeg: 'true'
-            image-type: 'extras'
-            runs-on: 'arc-runner-set'
          - build-type: 'cublas'
-            cuda-major-version: "12"
-            cuda-minor-version: "1"
+            cuda-major-version: 12
+            cuda-minor-version: 1
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda12-ffmpeg'
            ffmpeg: 'true'
-            image-type: 'extras'
-            runs-on: 'arc-runner-set'
-          - build-type: ''
-            #platforms: 'linux/amd64,linux/arm64'
-            platforms: 'linux/amd64'
-            tag-latest: 'auto'
-            tag-suffix: ''
-            ffmpeg: ''
-            image-type: 'extras'
-            runs-on: 'arc-runner-set'
-  core-image-build:
-    uses: ./.github/workflows/image_build.yml
-    with:
-      tag-latest: ${{ matrix.tag-latest }}
-      tag-suffix: ${{ matrix.tag-suffix }}
-      ffmpeg: ${{ matrix.ffmpeg }}
-      image-type: ${{ matrix.image-type }}
-      build-type: ${{ matrix.build-type }}
-      cuda-major-version: ${{ matrix.cuda-major-version }}
-      cuda-minor-version: ${{ matrix.cuda-minor-version }}
-      platforms: ${{ matrix.platforms }}
-      runs-on: ${{ matrix.runs-on }}
-    secrets:
-      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
-      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
-      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
-      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
-    strategy:
-      matrix:
-        include:
-          - build-type: ''
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-ffmpeg-core'
-            ffmpeg: 'true'
-            image-type: 'core'
-            runs-on: 'ubuntu-latest'
-          - build-type: 'cublas'
-            cuda-major-version: "11"
-            cuda-minor-version: "7"
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda11-core'
-            ffmpeg: ''
-            image-type: 'core'
-            runs-on: 'ubuntu-latest'
-          - build-type: 'cublas'
-            cuda-major-version: "12"
-            cuda-minor-version: "1"
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda12-core'
-            ffmpeg: ''
-            image-type: 'core'
-            runs-on: 'ubuntu-latest'
-          - build-type: 'cublas'
-            cuda-major-version: "11"
-            cuda-minor-version: "7"
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda11-ffmpeg-core'
-            ffmpeg: 'true'
-            image-type: 'core'
-            runs-on: 'ubuntu-latest'
-          - build-type: 'cublas'
-            cuda-major-version: "12"
-            cuda-minor-version: "1"
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda12-ffmpeg-core'
-            ffmpeg: 'true'
-            image-type: 'core'
-            runs-on: 'ubuntu-latest'
+
+    runs-on: arc-runner-set 
+    steps:
+      - name: Force Install GIT latest
+        run: |
+          sudo apt-get update \
+          && sudo apt-get install -y software-properties-common \
+          && sudo apt-get update \
+          && sudo add-apt-repository -y ppa:git-core/ppa \
+          && sudo apt-get update \
+          && sudo apt-get install -y git
+      - name: Checkout
+        uses: actions/checkout@v4
+      # - name: Release space from worker
+      #   run: |
+      #     echo "Listing top largest packages"
+      #     pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+      #     head -n 30 <<< "${pkgs}"
+      #     echo
+      #     df -h
+      #     echo
+      #     sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+      #     sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+      #     sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+      #     sudo rm -rf /usr/local/lib/android
+      #     sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+      #     sudo rm -rf /usr/share/dotnet
+      #     sudo apt-get remove -y '^mono-.*' || true
+      #     sudo apt-get remove -y '^ghc-.*' || true
+      #     sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+      #     sudo apt-get remove -y 'php.*' || true
+      #     sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+      #     sudo apt-get remove -y '^google-.*' || true
+      #     sudo apt-get remove -y azure-cli || true
+      #     sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+      #     sudo apt-get remove -y '^gfortran-.*' || true
+      #     sudo apt-get remove -y microsoft-edge-stable || true
+      #     sudo apt-get remove -y firefox || true
+      #     sudo apt-get remove -y powershell || true
+      #     sudo apt-get remove -y r-base-core || true
+      #     sudo apt-get autoremove -y
+      #     sudo apt-get clean
+      #     echo
+      #     echo "Listing top largest packages"
+      #     pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+      #     head -n 30 <<< "${pkgs}"
+      #     echo
+      #     sudo rm -rfv build || true
+      #     df -h
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: quay.io/go-skynet/local-ai
+          tags: |
+            type=ref,event=branch
+            type=semver,pattern={{raw}}
+            type=sha
+          flavor: |
+            latest=${{ matrix.tag-latest }}
+            suffix=${{ matrix.tag-suffix }}
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@master
+        with:
+          platforms: all
+
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@master
+
+      - name: Login to DockerHub
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          registry: quay.io
+          username: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+          password: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+
+      - name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          build-args: |
+            BUILD_TYPE=${{ matrix.build-type }}
+            CUDA_MAJOR_VERSION=${{ matrix.cuda-major-version }}
+            CUDA_MINOR_VERSION=${{ matrix.cuda-minor-version }}
+            FFMPEG=${{ matrix.ffmpeg }}
+          context: .
+          file: ./Dockerfile
+          platforms: ${{ matrix.platforms }}
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
--- a/.github/workflows/image_build.yml
+++ b/.github/workflows/image_build.yml
@@ -1,160 +0,0 @@
---
-name: 'build container images (reusable)'
-
-on:
-  workflow_call:
-    inputs:
-      build-type:
-        description: 'Build type'
-        default: ''
-        type: string
-      cuda-major-version:
-        description: 'CUDA major version'
-        default: "11"
-        type: string
-      cuda-minor-version:
-        description: 'CUDA minor version'
-        default: "7"
-        type: string
-      platforms:
-        description: 'Platforms'
-        default: ''
-        type: string
-      tag-latest:
-        description: 'Tag latest'
-        default: ''
-        type: string
-      tag-suffix:
-        description: 'Tag suffix'
-        default: ''
-        type: string
-      ffmpeg:
-        description: 'FFMPEG'
-        default: ''
-        type: string
-      image-type:
-        description: 'Image type'
-        default: ''
-        type: string
-      runs-on:
-        description: 'Runs on'
-        required: true
-        default: ''
-        type: string
-    secrets:
-      dockerUsername:
-        required: true
-      dockerPassword:
-        required: true
-      quayUsername:
-        required: true
-      quayPassword:
-        required: true
-jobs:
-  reusable_image-build:
-    runs-on: ${{ inputs.runs-on }}
-    steps:
-      - name: Force Install GIT latest
-        run: |
-          sudo apt-get update \
-          && sudo apt-get install -y software-properties-common \
-          && sudo apt-get update \
-          && sudo add-apt-repository -y ppa:git-core/ppa \
-          && sudo apt-get update \
-          && sudo apt-get install -y git
-      - name: Checkout
-        uses: actions/checkout@v4
-      # - name: Release space from worker
-      #   run: |
-      #     echo "Listing top largest packages"
-      #     pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
-      #     head -n 30 <<< "${pkgs}"
-      #     echo
-      #     df -h
-      #     echo
-      #     sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
-      #     sudo apt-get remove --auto-remove android-sdk-platform-tools || true
-      #     sudo apt-get purge --auto-remove android-sdk-platform-tools || true
-      #     sudo rm -rf /usr/local/lib/android
-      #     sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
-      #     sudo rm -rf /usr/share/dotnet
-      #     sudo apt-get remove -y '^mono-.*' || true
-      #     sudo apt-get remove -y '^ghc-.*' || true
-      #     sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
-      #     sudo apt-get remove -y 'php.*' || true
-      #     sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
-      #     sudo apt-get remove -y '^google-.*' || true
-      #     sudo apt-get remove -y azure-cli || true
-      #     sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
-      #     sudo apt-get remove -y '^gfortran-.*' || true
-      #     sudo apt-get remove -y microsoft-edge-stable || true
-      #     sudo apt-get remove -y firefox || true
-      #     sudo apt-get remove -y powershell || true
-      #     sudo apt-get remove -y r-base-core || true
-      #     sudo apt-get autoremove -y
-      #     sudo apt-get clean
-      #     echo
-      #     echo "Listing top largest packages"
-      #     pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
-      #     head -n 30 <<< "${pkgs}"
-      #     echo
-      #     sudo rm -rfv build || true
-      #     df -h
-      - name: Docker meta
-        id: meta
-        uses: docker/metadata-action@v5
-        with:
-          images: |
-            quay.io/go-skynet/local-ai
-            localai/localai
-          tags: |
-            type=ref,event=branch
-            type=semver,pattern={{raw}}
-            type=sha
-          flavor: |
-            latest=${{ inputs.tag-latest }}
-            suffix=${{ inputs.tag-suffix }}
-
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@master
-        with:
-          platforms: all
-
-      - name: Set up Docker Buildx
-        id: buildx
-        uses: docker/setup-buildx-action@master
-
-      - name: Login to DockerHub
-        if: github.event_name != 'pull_request'
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.dockerUsername }}
-          password: ${{ secrets.dockerPassword }}
-
-      - name: Login to DockerHub
-        if: github.event_name != 'pull_request'
-        uses: docker/login-action@v3
-        with:
-          registry: quay.io
-          username: ${{ secrets.quayUsername }}
-          password: ${{ secrets.quayPassword }}
-
-      - name: Build and push
-        uses: docker/build-push-action@v5
-        with:
-          builder: ${{ steps.buildx.outputs.name }}
-          build-args: |
-            BUILD_TYPE=${{ inputs.build-type }}
-            CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
-            CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
-            FFMPEG=${{ inputs.ffmpeg }}
-            IMAGE_TYPE=${{ inputs.image-type }}
-          context: .
-          file: ./Dockerfile
-          platforms: ${{ inputs.platforms }}
-          push: ${{ github.event_name != 'pull_request' }}
-          tags: ${{ steps.meta.outputs.tags }}
-          labels: ${{ steps.meta.outputs.labels }}
-      - name: job summary
-        run: |
-          echo "Built image: ${{ steps.meta.outputs.labels }}" >> $GITHUB_STEP_SUMMARY
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -5,10 +5,6 @@ on: push
 permissions:
  contents: write

-concurrency:
-  group: ci-releases-${{ github.head_ref || github.ref }}-${{ github.repository }}
-  cancel-in-progress: true
-
 jobs:
  build-linux:
    strategy:
@@ -34,22 +30,10 @@ jobs:
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg

-      - name: Cache grpc
-        id: cache-grpc
-        uses: actions/cache@v3
-        with:
-          path: grpc
-          key: ${{ runner.os }}-grpc
-      - name: Build grpc
-        if: steps.cache-grpc.outputs.cache-hit != 'true'
-        run: |
          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
-          cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
-            -DgRPC_BUILD_TESTS=OFF \
-            ../.. && sudo make -j12
-      - name: Install gRPC
-        run: |
-          cd grpc && cd cmake/build && sudo make -j12 install
+              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+                -DgRPC_BUILD_TESTS=OFF \
+                ../.. && sudo make -j12 install

      - name: Build
        id: build
@@ -90,7 +74,10 @@ jobs:
          go-version: '>=1.21.0'
      - name: Dependencies
        run: |
-          brew install protobuf grpc
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+                -DgRPC_BUILD_TESTS=OFF \
+                ../.. && make -j12 install && rm -rf grpc
      - name: Build
        id: build
        env:
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -1,277 +0,0 @@
---
-name: 'Tests extras backends'
-
-on:
-  pull_request:
-  push:
-    branches:
-      - master
-    tags:
-      - '*'
-
-concurrency:
-  group: ci-tests-extra-${{ github.head_ref || github.ref }}-${{ github.repository }}
-  cancel-in-progress: true
-
-jobs:
-  tests-transformers:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-        with: 
-          submodules: true
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
-          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
-          
-          sudo rm -rfv /usr/bin/conda || true
-
-      - name: Test transformers
-        run: |
-           export PATH=$PATH:/opt/conda/bin
-           make -C backend/python/transformers
-           make -C backend/python/transformers test
-
-  tests-sentencetransformers:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-        with: 
-          submodules: true
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
-          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
-          
-          sudo rm -rfv /usr/bin/conda || true
-
-      - name: Test sentencetransformers
-        run: |
-           export PATH=$PATH:/opt/conda/bin
-           make -C backend/python/sentencetransformers
-           make -C backend/python/sentencetransformers test
-
-  tests-diffusers:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-        with: 
-          submodules: true
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
-          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
-          
-          sudo rm -rfv /usr/bin/conda || true
-
-      - name: Test diffusers
-        run: |
-           export PATH=$PATH:/opt/conda/bin
-           make -C backend/python/diffusers
-           make -C backend/python/diffusers test
-
-
-  tests-transformers-musicgen:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-        with: 
-          submodules: true
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
-          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
-          
-          sudo rm -rfv /usr/bin/conda || true
-
-      - name: Test transformers-musicgen
-        run: |
-           export PATH=$PATH:/opt/conda/bin
-           make -C backend/python/transformers-musicgen
-           make -C backend/python/transformers-musicgen test
-
-
-
-  tests-petals:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-        with: 
-          submodules: true
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
-          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
-          
-          sudo rm -rfv /usr/bin/conda || true
-
-      - name: Test petals
-        run: |
-           export PATH=$PATH:/opt/conda/bin
-           make -C backend/python/petals
-           make -C backend/python/petals test
-
-           
-
-  tests-bark:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-        with: 
-          submodules: true
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
-          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
-          
-          sudo rm -rfv /usr/bin/conda || true
-
-      - name: Test bark
-        run: |
-           export PATH=$PATH:/opt/conda/bin
-           make -C backend/python/bark
-           make -C backend/python/bark test
-
-           
-  # Below tests needs GPU. Commented out for now
-  # TODO: Re-enable as soon as we have GPU nodes
-  # tests-vllm:
-  #   runs-on: ubuntu-latest
-  #   steps:
-  #     - name: Clone
-  #       uses: actions/checkout@v4
-  #       with: 
-  #         submodules: true
-  #     - name: Dependencies
-  #       run: |
-  #         sudo apt-get update
-  #         sudo apt-get install build-essential ffmpeg
-  #         curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-  #            sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-  #             gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-  #            sudo apt-get update && \
-  #            sudo apt-get install -y conda
-  #         sudo apt-get install -y ca-certificates cmake curl patch
-  #         sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
-  #         sudo rm -rfv /usr/bin/conda || true
-  #     - name: Test vllm
-  #       run: |
-  #          export PATH=$PATH:/opt/conda/bin
-  #          make -C backend/python/vllm
-  #          make -C backend/python/vllm test
-  tests-vallex:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-        with: 
-          submodules: true
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
-          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2    
-          sudo rm -rfv /usr/bin/conda || true
-      - name: Test vall-e-x
-        run: |
-           export PATH=$PATH:/opt/conda/bin
-           make -C backend/python/vall-e-x
-           make -C backend/python/vall-e-x test
-
-  tests-coqui:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-        with: 
-          submodules: true
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch espeak espeak-ng          
-          sudo rm -rfv /usr/bin/conda || true
-
-      - name: Test coqui
-        run: |
-           export PATH=$PATH:/opt/conda/bin
-           make -C backend/python/coqui
-           make -C backend/python/coqui test
--- a/.github/workflows/disabled/test-gpu.yml
+++ b/.github/workflows/disabled/test-gpu.yml
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -78,30 +78,20 @@ jobs:
          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
          
          sudo rm -rfv /usr/bin/conda || true
-          PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers
+          PATH=$PATH:/opt/conda/bin make -C extra/grpc/huggingface

          # Pre-build piper before we start tests in order to have shared libraries in place
-          make sources/go-piper && \
-          GO_TAGS="tts" make -C sources/go-piper piper.o && \
-          sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
+          make go-piper && \
+          GO_TAGS="tts" make -C go-piper piper.o && \
+          sudo cp -rfv go-piper/piper/build/pi/lib/. /usr/lib/ && \
+
          # Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
          GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
-      - name: Cache grpc
-        id: cache-grpc
-        uses: actions/cache@v3
-        with:
-          path: grpc
-          key: ${{ runner.os }}-grpc
-      - name: Build grpc
-        if: steps.cache-grpc.outputs.cache-hit != 'true'
-        run: |
+
          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
-          cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
-            -DgRPC_BUILD_TESTS=OFF \
-            ../.. && sudo make -j12
-      - name: Install gRPC
-        run: |
-          cd grpc && cd cmake/build && sudo make -j12 install
+              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+                -DgRPC_BUILD_TESTS=OFF \
+                ../.. && sudo make -j12 install
      - name: Test
        run: |
          GO_TAGS="stablediffusion tts" make test
@@ -125,7 +115,10 @@ jobs:
        run: go version
      - name: Dependencies
        run: |
-          brew install protobuf grpc
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+                -DgRPC_BUILD_TESTS=OFF \
+                ../.. && make -j12 install && rm -rf grpc
      - name: Test
        run: |
          export C_INCLUDE_PATH=/usr/local/include
--- a/.gitignore
+++ b/.gitignore
@@ -1,9 +1,15 @@
 # go-llama build artifacts
-/sources/
+go-llama
+go-llama-stable
+/gpt4all
+go-stable-diffusion
+go-piper
+/go-bert
+go-ggllm
+/piper
 __pycache__/
 *.a
 get-sources
-prepare-sources
 /backend/cpp/llama/grpc-server
 /backend/cpp/llama/llama.cpp

--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +0,0 @@
-[submodule "docs/themes/hugo-theme-relearn"]
-	path = docs/themes/hugo-theme-relearn
-	url = https://github.com/McShelby/hugo-theme-relearn.git
--- a/88
+++ b/88
@@ -12,10 +12,9 @@ ARG TARGETARCH
 ARG TARGETVARIANT

 ENV BUILD_TYPE=${BUILD_TYPE}
-
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh"
-
-ARG GO_TAGS="stablediffusion tinydream tts"
+ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/extra/grpc/huggingface/run.sh,autogptq:/build/extra/grpc/autogptq/run.sh,bark:/build/extra/grpc/bark/run.sh,diffusers:/build/extra/grpc/diffusers/run.sh,exllama:/build/extra/grpc/exllama/run.sh,vall-e-x:/build/extra/grpc/vall-e-x/run.sh,vllm:/build/extra/grpc/vllm/run.sh"
+ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'
+ARG GO_TAGS="stablediffusion tts"

 RUN apt-get update && \
    apt-get install -y ca-certificates curl patch pip cmake && apt-get clean
@@ -63,12 +62,25 @@ RUN curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmo
    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list && \
    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list && \
    apt-get update && \
-    apt-get install -y conda && apt-get clean
+    apt-get install -y conda

+COPY extra/requirements.txt /build/extra/requirements.txt
 ENV PATH="/root/.cargo/bin:${PATH}"
 RUN pip install --upgrade pip
 RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
-RUN apt-get install -y espeak-ng espeak && apt-get clean
+#RUN if [ "${TARGETARCH}" = "amd64" ]; then \
+#        pip install git+https://github.com/suno-ai/bark.git diffusers invisible_watermark transformers accelerate safetensors;\
+#    fi
+#RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "amd64" ]; then \
+#        pip install torch vllm && pip install auto-gptq https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}-cp39-cp39-linux_x86_64.whl;\
+ #   fi
+#RUN pip install -r /build/extra/requirements.txt && rm -rf /build/extra/requirements.txt
+
+# Vall-e-X
+RUN git clone https://github.com/Plachtaa/VALL-E-X.git /usr/lib/vall-e-x && cd /usr/lib/vall-e-x && pip install -r requirements.txt
+
+# \
+#    ; fi

 ###################################
 ###################################
@@ -86,9 +98,12 @@ ENV NVIDIA_VISIBLE_DEVICES=all

 WORKDIR /build

+COPY Makefile .
+RUN make get-sources
+COPY go.mod .
+RUN make prepare
 COPY . .
 COPY .git .
-RUN make prepare

 # stablediffusion does not tolerate a newer version of abseil, build it first
 RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
@@ -97,15 +112,15 @@ RUN if [ "${BUILD_GRPC}" = "true" ]; then \
    git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
    cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
      -DgRPC_BUILD_TESTS=OFF \
-       ../.. && make -j12 install \
+       ../.. && make -j12 install && rm -rf grpc \
    ; fi

 # Rebuild with defaults backends
 RUN make build

-RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
-    mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ \
-    touch /build/sources/go-piper/piper-phonemize/pi/lib/keep \
+RUN if [ ! -d "/build/go-piper/piper/build/pi/lib/" ]; then \
+    mkdir -p /build/go-piper/piper/build/pi/lib/ \
+    touch /build/go-piper/piper/build/pi/lib/keep \
    ; fi

 ###################################
@@ -126,11 +141,10 @@ ARG CUDA_MAJOR_VERSION=11
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
 ENV NVIDIA_VISIBLE_DEVICES=all
-ENV PIP_CACHE_PURGE=true

 # Add FFmpeg
 RUN if [ "${FFMPEG}" = "true" ]; then \
-    apt-get install -y ffmpeg && apt-get clean \
+    apt-get install -y ffmpeg \
    ; fi

 WORKDIR /build
@@ -140,61 +154,49 @@ WORKDIR /build
 # see https://github.com/go-skynet/LocalAI/pull/658#discussion_r1241971626 and
 # https://github.com/go-skynet/LocalAI/pull/434
 COPY . .
-
-COPY --from=builder /build/sources ./sources/
-COPY --from=builder /build/grpc ./grpc/
-
-RUN make prepare-sources && cd /build/grpc/cmake/build && make install && rm -rf grpc
+RUN make prepare-sources

 # Copy the binary
 COPY --from=builder /build/local-ai ./

 # Copy shared libraries for piper
-COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/
+COPY --from=builder /build/go-piper/piper/build/pi/lib/* /usr/lib/

 # do not let stablediffusion rebuild (requires an older version of absl)
 COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion

 ## Duplicated from Makefile to avoid having a big layer that's hard to push
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/autogptq \
+	PATH=$PATH:/opt/conda/bin make -C extra/grpc/autogptq \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/bark \
+	PATH=$PATH:/opt/conda/bin make -C extra/grpc/bark \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/diffusers \
+	PATH=$PATH:/opt/conda/bin make -C extra/grpc/diffusers \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/vllm \
+	PATH=$PATH:/opt/conda/bin make -C extra/grpc/vllm \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers \
+	PATH=$PATH:/opt/conda/bin make -C extra/grpc/huggingface \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/transformers \
+	PATH=$PATH:/opt/conda/bin make -C extra/grpc/vall-e-x \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/vall-e-x \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/exllama \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-    PATH=$PATH:/opt/conda/bin make -C backend/python/exllama2 \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/petals \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/transformers-musicgen \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/coqui \
+	PATH=$PATH:/opt/conda/bin make -C extra/grpc/exllama \
    ; fi

-# Make sure the models directory exists
-RUN mkdir -p /build/models
+# Copy VALLE-X as it's not a real "lib"
+RUN if [ -d /usr/lib/vall-e-x ]; then \
+    cp -rfv /usr/lib/vall-e-x/* ./ ; \ 
+    fi
+
+# we also copy exllama libs over to resolve exllama import error
+RUN if [ -d /usr/local/lib/python3.9/dist-packages/exllama ]; then \
+        cp -rfv /usr/local/lib/python3.9/dist-packages/exllama extra/grpc/exllama/;\
+    fi

 # Define the health check command
 HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
--- a/Entitlements.plist
+++ b/Entitlements.plist
@@ -1,10 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-    <key>com.apple.security.network.client</key>
-    <true/>
-    <key>com.apple.security.network.server</key>
-    <true/>
-</dict>
-</plist>
--- a/380
+++ b/380
@@ -8,7 +8,7 @@ GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0

 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7

-CPPLLAMA_VERSION?=1fc2f265ff9377a37fd2c61eae9cd813a3491bea
+CPPLLAMA_VERSION?=a75fa576abba9d37f463580c379e4bbf1e1ad03c

 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -19,27 +19,23 @@ GOGGMLTRANSFORMERS_VERSION?=ffb09d7dd71e2cbc6c5d7d05357d230eea6f369a

 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
-RWKV_VERSION?=633c5a3485c403cb2520693dc0991a25dace9f0f
+RWKV_VERSION?=c898cd0f62df8f2a7830e53d1d513bef4f6f792b

 # whisper.cpp version
-WHISPER_CPP_VERSION?=37a709f6558c6d9783199e2b8cbb136e1c41d346
+WHISPER_CPP_VERSION?=85ed71aaec8e0612a84c0b67804bde75aa75a273

 # bert.cpp version
 BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d

 # go-piper version
-PIPER_VERSION?=d6b6275ba037dabdba4a8b65dfdf6b2a73a67f07
+PIPER_VERSION?=736f6fb639ab8e3397356e48eeb6bdcb9da88a78

 # stablediffusion version
-STABLEDIFFUSION_VERSION?=902db5f066fd137697e3b69d0fa10d4782bd2c2f
-
-# tinydream version
-TINYDREAM_VERSION?=772a9c0d9aaf768290e63cca3c904fe69faf677a
+STABLEDIFFUSION_VERSION?=d89260f598afb809279bc72aa0107b4292587632

 export BUILD_TYPE?=
 export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
 export CMAKE_ARGS?=
-
 CGO_LDFLAGS?=
 CUDA_LIBPATH?=/usr/local/cuda/lib64/
 GO_TAGS?=
@@ -72,39 +68,29 @@ ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif

-ifeq ($(OS),Darwin)
+ifeq ($(UNAME_S),Darwin)
 	CGO_LDFLAGS += -lcblas -framework Accelerate
-	ifeq ($(OSX_SIGNING_IDENTITY),)
-		OSX_SIGNING_IDENTITY := $(shell security find-identity -v -p codesigning | grep '"' | head -n 1 | sed -E 's/.*"(.*)"/\1/')
-	endif
-
-	# on OSX, if BUILD_TYPE is blank, we should default to use Metal
-	ifeq ($(BUILD_TYPE),)
-		BUILD_TYPE=metal
-	# disable metal if on Darwin and any other value is explicitly passed.
-	else ifneq ($(BUILD_TYPE),metal)
-		CMAKE_ARGS+=-DLLAMA_METAL=OFF
-	endif
+ifneq ($(BUILD_TYPE),metal)
+    # explicit disable metal if on Darwin and metal is disabled
+	CMAKE_ARGS+=-DLLAMA_METAL=OFF
+endif
 endif

 ifeq ($(BUILD_TYPE),openblas)
 	CGO_LDFLAGS+=-lopenblas
-	export WHISPER_OPENBLAS=1
 endif

 ifeq ($(BUILD_TYPE),cublas)
 	CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
 	export LLAMA_CUBLAS=1
-	export WHISPER_CUBLAS=1
 endif

 ifeq ($(BUILD_TYPE),hipblas)
 	ROCM_HOME ?= /opt/rocm
 	export CXX=$(ROCM_HOME)/llvm/bin/clang++
 	export CC=$(ROCM_HOME)/llvm/bin/clang
-	# llama-ggml has no hipblas support, so override it here.
+	# Llama-stable has no hipblas support, so override it here.
 	export STABLE_BUILD_TYPE=
-	export WHISPER_HIPBLAS=1
 	GPU_TARGETS ?= gfx900,gfx90a,gfx1030,gfx1031,gfx1100
 	AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
 	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
@@ -114,12 +100,10 @@ endif
 ifeq ($(BUILD_TYPE),metal)
 	CGO_LDFLAGS+=-framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
 	export LLAMA_METAL=1
-	export WHISPER_METAL=1
 endif

 ifeq ($(BUILD_TYPE),clblas)
 	CGO_LDFLAGS+=-lOpenCL -lclblast
-	export WHISPER_CLBLAST=1
 endif

 # glibc-static or glibc-devel-static required
@@ -132,20 +116,15 @@ ifeq ($(findstring stablediffusion,$(GO_TAGS)),stablediffusion)
 	OPTIONAL_GRPC+=backend-assets/grpc/stablediffusion
 endif

-ifeq ($(findstring tinydream,$(GO_TAGS)),tinydream)
-#	OPTIONAL_TARGETS+=go-tiny-dream/libtinydream.a
-	OPTIONAL_GRPC+=backend-assets/grpc/tinydream
-endif
-
 ifeq ($(findstring tts,$(GO_TAGS)),tts)
 #	OPTIONAL_TARGETS+=go-piper/libpiper_binding.a
 #	OPTIONAL_TARGETS+=backend-assets/espeak-ng-data
-	PIPER_CGO_CXXFLAGS+=-I$(shell pwd)/sources/go-piper/piper/src/cpp -I$(shell pwd)/sources/go-piper/piper/build/fi/include -I$(shell pwd)/sources/go-piper/piper/build/pi/include -I$(shell pwd)/sources/go-piper/piper/build/si/include
- 	PIPER_CGO_LDFLAGS+=-L$(shell pwd)/sources/go-piper/piper/build/fi/lib -L$(shell pwd)/sources/go-piper/piper/build/pi/lib -L$(shell pwd)/sources/go-piper/piper/build/si/lib -lfmt -lspdlog -lucd
+	PIPER_CGO_CXXFLAGS+=-I$(shell pwd)/go-piper/piper/src/cpp -I$(shell pwd)/go-piper/piper/build/fi/include -I$(shell pwd)/go-piper/piper/build/pi/include -I$(shell pwd)/go-piper/piper/build/si/include
+ 	PIPER_CGO_LDFLAGS+=-L$(shell pwd)/go-piper/piper/build/fi/lib -L$(shell pwd)/go-piper/piper/build/pi/lib -L$(shell pwd)/go-piper/piper/build/si/lib -lfmt -lspdlog
 	OPTIONAL_GRPC+=backend-assets/grpc/piper
 endif

-ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/llama backend-assets/grpc/llama-cpp backend-assets/grpc/llama-ggml backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/llama backend-assets/grpc/llama-cpp backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
 GRPC_BACKENDS?=$(ALL_GRPC_BACKENDS) $(OPTIONAL_GRPC)

 # If empty, then we build all
@@ -158,127 +137,112 @@ endif
 all: help

 ## GPT4ALL
-sources/gpt4all:
-	git clone --recurse-submodules $(GPT4ALL_REPO) sources/gpt4all
-	cd sources/gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1
+gpt4all:
+	git clone --recurse-submodules $(GPT4ALL_REPO) gpt4all
+	cd gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1

 ## go-piper
-sources/go-piper:
-	git clone --recurse-submodules https://github.com/mudler/go-piper sources/go-piper
-	cd sources/go-piper && git checkout -b build $(PIPER_VERSION) && git submodule update --init --recursive --depth 1
+go-piper:
+	git clone --recurse-submodules https://github.com/mudler/go-piper go-piper
+	cd go-piper && git checkout -b build $(PIPER_VERSION) && git submodule update --init --recursive --depth 1

 ## BERT embeddings
-sources/go-bert:
-	git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp sources/go-bert
-	cd sources/go-bert && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1
+go-bert:
+	git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp go-bert
+	cd go-bert && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1

 ## stable diffusion
-sources/go-stable-diffusion:
-	git clone --recurse-submodules https://github.com/mudler/go-stable-diffusion sources/go-stable-diffusion
-	cd sources/go-stable-diffusion && git checkout -b build $(STABLEDIFFUSION_VERSION) && git submodule update --init --recursive --depth 1
+go-stable-diffusion:
+	git clone --recurse-submodules https://github.com/mudler/go-stable-diffusion go-stable-diffusion
+	cd go-stable-diffusion && git checkout -b build $(STABLEDIFFUSION_VERSION) && git submodule update --init --recursive --depth 1

-sources/go-stable-diffusion/libstablediffusion.a:
-	$(MAKE) -C sources/go-stable-diffusion libstablediffusion.a
-
-## tiny-dream
-sources/go-tiny-dream:
-	git clone --recurse-submodules https://github.com/M0Rf30/go-tiny-dream sources/go-tiny-dream
-	cd sources/go-tiny-dream && git checkout -b build $(TINYDREAM_VERSION) && git submodule update --init --recursive --depth 1
-
-sources/go-tiny-dream/libtinydream.a:
-	$(MAKE) -C sources/go-tiny-dream libtinydream.a
+go-stable-diffusion/libstablediffusion.a:
+	$(MAKE) -C go-stable-diffusion libstablediffusion.a

 ## RWKV
-sources/go-rwkv:
-	git clone --recurse-submodules $(RWKV_REPO) sources/go-rwkv
-	cd sources/go-rwkv && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1
+go-rwkv:
+	git clone --recurse-submodules $(RWKV_REPO) go-rwkv
+	cd go-rwkv && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1

-sources/go-rwkv/librwkv.a: sources/go-rwkv
-	cd sources/go-rwkv && cd rwkv.cpp &&	cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF &&	cmake --build . && 	cp librwkv.a ..
+go-rwkv/librwkv.a: go-rwkv
+	cd go-rwkv && cd rwkv.cpp &&	cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF &&	cmake --build . && 	cp librwkv.a ..

-sources/go-bert/libgobert.a: sources/go-bert
-	$(MAKE) -C sources/go-bert libgobert.a
+go-bert/libgobert.a: go-bert
+	$(MAKE) -C go-bert libgobert.a

-backend-assets/gpt4all: sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
+backend-assets/gpt4all: gpt4all/gpt4all-bindings/golang/libgpt4all.a
 	mkdir -p backend-assets/gpt4all
-	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.so backend-assets/gpt4all/ || true
-	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
-	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true
+	@cp gpt4all/gpt4all-bindings/golang/buildllm/*.so backend-assets/gpt4all/ || true
+	@cp gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
+	@cp gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true

-backend-assets/espeak-ng-data: sources/go-piper
+backend-assets/espeak-ng-data: go-piper
 	mkdir -p backend-assets/espeak-ng-data
-	$(MAKE) -C sources/go-piper piper.o
-	@cp -rf sources/go-piper/piper-phonemize/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data
+	$(MAKE) -C go-piper piper.o
+	@cp -rf go-piper/piper/build/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data

-sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a: sources/gpt4all
-	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ libgpt4all.a
+gpt4all/gpt4all-bindings/golang/libgpt4all.a: gpt4all
+	$(MAKE) -C gpt4all/gpt4all-bindings/golang/ libgpt4all.a

 ## CEREBRAS GPT
-sources/go-ggml-transformers:
-	git clone --recurse-submodules https://github.com/go-skynet/go-ggml-transformers.cpp sources/go-ggml-transformers
-	cd sources/go-ggml-transformers && git checkout -b build $(GOGPT2_VERSION) && git submodule update --init --recursive --depth 1
+go-ggml-transformers:
+	git clone --recurse-submodules https://github.com/go-skynet/go-ggml-transformers.cpp go-ggml-transformers
+	cd go-ggml-transformers && git checkout -b build $(GOGPT2_VERSION) && git submodule update --init --recursive --depth 1

-sources/go-ggml-transformers/libtransformers.a: sources/go-ggml-transformers
-	$(MAKE) -C sources/go-ggml-transformers BUILD_TYPE=$(BUILD_TYPE) libtransformers.a
+go-ggml-transformers/libtransformers.a: go-ggml-transformers
+	$(MAKE) -C go-ggml-transformers BUILD_TYPE=$(BUILD_TYPE) libtransformers.a

-sources/whisper.cpp:
-	git clone https://github.com/ggerganov/whisper.cpp.git sources/whisper.cpp
-	cd sources/whisper.cpp && git checkout -b build $(WHISPER_CPP_VERSION) && git submodule update --init --recursive --depth 1
+whisper.cpp:
+	git clone https://github.com/ggerganov/whisper.cpp.git
+	cd whisper.cpp && git checkout -b build $(WHISPER_CPP_VERSION) && git submodule update --init --recursive --depth 1

-sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
-	cd sources/whisper.cpp && make libwhisper.a
+whisper.cpp/libwhisper.a: whisper.cpp
+	cd whisper.cpp && make libwhisper.a

-sources/go-llama:
-	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama
-	cd sources/go-llama && git checkout -b build $(GOLLAMA_VERSION) && git submodule update --init --recursive --depth 1
+go-llama:
+	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama
+	cd go-llama && git checkout -b build $(GOLLAMA_VERSION) && git submodule update --init --recursive --depth 1

-sources/go-llama-ggml:
-	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama-ggml
-	cd sources/go-llama-ggml && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1
+go-llama-stable:
+	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama-stable
+	cd go-llama-stable && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1

-sources/go-llama/libbinding.a: sources/go-llama
-	$(MAKE) -C sources/go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a
+go-llama/libbinding.a: go-llama
+	$(MAKE) -C go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a

-sources/go-llama-ggml/libbinding.a: sources/go-llama-ggml
-	$(MAKE) -C sources/go-llama-ggml BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
+go-llama-stable/libbinding.a: go-llama-stable
+	$(MAKE) -C go-llama-stable BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a

-sources/go-piper/libpiper_binding.a: sources/go-piper
-	$(MAKE) -C sources/go-piper libpiper_binding.a example/main
+go-piper/libpiper_binding.a: go-piper
+	$(MAKE) -C go-piper libpiper_binding.a example/main

-backend/cpp/llama/llama.cpp:
-	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama llama.cpp	
-
-get-sources: backend/cpp/llama/llama.cpp sources/go-llama sources/go-llama-ggml sources/go-ggml-transformers sources/gpt4all sources/go-piper sources/go-rwkv sources/whisper.cpp sources/go-bert sources/go-stable-diffusion sources/go-tiny-dream
+get-sources: go-llama go-llama-stable go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert go-stable-diffusion
 	touch $@

 replace:
-	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(shell pwd)/sources/gpt4all/gpt4all-bindings/golang
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-ggml-transformers.cpp=$(shell pwd)/sources/go-ggml-transformers
-	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/sources/go-rwkv
-	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(shell pwd)/sources/whisper.cpp
-	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(shell pwd)/sources/whisper.cpp/bindings/go
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(shell pwd)/sources/go-bert
-	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(shell pwd)/sources/go-stable-diffusion
-	$(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(shell pwd)/sources/go-tiny-dream
-	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(shell pwd)/sources/go-piper
+	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(shell pwd)/gpt4all/gpt4all-bindings/golang
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-ggml-transformers.cpp=$(shell pwd)/go-ggml-transformers
+	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/go-rwkv
+	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(shell pwd)/whisper.cpp
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(shell pwd)/go-bert
+	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(shell pwd)/go-stable-diffusion
+	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(shell pwd)/go-piper

 prepare-sources: get-sources replace
 	$(GOCMD) mod download
-	touch $@

 ## GENERIC
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
-	$(MAKE) -C sources/go-llama clean
-	$(MAKE) -C sources/go-llama-ggml clean
-	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ clean
-	$(MAKE) -C sources/go-ggml-transformers clean
-	$(MAKE) -C sources/go-rwkv clean
-	$(MAKE) -C sources/whisper.cpp clean
-	$(MAKE) -C sources/go-stable-diffusion clean
-	$(MAKE) -C sources/go-bert clean
-	$(MAKE) -C sources/go-piper clean
-	$(MAKE) -C sources/go-tiny-dream clean
+	$(MAKE) -C go-llama clean
+	$(MAKE) -C go-llama-stable clean
+	$(MAKE) -C gpt4all/gpt4all-bindings/golang/ clean
+	$(MAKE) -C go-ggml-transformers clean
+	$(MAKE) -C go-rwkv clean
+	$(MAKE) -C whisper.cpp clean
+	$(MAKE) -C go-stable-diffusion clean
+	$(MAKE) -C go-bert clean
+	$(MAKE) -C go-piper clean
 	$(MAKE) build

 prepare: prepare-sources $(OPTIONAL_TARGETS)
@@ -287,7 +251,17 @@ prepare: prepare-sources $(OPTIONAL_TARGETS)
 clean: ## Remove build related file
 	$(GOCMD) clean -cache
 	rm -f prepare
-	rm -rf ./sources
+	rm -rf ./go-llama
+	rm -rf ./gpt4all
+	rm -rf ./go-llama-stable
+	rm -rf ./go-gpt2
+	rm -rf ./go-stable-diffusion
+	rm -rf ./go-ggml-transformers
+	rm -rf ./backend-assets
+	rm -rf ./go-rwkv
+	rm -rf ./go-bert
+	rm -rf ./whisper.cpp
+	rm -rf ./go-piper
 	rm -rf $(BINARY_NAME)
 	rm -rf release/
 	rm -rf ./backend/cpp/grpc/grpc_repo
@@ -309,9 +283,6 @@ dist: build
 	mkdir -p release
 	cp $(BINARY_NAME) release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-$(ARCH)

-osx-signed: build
-	codesign --deep --force --sign "$(OSX_SIGNING_IDENTITY)" --entitlements "./Entitlements.plist" "./$(BINARY_NAME)"
-
 ## Run
 run: prepare ## run local-ai
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) run ./
@@ -335,7 +306,7 @@ test: prepare test-models/testmodel grpcs
 	@echo 'Running tests'
 	export GO_TAGS="tts stablediffusion"
 	$(MAKE) prepare-test
-	HUGGINGFACE_GRPC=$(abspath ./)/backend/python/sentencetransformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+	HUGGINGFACE_GRPC=$(abspath ./)/extra/grpc/huggingface/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf"  --flake-attempts 5 --fail-fast -v -r ./api ./pkg
 	$(MAKE) test-gpt4all
 	$(MAKE) test-llama
@@ -402,57 +373,40 @@ help: ## Show this help.
 protogen: protogen-go protogen-python

 protogen-go:
-	protoc -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
-    backend/backend.proto
+	protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative \
+    pkg/grpc/proto/backend.proto

 protogen-python:
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/sentencetransformers/ --grpc_python_out=backend/python/sentencetransformers/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/transformers/ --grpc_python_out=backend/python/transformers/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/transformers-musicgen/ --grpc_python_out=backend/python/transformers-musicgen/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/autogptq/ --grpc_python_out=backend/python/autogptq/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/exllama/ --grpc_python_out=backend/python/exllama/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/bark/ --grpc_python_out=backend/python/bark/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/diffusers/ --grpc_python_out=backend/python/diffusers/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/coqui/ --grpc_python_out=backend/python/coqui/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vall-e-x/ --grpc_python_out=backend/python/vall-e-x/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vllm/ --grpc_python_out=backend/python/vllm/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/petals/ --grpc_python_out=backend/python/petals/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/exllama2/ --grpc_python_out=backend/python/exllama2/ backend/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/huggingface/ --grpc_python_out=extra/grpc/huggingface/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/autogptq/ --grpc_python_out=extra/grpc/autogptq/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/exllama/ --grpc_python_out=extra/grpc/exllama/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/bark/ --grpc_python_out=extra/grpc/bark/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/diffusers/ --grpc_python_out=extra/grpc/diffusers/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/vall-e-x/ --grpc_python_out=extra/grpc/vall-e-x/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/vllm/ --grpc_python_out=extra/grpc/vllm/ pkg/grpc/proto/backend.proto

 ## GRPC
 # Note: it is duplicated in the Dockerfile
 prepare-extra-conda-environments:
-	$(MAKE) -C backend/python/autogptq
-	$(MAKE) -C backend/python/bark
-	$(MAKE) -C backend/python/coqui
-	$(MAKE) -C backend/python/diffusers
-	$(MAKE) -C backend/python/vllm
-	$(MAKE) -C backend/python/sentencetransformers
-	$(MAKE) -C backend/python/transformers
-	$(MAKE) -C backend/python/transformers-musicgen
-	$(MAKE) -C backend/python/vall-e-x
-	$(MAKE) -C backend/python/exllama
-	$(MAKE) -C backend/python/petals
-	$(MAKE) -C backend/python/exllama2
+	$(MAKE) -C extra/grpc/autogptq
+	$(MAKE) -C extra/grpc/bark
+	$(MAKE) -C extra/grpc/diffusers
+	$(MAKE) -C extra/grpc/vllm
+	$(MAKE) -C extra/grpc/huggingface
+	$(MAKE) -C extra/grpc/vall-e-x
+	$(MAKE) -C extra/grpc/exllama

-prepare-test-extra:
-	$(MAKE) -C backend/python/transformers
-	$(MAKE) -C backend/python/diffusers
-
-test-extra: prepare-test-extra
-	$(MAKE) -C backend/python/transformers test
-	$(MAKE) -C backend/python/diffusers test

 backend-assets/grpc:
 	mkdir -p backend-assets/grpc

-backend-assets/grpc/llama: backend-assets/grpc sources/go-llama/libbinding.a
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/sources/go-llama
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-llama LIBRARY_PATH=$(shell pwd)/sources/go-llama \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./backend/go/llm/llama/
-# TODO: every binary should have its own folder instead, so can have different  implementations
+backend-assets/grpc/llama: backend-assets/grpc go-llama/libbinding.a
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama LIBRARY_PATH=$(shell pwd)/go-llama \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./cmd/grpc/llama/
+# TODO: every binary should have its own folder instead, so can have different metal implementations
 ifeq ($(BUILD_TYPE),metal)
-	cp backend/cpp/llama/llama.cpp/ggml-metal.metal backend-assets/grpc/
+	cp go-llama/build/bin/ggml-metal.metal backend-assets/grpc/
 endif

 ## BACKEND CPP LLAMA START
@@ -471,7 +425,7 @@ ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
 	export _PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/proto && \
 	export _GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin && \
 	export PATH=${PATH}:${INSTALLED_PACKAGES}/bin && \
-	CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
+	CMAKE_ARGS="${ADDED_CMAKE_ARGS}" LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server 
 else
 	echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
 	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server			
@@ -486,75 +440,71 @@ ifeq ($(BUILD_TYPE),metal)
 	cp backend/cpp/llama/llama.cpp/build/bin/ggml-metal.metal backend-assets/grpc/
 endif

-backend-assets/grpc/llama-ggml: backend-assets/grpc sources/go-llama-ggml/libbinding.a
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/sources/go-llama-ggml
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-llama-ggml LIBRARY_PATH=$(shell pwd)/sources/go-llama-ggml \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
+backend-assets/grpc/llama-stable: backend-assets/grpc go-llama-stable/libbinding.a
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama-stable
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama-stable LIBRARY_PATH=$(shell pwd)/go-llama \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-stable ./cmd/grpc/llama-stable/

-backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(shell pwd)/sources/gpt4all/gpt4all-bindings/golang/ \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./backend/go/llm/gpt4all/
+backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all gpt4all/gpt4all-bindings/golang/libgpt4all.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./cmd/grpc/gpt4all/

-backend-assets/grpc/dolly: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/dolly ./backend/go/llm/dolly/
+backend-assets/grpc/dolly: backend-assets/grpc go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/dolly ./cmd/grpc/dolly/

-backend-assets/grpc/gpt2: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt2 ./backend/go/llm/gpt2/
+backend-assets/grpc/gpt2: backend-assets/grpc go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt2 ./cmd/grpc/gpt2/

-backend-assets/grpc/gptj: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptj ./backend/go/llm/gptj/
+backend-assets/grpc/gptj: backend-assets/grpc go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptj ./cmd/grpc/gptj/

-backend-assets/grpc/gptneox: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptneox ./backend/go/llm/gptneox/
+backend-assets/grpc/gptneox: backend-assets/grpc go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptneox ./cmd/grpc/gptneox/

-backend-assets/grpc/mpt: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/mpt ./backend/go/llm/mpt/
+backend-assets/grpc/mpt: backend-assets/grpc go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/mpt ./cmd/grpc/mpt/

-backend-assets/grpc/replit: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/replit ./backend/go/llm/replit/
+backend-assets/grpc/replit: backend-assets/grpc go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/replit ./cmd/grpc/replit/

-backend-assets/grpc/falcon-ggml: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/falcon-ggml ./backend/go/llm/falcon-ggml/
+backend-assets/grpc/falcon-ggml: backend-assets/grpc go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/falcon-ggml ./cmd/grpc/falcon-ggml/

-backend-assets/grpc/starcoder: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/starcoder ./backend/go/llm/starcoder/
+backend-assets/grpc/starcoder: backend-assets/grpc go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/starcoder ./cmd/grpc/starcoder/

-backend-assets/grpc/rwkv: backend-assets/grpc sources/go-rwkv/librwkv.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-rwkv LIBRARY_PATH=$(shell pwd)/sources/go-rwkv \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv
+backend-assets/grpc/rwkv: backend-assets/grpc go-rwkv/librwkv.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-rwkv LIBRARY_PATH=$(shell pwd)/go-rwkv \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./cmd/grpc/rwkv/

-backend-assets/grpc/bert-embeddings: backend-assets/grpc sources/go-bert/libgobert.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-bert LIBRARY_PATH=$(shell pwd)/sources/go-bert \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/
+backend-assets/grpc/bert-embeddings: backend-assets/grpc go-bert/libgobert.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-bert LIBRARY_PATH=$(shell pwd)/go-bert \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./cmd/grpc/bert-embeddings/

 backend-assets/grpc/langchain-huggingface: backend-assets/grpc
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/langchain-huggingface ./backend/go/llm/langchain/
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/langchain-huggingface ./cmd/grpc/langchain-huggingface/

 backend-assets/grpc/stablediffusion: backend-assets/grpc
 	if [ ! -f backend-assets/grpc/stablediffusion ]; then \
-		$(MAKE) sources/go-stable-diffusion/libstablediffusion.a; \
-		CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-stable-diffusion/ LIBRARY_PATH=$(shell pwd)/sources/go-stable-diffusion/ \
-		$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/stablediffusion; \
+		$(MAKE) go-stable-diffusion/libstablediffusion.a; \
+		CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-stable-diffusion/ LIBRARY_PATH=$(shell pwd)/go-stable-diffusion/ \
+		$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./cmd/grpc/stablediffusion/; \
 	fi

-backend-assets/grpc/tinydream: backend-assets/grpc sources/go-tiny-dream/libtinydream.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(shell pwd)/go-tiny-dream \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/tinydream ./backend/go/image/tinydream
+backend-assets/grpc/piper: backend-assets/grpc backend-assets/espeak-ng-data go-piper/libpiper_binding.a
+	CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(shell pwd)/go-piper \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./cmd/grpc/piper/

-backend-assets/grpc/piper: backend-assets/grpc backend-assets/espeak-ng-data sources/go-piper/libpiper_binding.a
-	CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(shell pwd)/sources/go-piper \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/
-
-backend-assets/grpc/whisper: backend-assets/grpc sources/whisper.cpp/libwhisper.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/whisper.cpp LIBRARY_PATH=$(shell pwd)/sources/whisper.cpp \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/
+backend-assets/grpc/whisper: backend-assets/grpc whisper.cpp/libwhisper.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/whisper.cpp LIBRARY_PATH=$(shell pwd)/whisper.cpp \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./cmd/grpc/whisper/

 grpcs: prepare $(GRPC_BACKENDS)
--- a/README.md
+++ b/README.md
@@ -20,15 +20,17 @@
 </a>
 </p>

-[<img src="https://img.shields.io/badge/dockerhub-images-important.svg?logo=Docker">](https://hub.docker.com/r/localai/localai)
-[<img src="https://img.shields.io/badge/quay.io-images-important.svg?">](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest)
-
 > :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
->
-> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
+> 
+> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/)
+

 [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)

+**LocalAI** is a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families that are compatible with the ggml format, pytorch and more. Does not require GPU.
+
+<p align="center"><b>Follow LocalAI </b></p>
+
 <p align="center">
 <a href="https://twitter.com/LocalAI_API" target="blank">
 <img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
@@ -37,26 +39,47 @@
 <img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
 </a>

-**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that’s compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU.
+<p align="center"><b>Connect with the Creator </b></p>

-## 🔥🔥 Hot topics / Roadmap
+<p align="center">
+<a href="https://twitter.com/mudler_it" target="blank">
+<img src="https://img.shields.io/twitter/follow/mudler_it?label=Follow: mudler_it&style=social" alt="Follow mudler_it"/>
+</a>
+<a href='https://github.com/mudler'>
+<img alt="Follow on Github" src="https://img.shields.io/badge/Follow-mudler-black?logo=github&link=https%3A%2F%2Fgithub.com%2Fmudler">
+</a>
+</p>

-[Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
+<p align="center"><b>Share LocalAI Repository</b></p>

- Start and share models with config file: https://github.com/mudler/LocalAI/pull/1522
- 🐸 Coqui: https://github.com/mudler/LocalAI/pull/1489
- Inline templates: https://github.com/mudler/LocalAI/pull/1452
- Mixtral: https://github.com/mudler/LocalAI/pull/1449
- Img2vid https://github.com/mudler/LocalAI/pull/1442
- Musicgen https://github.com/mudler/LocalAI/pull/1387
+<p align="center">

-Hot topics (looking for contributors):
- Backends v2: https://github.com/mudler/LocalAI/issues/1126
- Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
+<a href="https://twitter.com/intent/tweet?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.&url=https://github.com/go-skynet/LocalAI&hashtags=LocalAI,AI" target="blank">
+<img src="https://img.shields.io/twitter/follow/_LocalAI?label=Share Repo on Twitter&style=social" alt="Follow _LocalAI"/></a> 
+<a href="https://t.me/share/url?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.&url=https://github.com/go-skynet/LocalAI" target="_blank"><img src="https://img.shields.io/twitter/url?label=Telegram&logo=Telegram&style=social&url=https://github.com/go-skynet/LocalAI" alt="Share on Telegram"/></a>
+<a href="https://api.whatsapp.com/send?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.%20https://github.com/go-skynet/LocalAI"><img src="https://img.shields.io/twitter/url?label=whatsapp&logo=whatsapp&style=social&url=https://github.com/go-skynet/LocalAI" /></a> <a href="https://www.reddit.com/submit?url=https://github.com/go-skynet/LocalAI&title=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.
+" target="blank">
+<img src="https://img.shields.io/twitter/url?label=Reddit&logo=Reddit&style=social&url=https://github.com/go-skynet/LocalAI" alt="Share on Reddit"/>
+</a> <a href="mailto:?subject=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.%3A%0Ahttps://github.com/go-skynet/LocalAI" target="_blank"><img src="https://img.shields.io/twitter/url?label=Gmail&logo=Gmail&style=social&url=https://github.com/go-skynet/LocalAI"/></a> <a href="https://www.buymeacoffee.com/mudler" target="_blank"><img src="https://cdn.buymeacoffee.com/buttons/default-orange.png" alt="Buy Me A Coffee" height="23" width="100" style="border-radius:1px"></a>

-If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22
+</p>

-## 💻 [Getting started](https://localai.io/basics/getting_started/index.html)
+<hr>
+
+In a nutshell:
+
+- Local, OpenAI drop-in alternative REST API. You own your data.
+- NO GPU required. NO Internet access is required either
+  - Optional, GPU Acceleration is available in `llama.cpp`-compatible LLMs. See also the [build section](https://localai.io/basics/build/index.html). 
+- Supports multiple models
+- 🏃 Once loaded the first time, it keep models loaded in memory for faster inference
+- ⚡ Doesn't shell-out, but uses C++ bindings for a faster inference and better performance.
+
+LocalAI was created by [Ettore Di Giacinto](https://github.com/mudler/) and is a community-driven project, focused on making the AI accessible to anyone. Any contribution, feedback and PR is welcome! 
+
+Note that this started just as a [fun weekend project](https://localai.io/#backstory) in order to try to create the necessary pieces for a full AI assistant like `ChatGPT`: the community is growing fast and we are working hard to make it better and more stable. If you want to help, please consider contributing (see below)!
+
+## 🔥🔥 [Hot topics / Roadmap](https://localai.io/#-hot-topics--roadmap)

 ## 🚀 [Features](https://localai.io/features/)

@@ -68,41 +91,7 @@ If you want to help and contribute, issues up for grabs: https://github.com/mudl
 - 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
 - ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
 - 🖼️ [Download Models directly from Huggingface ](https://localai.io/models/)
- 🆕 [Vision API](https://localai.io/features/gpt-vision/)

-## 💻 Usage
-
-Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.
-
-### 🔗 Community and integrations
-
-Build and deploy custom containers:
- https://github.com/sozercan/aikit
-
-WebUIs:
- https://github.com/Jirubizu/localai-admin
- https://github.com/go-skynet/LocalAI-frontend
-
-Model galleries
- https://github.com/go-skynet/model-gallery
-
-Other:
- Helm chart https://github.com/go-skynet/helm-charts
- VSCode extension https://github.com/badgooooor/localai-vscode-plugin
- Local Smart assistant https://github.com/mudler/LocalAGI
- Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation
- Discord bot https://github.com/mudler/LocalAGI/tree/main/examples/discord
- Slack bot https://github.com/mudler/LocalAGI/tree/main/examples/slack
- Telegram bot https://github.com/mudler/LocalAI/tree/master/examples/telegram-bot
- Examples: https://github.com/mudler/LocalAI/tree/master/examples/
-
-### 🔗 Resources
-
- 🆕 New! [LLM finetuning guide](https://localai.io/advanced/fine-tuning/)
- [How to build locally](https://localai.io/basics/build/index.html)
- [How to install in Kubernetes](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes)
- [Projects integrating LocalAI](https://localai.io/integrations/)
- [How tos section](https://localai.io/howtos/) (curated by our community)

 ## :book: 🎥 [Media, Blogs, Social](https://localai.io/basics/news/#media-blogs-social)

@@ -111,6 +100,21 @@ Other:
 - [Question Answering on Documents locally with LangChain, LocalAI, Chroma, and GPT4All](https://mudler.pm/posts/localai-question-answering/)
 - [Tutorial to use k8sgpt with LocalAI](https://medium.com/@tyler_97636/k8sgpt-localai-unlock-kubernetes-superpowers-for-free-584790de9b65)

+## 💻 Usage
+
+Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.
+
+### 💡 Example: Use Luna-AI Llama model
+
+See the [documentation](https://localai.io/basics/getting_started)
+
+### 🔗 Resources
+
+- [How to build locally](https://localai.io/basics/build/index.html)
+- [How to install in Kubernetes](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes)
+- [Projects integrating LocalAI](https://localai.io/integrations/)
+- [How tos section](https://localai.io/howtos/) (curated by our community)
+  
 ## Citation

 If you utilize this repository, data in a downstream project, please consider citing it with:
@@ -133,12 +137,12 @@ Support the project by becoming [a backer or sponsor](https://github.com/sponsor

 A huge thank you to our generous sponsors who support this project:

-| ![Spectro Cloud logo_600x600px_transparent bg](https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512) |
+| ![Spectro Cloud logo_600x600px_transparent bg](https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512) | 
 |:-----------------------------------------------:|
-|  [Spectro Cloud](https://www.spectrocloud.com/)  |
+|  [Spectro Cloud](https://www.spectrocloud.com/)  |  
 |  Spectro Cloud kindly supports LocalAI by providing GPU and computing resources to run tests on lamdalabs!  |

-And a huge shout-out to individuals sponsoring the project by donating hardware or backing the project.
+And a huge shout-out to individuals sponsoring the project by donating hardware or backing the project. 

 - [Sponsor list](https://github.com/sponsors/mudler)
 - JDAM00 (donating HW for the CI)
--- a/api/api.go
+++ b/api/api.go
@@ -1,10 +1,8 @@
 package api

 import (
-	"encoding/json"
 	"errors"
 	"fmt"
-	"os"
 	"strings"

 	config "github.com/go-skynet/LocalAI/api/config"
@@ -15,8 +13,6 @@ import (
 	"github.com/go-skynet/LocalAI/internal"
 	"github.com/go-skynet/LocalAI/metrics"
 	"github.com/go-skynet/LocalAI/pkg/assets"
-	"github.com/go-skynet/LocalAI/pkg/model"
-	"github.com/go-skynet/LocalAI/pkg/startup"

 	"github.com/gofiber/fiber/v2"
 	"github.com/gofiber/fiber/v2/middleware/cors"
@@ -37,8 +33,6 @@ func Startup(opts ...options.AppOption) (*options.Option, *config.ConfigLoader,
 	log.Info().Msgf("Starting LocalAI using %d threads, with models path: %s", options.Threads, options.Loader.ModelPath)
 	log.Info().Msgf("LocalAI version: %s", internal.PrintableVersion())

-	startup.PreloadModelsConfigurations(options.Loader.ModelPath, options.ModelsURL...)
-
 	cl := config.NewConfigLoader()
 	if err := cl.LoadConfigs(options.Loader.ModelPath); err != nil {
 		log.Error().Msgf("error loading config files: %s", err.Error())
@@ -50,22 +44,6 @@ func Startup(opts ...options.AppOption) (*options.Option, *config.ConfigLoader,
 		}
 	}

-	if err := cl.Preload(options.Loader.ModelPath); err != nil {
-		log.Error().Msgf("error downloading models: %s", err.Error())
-	}
-
-	if options.PreloadJSONModels != "" {
-		if err := localai.ApplyGalleryFromString(options.Loader.ModelPath, options.PreloadJSONModels, cl, options.Galleries); err != nil {
-			return nil, nil, err
-		}
-	}
-
-	if options.PreloadModelsFromPath != "" {
-		if err := localai.ApplyGalleryFromFile(options.Loader.ModelPath, options.PreloadModelsFromPath, cl, options.Galleries); err != nil {
-			return nil, nil, err
-		}
-	}
-
 	if options.Debug {
 		for _, v := range cl.ListConfigs() {
 			cfg, _ := cl.GetConfig(v)
@@ -82,6 +60,18 @@ func Startup(opts ...options.AppOption) (*options.Option, *config.ConfigLoader,
 		}
 	}

+	if options.PreloadJSONModels != "" {
+		if err := localai.ApplyGalleryFromString(options.Loader.ModelPath, options.PreloadJSONModels, cl, options.Galleries); err != nil {
+			return nil, nil, err
+		}
+	}
+
+	if options.PreloadModelsFromPath != "" {
+		if err := localai.ApplyGalleryFromFile(options.Loader.ModelPath, options.PreloadModelsFromPath, cl, options.Galleries); err != nil {
+			return nil, nil, err
+		}
+	}
+
 	// turn off any process that was started by GRPC if the context is canceled
 	go func() {
 		<-options.Context.Done()
@@ -89,22 +79,6 @@ func Startup(opts ...options.AppOption) (*options.Option, *config.ConfigLoader,
 		options.Loader.StopAllGRPC()
 	}()

-	if options.WatchDog {
-		wd := model.NewWatchDog(
-			options.Loader,
-			options.WatchDogBusyTimeout,
-			options.WatchDogIdleTimeout,
-			options.WatchDogBusy,
-			options.WatchDogIdle)
-		options.Loader.SetWatchDog(wd)
-		go wd.Run()
-		go func() {
-			<-options.Context.Done()
-			log.Debug().Msgf("Context canceled, shutting down")
-			wd.Shutdown()
-		}()
-	}
-
 	return options, cl, nil
 }

@@ -153,46 +127,28 @@ func App(opts ...options.AppOption) (*fiber.App, error) {

 	// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
 	auth := func(c *fiber.Ctx) error {
-		if len(options.ApiKeys) == 0 {
-			return c.Next()
-		}
-
-		// Check for api_keys.json file
-		fileContent, err := os.ReadFile("api_keys.json")
-		if err == nil {
-			// Parse JSON content from the file
-			var fileKeys []string
-			err := json.Unmarshal(fileContent, &fileKeys)
-			if err != nil {
-				return c.Status(fiber.StatusInternalServerError).JSON(fiber.Map{"message": "Error parsing api_keys.json"})
+		if len(options.ApiKeys) > 0 {
+			authHeader := c.Get("Authorization")
+			if authHeader == "" {
+				return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Authorization header missing"})
+			}
+			authHeaderParts := strings.Split(authHeader, " ")
+			if len(authHeaderParts) != 2 || authHeaderParts[0] != "Bearer" {
+				return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid Authorization header format"})
 			}

-			// Add file keys to options.ApiKeys
-			options.ApiKeys = append(options.ApiKeys, fileKeys...)
-		}
-
-		if len(options.ApiKeys) == 0 {
-			return c.Next()
-		}
-
-		authHeader := c.Get("Authorization")
-		if authHeader == "" {
-			return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Authorization header missing"})
-		}
-		authHeaderParts := strings.Split(authHeader, " ")
-		if len(authHeaderParts) != 2 || authHeaderParts[0] != "Bearer" {
-			return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid Authorization header format"})
-		}
-
-		apiKey := authHeaderParts[1]
-		for _, key := range options.ApiKeys {
-			if apiKey == key {
-				return c.Next()
+			apiKey := authHeaderParts[1]
+			validApiKey := false
+			for _, key := range options.ApiKeys {
+				if apiKey == key {
+					validApiKey = true
+				}
+			}
+			if !validApiKey {
+				return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid API key"})
 			}
 		}
-
-		return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid API key"})
-
+		return c.Next()
 	}

 	if options.CORS {
--- a/api/api_test.go
+++ b/api/api_test.go
@@ -16,9 +16,9 @@ import (
 	. "github.com/go-skynet/LocalAI/api"
 	"github.com/go-skynet/LocalAI/api/options"
 	"github.com/go-skynet/LocalAI/metrics"
-	"github.com/go-skynet/LocalAI/pkg/downloader"
 	"github.com/go-skynet/LocalAI/pkg/gallery"
 	"github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/go-skynet/LocalAI/pkg/utils"
 	"github.com/gofiber/fiber/v2"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
@@ -61,7 +61,7 @@ func getModelStatus(url string) (response map[string]interface{}) {
 }

 func getModels(url string) (response []gallery.GalleryModel) {
-	downloader.GetURI(url, func(url string, i []byte) error {
+	utils.GetURI(url, func(url string, i []byte) error {
 		// Unmarshal YAML data into a struct
 		return json.Unmarshal(i, &response)
 	})
@@ -294,14 +294,14 @@ var _ = Describe("API test", func() {
 				Expect(content["backend"]).To(Equal("bert-embeddings"))
 			})

-			It("runs openllama(llama-ggml backend)", Label("llama"), func() {
+			It("runs openllama", Label("llama"), func() {
 				if runtime.GOOS != "linux" {
 					Skip("test supported only on linux")
 				}
 				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
 					URL:       "github:go-skynet/model-gallery/openllama_3b.yaml",
 					Name:      "openllama_3b",
-					Overrides: map[string]interface{}{"backend": "llama-ggml", "mmap": true, "f16": true, "context_size": 128},
+					Overrides: map[string]interface{}{"backend": "llama-stable", "mmap": true, "f16": true, "context_size": 128},
 				})

 				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
@@ -362,10 +362,9 @@ var _ = Describe("API test", func() {
 				Expect(res["location"]).To(Equal("San Francisco, California, United States"), fmt.Sprint(res))
 				Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
 				Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
-
 			})

-			It("runs openllama gguf(llama-cpp)", Label("llama-gguf"), func() {
+			It("runs openllama gguf", Label("llama-gguf"), func() {
 				if runtime.GOOS != "linux" {
 					Skip("test supported only on linux")
 				}
@@ -705,7 +704,7 @@ var _ = Describe("API test", func() {
 		})

 		Context("External gRPC calls", func() {
-			It("calculate embeddings with sentencetransformers", func() {
+			It("calculate embeddings with huggingface", func() {
 				if runtime.GOOS != "linux" {
 					Skip("test supported only on linux")
 				}
--- a/api/backend/image.go
+++ b/api/backend/image.go
@@ -16,7 +16,7 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
 		model.WithContext(o.Context),
 		model.WithModel(c.Model),
 		model.WithLoadGRPCLoadModelOpts(&proto.ModelOptions{
-			CUDA:          c.CUDA || c.Diffusers.CUDA,
+			CUDA:          c.Diffusers.CUDA,
 			SchedulerType: c.Diffusers.SchedulerType,
 			PipelineType:  c.Diffusers.PipelineType,
 			CFGScale:      c.Diffusers.CFGScale,
@@ -27,7 +27,6 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
 			CLIPModel:     c.Diffusers.ClipModel,
 			CLIPSubfolder: c.Diffusers.ClipSubFolder,
 			CLIPSkip:      int32(c.Diffusers.ClipSkip),
-			ControlNet:    c.Diffusers.ControlNet,
 		}),
 	})

--- a/api/backend/llm.go
+++ b/api/backend/llm.go
@@ -159,9 +159,6 @@ func Finetune(config config.Config, input, prediction string) string {
 	for _, c := range config.TrimSpace {
 		prediction = strings.TrimSpace(strings.TrimPrefix(prediction, c))
 	}
-
-	for _, c := range config.TrimSuffix {
-		prediction = strings.TrimSpace(strings.TrimSuffix(prediction, c))
-	}
 	return prediction
+
 }
--- a/api/backend/options.go
+++ b/api/backend/options.go
@@ -16,10 +16,6 @@ func modelOpts(c config.Config, o *options.Option, opts []model.Option) []model.
 		opts = append(opts, model.WithSingleActiveBackend())
 	}

-	if o.ParallelBackendRequests {
-		opts = append(opts, model.EnableParallelRequests)
-	}
-
 	if c.GRPC.Attempts != 0 {
 		opts = append(opts, model.WithGRPCAttempts(c.GRPC.Attempts))
 	}
@@ -46,7 +42,6 @@ func gRPCModelOpts(c config.Config) *pb.ModelOptions {
 		Seed:           int32(c.Seed),
 		NBatch:         int32(b),
 		NoMulMatQ:      c.NoMulMatQ,
-		CUDA:           c.CUDA, // diffusers, transformers
 		DraftModel:     c.DraftModel,
 		AudioPath:      c.VallE.AudioPath,
 		Quantization:   c.Quantization,
--- a/api/backend/tts.go
+++ b/api/backend/tts.go
@@ -59,13 +59,9 @@ func ModelTTS(backend, text, modelFile string, loader *model.ModelLoader, o *opt
 	// If the model file is not empty, we pass it joined with the model path
 	modelPath := ""
 	if modelFile != "" {
-		if bb != model.TransformersMusicGen {
-			modelPath = filepath.Join(o.Loader.ModelPath, modelFile)
-			if err := utils.VerifyPath(modelPath, o.Loader.ModelPath); err != nil {
-				return "", nil, err
-			}
-		} else {
-			modelPath = modelFile
+		modelPath = filepath.Join(o.Loader.ModelPath, modelFile)
+		if err := utils.VerifyPath(modelPath, o.Loader.ModelPath); err != nil {
+			return "", nil, err
 		}
 	}

--- a/api/config/config.go
+++ b/api/config/config.go
@@ -1,7 +1,6 @@
 package api_config

 import (
-	"errors"
 	"fmt"
 	"io/fs"
 	"os"
@@ -9,9 +8,6 @@ import (
 	"strings"
 	"sync"

-	"github.com/go-skynet/LocalAI/pkg/downloader"
-	"github.com/go-skynet/LocalAI/pkg/utils"
-	"github.com/rs/zerolog/log"
 	"gopkg.in/yaml.v3"
 )

@@ -42,28 +38,14 @@ type Config struct {

 	// Diffusers
 	Diffusers Diffusers `yaml:"diffusers"`
-	Step      int       `yaml:"step"`
+
+	Step int `yaml:"step"`

 	// GRPC Options
 	GRPC GRPC `yaml:"grpc"`

 	// Vall-e-x
 	VallE VallE `yaml:"vall-e"`
-
-	// CUDA
-	// Explicitly enable CUDA or not (some backends might need it)
-	CUDA bool `yaml:"cuda"`
-
-	DownloadFiles []File `yaml:"download_files"`
-
-	Description string `yaml:"description"`
-	Usage       string `yaml:"usage"`
-}
-
-type File struct {
-	Filename string `yaml:"filename" json:"filename"`
-	SHA256   string `yaml:"sha256" json:"sha256"`
-	URI      string `yaml:"uri" json:"uri"`
 }

 type VallE struct {
@@ -83,16 +65,15 @@ type GRPC struct {
 }

 type Diffusers struct {
-	CUDA             bool    `yaml:"cuda"`
 	PipelineType     string  `yaml:"pipeline_type"`
 	SchedulerType    string  `yaml:"scheduler_type"`
+	CUDA             bool    `yaml:"cuda"`
 	EnableParameters string  `yaml:"enable_parameters"` // A list of comma separated parameters to specify
 	CFGScale         float32 `yaml:"cfg_scale"`         // Classifier-Free Guidance Scale
 	IMG2IMG          bool    `yaml:"img2img"`           // Image to Image Diffuser
 	ClipSkip         int     `yaml:"clip_skip"`         // Skip every N frames
 	ClipModel        string  `yaml:"clip_model"`        // Clip model to use
 	ClipSubFolder    string  `yaml:"clip_subfolder"`    // Subfolder to use for clip model
-	ControlNet       string  `yaml:"control_net"`
 }

 type LLMConfig struct {
@@ -115,18 +96,16 @@ type LLMConfig struct {
 	StopWords       []string `yaml:"stopwords"`
 	Cutstrings      []string `yaml:"cutstrings"`
 	TrimSpace       []string `yaml:"trimspace"`
-	TrimSuffix      []string `yaml:"trimsuffix"`
-
-	ContextSize  int     `yaml:"context_size"`
-	NUMA         bool    `yaml:"numa"`
-	LoraAdapter  string  `yaml:"lora_adapter"`
-	LoraBase     string  `yaml:"lora_base"`
-	LoraScale    float32 `yaml:"lora_scale"`
-	NoMulMatQ    bool    `yaml:"no_mulmatq"`
-	DraftModel   string  `yaml:"draft_model"`
-	NDraft       int32   `yaml:"n_draft"`
-	Quantization string  `yaml:"quantization"`
-	MMProj       string  `yaml:"mmproj"`
+	ContextSize     int      `yaml:"context_size"`
+	NUMA            bool     `yaml:"numa"`
+	LoraAdapter     string   `yaml:"lora_adapter"`
+	LoraBase        string   `yaml:"lora_base"`
+	LoraScale       float32  `yaml:"lora_scale"`
+	NoMulMatQ       bool     `yaml:"no_mulmatq"`
+	DraftModel      string   `yaml:"draft_model"`
+	NDraft          int32    `yaml:"n_draft"`
+	Quantization    string   `yaml:"quantization"`
+	MMProj          string   `yaml:"mmproj"`

 	RopeScaling    string  `yaml:"rope_scaling"`
 	YarnExtFactor  float32 `yaml:"yarn_ext_factor"`
@@ -281,67 +260,6 @@ func (cm *ConfigLoader) ListConfigs() []string {
 	return res
 }

-// Preload prepare models if they are not local but url or huggingface repositories
-func (cm *ConfigLoader) Preload(modelPath string) error {
-	cm.Lock()
-	defer cm.Unlock()
-
-	status := func(fileName, current, total string, percent float64) {
-		utils.DisplayDownloadFunction(fileName, current, total, percent)
-	}
-
-	log.Info().Msgf("Preloading models from %s", modelPath)
-
-	for i, config := range cm.configs {
-
-		// Download files and verify their SHA
-		for _, file := range config.DownloadFiles {
-			log.Debug().Msgf("Checking %q exists and matches SHA", file.Filename)
-
-			if err := utils.VerifyPath(file.Filename, modelPath); err != nil {
-				return err
-			}
-			// Create file path
-			filePath := filepath.Join(modelPath, file.Filename)
-
-			if err := downloader.DownloadFile(file.URI, filePath, file.SHA256, status); err != nil {
-				return err
-			}
-		}
-
-		modelURL := config.PredictionOptions.Model
-		modelURL = downloader.ConvertURL(modelURL)
-
-		if downloader.LooksLikeURL(modelURL) {
-			// md5 of model name
-			md5Name := utils.MD5(modelURL)
-
-			// check if file exists
-			if _, err := os.Stat(filepath.Join(modelPath, md5Name)); errors.Is(err, os.ErrNotExist) {
-				err := downloader.DownloadFile(modelURL, filepath.Join(modelPath, md5Name), "", status)
-				if err != nil {
-					return err
-				}
-			}
-
-			cc := cm.configs[i]
-			c := &cc
-			c.PredictionOptions.Model = md5Name
-			cm.configs[i] = *c
-		}
-		if cm.configs[i].Name != "" {
-			log.Info().Msgf("Model name: %s", cm.configs[i].Name)
-		}
-		if cm.configs[i].Description != "" {
-			log.Info().Msgf("Model description: %s", cm.configs[i].Description)
-		}
-		if cm.configs[i].Usage != "" {
-			log.Info().Msgf("Model usage: \n%s", cm.configs[i].Usage)
-		}
-	}
-	return nil
-}
-
 func (cm *ConfigLoader) LoadConfigs(path string) error {
 	cm.Lock()
 	defer cm.Unlock()
@@ -359,7 +277,7 @@ func (cm *ConfigLoader) LoadConfigs(path string) error {
 	}
 	for _, file := range files {
 		// Skip templates, YAML and .keep files
-		if !strings.Contains(file.Name(), ".yaml") && !strings.Contains(file.Name(), ".yml") {
+		if !strings.Contains(file.Name(), ".yaml") {
 			continue
 		}
 		c, err := ReadConfig(filepath.Join(path, file.Name()))
--- a/api/localai/backend_monitor.go
+++ b/api/localai/backend_monitor.go
@@ -123,12 +123,13 @@ func BackendMonitorEndpoint(bm BackendMonitor) func(c *fiber.Ctx) error {
 			return err
 		}

-		model := bm.options.Loader.CheckIsLoaded(backendId)
-		if model == "" {
+		client := bm.options.Loader.CheckIsLoaded(backendId)
+
+		if client == nil {
 			return fmt.Errorf("backend %s is not currently loaded", backendId)
 		}

-		status, rpcErr := model.GRPC(false, nil).Status(context.TODO())
+		status, rpcErr := client.Status(context.TODO())
 		if rpcErr != nil {
 			log.Warn().Msgf("backend %s experienced an error retrieving status info: %s", backendId, rpcErr.Error())
 			val, slbErr := bm.SampleLocalBackendProcess(backendId)
--- a/api/localai/gallery.go
+++ b/api/localai/gallery.go
@@ -130,12 +130,6 @@ func (g *galleryApplier) Start(c context.Context, cm *config.ConfigLoader) {
 					continue
 				}

-				err = cm.Preload(g.modelPath)
-				if err != nil {
-					updateError(err)
-					continue
-				}
-
 				g.updateStatus(op.id, &galleryOpStatus{Processed: true, Message: "completed", Progress: 100})
 			}
 		}
--- a/api/openai/chat.go
+++ b/api/openai/chat.go
@@ -81,7 +81,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			noActionDescription = config.FunctionsConfig.NoActionDescriptionName
 		}

-		if input.ResponseFormat.Type == "json_object" {
+		if input.ResponseFormat == "json_object" {
 			input.Grammar = grammar.JSONBNF
 		}

@@ -219,12 +219,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			c.Set("Transfer-Encoding", "chunked")
 		}

-		templateFile := ""
-
-		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-		if o.Loader.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) {
-			templateFile = config.Model
-		}
+		templateFile := config.Model

 		if config.TemplateConfig.Chat != "" && !processFunctions {
 			templateFile = config.TemplateConfig.Chat
@@ -234,19 +229,18 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			templateFile = config.TemplateConfig.Functions
 		}

-		if templateFile != "" {
-			templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.ChatPromptTemplate, templateFile, model.PromptTemplateData{
-				SystemPrompt:         config.SystemPrompt,
-				SuppressSystemPrompt: suppressConfigSystemPrompt,
-				Input:                predInput,
-				Functions:            funcs,
-			})
-			if err == nil {
-				predInput = templatedInput
-				log.Debug().Msgf("Template found, input modified to: %s", predInput)
-			} else {
-				log.Debug().Msgf("Template failed loading: %s", err.Error())
-			}
+		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
+		templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.ChatPromptTemplate, templateFile, model.PromptTemplateData{
+			SystemPrompt:         config.SystemPrompt,
+			SuppressSystemPrompt: suppressConfigSystemPrompt,
+			Input:                predInput,
+			Functions:            funcs,
+		})
+		if err == nil {
+			predInput = templatedInput
+			log.Debug().Msgf("Template found, input modified to: %s", predInput)
+		} else {
+			log.Debug().Msgf("Template failed loading: %s", err.Error())
 		}

 		log.Debug().Msgf("Prompt (after templating): %s", predInput)
--- a/api/openai/completion.go
+++ b/api/openai/completion.go
@@ -65,7 +65,7 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}

-		if input.ResponseFormat.Type == "json_object" {
+		if input.ResponseFormat == "json_object" {
 			input.Grammar = grammar.JSONBNF
 		}

@@ -81,12 +81,7 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 			c.Set("Transfer-Encoding", "chunked")
 		}

-		templateFile := ""
-
-		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-		if o.Loader.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) {
-			templateFile = config.Model
-		}
+		templateFile := config.Model

 		if config.TemplateConfig.Completion != "" {
 			templateFile = config.TemplateConfig.Completion
@@ -99,14 +94,13 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe

 			predInput := config.PromptStrings[0]

-			if templateFile != "" {
-				templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
-					Input: predInput,
-				})
-				if err == nil {
-					predInput = templatedInput
-					log.Debug().Msgf("Template found, input modified to: %s", predInput)
-				}
+			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
+			templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
+				Input: predInput,
+			})
+			if err == nil {
+				predInput = templatedInput
+				log.Debug().Msgf("Template found, input modified to: %s", predInput)
 			}

 			responses := make(chan schema.OpenAIResponse)
@@ -151,16 +145,14 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 		totalTokenUsage := backend.TokenUsage{}

 		for k, i := range config.PromptStrings {
-			if templateFile != "" {
-				// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-				templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
-					SystemPrompt: config.SystemPrompt,
-					Input:        i,
-				})
-				if err == nil {
-					i = templatedInput
-					log.Debug().Msgf("Template found, input modified to: %s", i)
-				}
+			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
+			templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
+				SystemPrompt: config.SystemPrompt,
+				Input:        i,
+			})
+			if err == nil {
+				i = templatedInput
+				log.Debug().Msgf("Template found, input modified to: %s", i)
 			}

 			r, tokenUsage, err := ComputeChoices(
--- a/api/openai/edit.go
+++ b/api/openai/edit.go
@@ -30,12 +30,7 @@ func EditEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)

 		log.Debug().Msgf("Parameter Config: %+v", config)

-		templateFile := ""
-
-		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-		if o.Loader.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) {
-			templateFile = config.Model
-		}
+		templateFile := config.Model

 		if config.TemplateConfig.Edit != "" {
 			templateFile = config.TemplateConfig.Edit
@@ -45,16 +40,15 @@ func EditEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 		totalTokenUsage := backend.TokenUsage{}

 		for _, i := range config.InputStrings {
-			if templateFile != "" {
-				templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.EditPromptTemplate, templateFile, model.PromptTemplateData{
-					Input:        i,
-					Instruction:  input.Instruction,
-					SystemPrompt: config.SystemPrompt,
-				})
-				if err == nil {
-					i = templatedInput
-					log.Debug().Msgf("Template found, input modified to: %s", i)
-				}
+			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
+			templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.EditPromptTemplate, templateFile, model.PromptTemplateData{
+				Input:        i,
+				Instruction:  input.Instruction,
+				SystemPrompt: config.SystemPrompt,
+			})
+			if err == nil {
+				i = templatedInput
+				log.Debug().Msgf("Template found, input modified to: %s", i)
 			}

 			r, tokenUsage, err := ComputeChoices(input, i, config, o, o.Loader, func(s string, c *[]schema.Choice) {
--- a/api/openai/image.go
+++ b/api/openai/image.go
@@ -5,8 +5,6 @@ import (
 	"encoding/base64"
 	"encoding/json"
 	"fmt"
-	"io"
-	"net/http"
 	"os"
 	"path/filepath"
 	"strconv"
@@ -24,26 +22,6 @@ import (
 	"github.com/rs/zerolog/log"
 )

-func downloadFile(url string) (string, error) {
-	// Get the data
-	resp, err := http.Get(url)
-	if err != nil {
-		return "", err
-	}
-	defer resp.Body.Close()
-
-	// Create the file
-	out, err := os.CreateTemp("", "image")
-	if err != nil {
-		return "", err
-	}
-	defer out.Close()
-
-	// Write the body to file
-	_, err = io.Copy(out, resp.Body)
-	return out.Name(), err
-}
-
 // https://platform.openai.com/docs/api-reference/images/create

 /*
@@ -78,31 +56,12 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx

 		src := ""
 		if input.File != "" {
-
-			fileData := []byte{}
-			// check if input.File is an URL, if so download it and save it
-			// to a temporary file
-			if strings.HasPrefix(input.File, "http://") || strings.HasPrefix(input.File, "https://") {
-				out, err := downloadFile(input.File)
-				if err != nil {
-					return fmt.Errorf("failed downloading file:%w", err)
-				}
-				defer os.RemoveAll(out)
-
-				fileData, err = os.ReadFile(out)
-				if err != nil {
-					return fmt.Errorf("failed reading file:%w", err)
-				}
-
-			} else {
-				// base 64 decode the file and write it somewhere
-				// that we will cleanup
-				fileData, err = base64.StdEncoding.DecodeString(input.File)
-				if err != nil {
-					return err
-				}
+			//base 64 decode the file and write it somewhere
+			// that we will cleanup
+			decoded, err := base64.StdEncoding.DecodeString(input.File)
+			if err != nil {
+				return err
 			}
-
 			// Create a temporary file
 			outputFile, err := os.CreateTemp(o.ImageDir, "b64")
 			if err != nil {
@@ -110,7 +69,7 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
 			}
 			// write the base64 result
 			writer := bufio.NewWriter(outputFile)
-			_, err = writer.Write(fileData)
+			_, err = writer.Write(decoded)
 			if err != nil {
 				outputFile.Close()
 				return err
@@ -122,12 +81,8 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx

 		log.Debug().Msgf("Parameter Config: %+v", config)

-		switch config.Backend {
-		case "stablediffusion":
-			config.Backend = model.StableDiffusionBackend
-		case "tinydream":
-			config.Backend = model.TinyDreamBackend
-		case "":
+		// XXX: Only stablediffusion is supported for now
+		if config.Backend == "" {
 			config.Backend = model.StableDiffusionBackend
 		}

@@ -145,7 +100,7 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
 		}

 		b64JSON := false
-		if input.ResponseFormat.Type == "b64_json" {
+		if input.ResponseFormat == "b64_json" {
 			b64JSON = true
 		}
 		// src and clip_skip
--- a/api/options/options.go
+++ b/api/options/options.go
@@ -4,11 +4,10 @@ import (
 	"context"
 	"embed"
 	"encoding/json"
-	"time"

-	"github.com/go-skynet/LocalAI/metrics"
 	"github.com/go-skynet/LocalAI/pkg/gallery"
 	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/go-skynet/LocalAI/metrics"
 	"github.com/rs/zerolog/log"
 )

@@ -37,16 +36,7 @@ type Option struct {

 	AutoloadGalleries bool

-	SingleBackend           bool
-	ParallelBackendRequests bool
-
-	WatchDogIdle bool
-	WatchDogBusy bool
-	WatchDog     bool
-
-	ModelsURL []string
-
-	WatchDogBusyTimeout, WatchDogIdleTimeout time.Duration
+	SingleBackend bool
 }

 type AppOption func(*Option)
@@ -66,52 +56,16 @@ func NewOptions(o ...AppOption) *Option {
 	return opt
 }

-func WithModelsURL(urls ...string) AppOption {
-	return func(o *Option) {
-		o.ModelsURL = urls
-	}
-}
-
 func WithCors(b bool) AppOption {
 	return func(o *Option) {
 		o.CORS = b
 	}
 }

-var EnableWatchDog = func(o *Option) {
-	o.WatchDog = true
-}
-
-var EnableWatchDogIdleCheck = func(o *Option) {
-	o.WatchDog = true
-	o.WatchDogIdle = true
-}
-
-var EnableWatchDogBusyCheck = func(o *Option) {
-	o.WatchDog = true
-	o.WatchDogBusy = true
-}
-
-func SetWatchDogBusyTimeout(t time.Duration) AppOption {
-	return func(o *Option) {
-		o.WatchDogBusyTimeout = t
-	}
-}
-
-func SetWatchDogIdleTimeout(t time.Duration) AppOption {
-	return func(o *Option) {
-		o.WatchDogIdleTimeout = t
-	}
-}
-
 var EnableSingleBackend = func(o *Option) {
 	o.SingleBackend = true
 }

-var EnableParallelBackendRequests = func(o *Option) {
-	o.ParallelBackendRequests = true
-}
-
 var EnableGalleriesAutoload = func(o *Option) {
 	o.AutoloadGalleries = true
 }
--- a/api/schema/openai.go
+++ b/api/schema/openai.go
@@ -83,12 +83,6 @@ type OpenAIModel struct {
 	Object string `json:"object"`
 }

-type ChatCompletionResponseFormatType string
-
-type ChatCompletionResponseFormat struct {
-	Type ChatCompletionResponseFormatType `json:"type,omitempty"`
-}
-
 type OpenAIRequest struct {
 	config.PredictionOptions

@@ -98,7 +92,7 @@ type OpenAIRequest struct {
 	// whisper
 	File string `json:"file" validate:"required"`
 	//whisper/image
-	ResponseFormat ChatCompletionResponseFormat `json:"response_format"`
+	ResponseFormat string `json:"response_format"`
 	// image
 	Size string `json:"size"`
 	// Prompt is read only by completion/image API calls
--- a/backend/cpp/llama/CMakeLists.txt
+++ b/backend/cpp/llama/CMakeLists.txt
@@ -17,17 +17,9 @@ cmake_minimum_required(VERSION 3.15)
 set(TARGET grpc-server)
 set(_PROTOBUF_LIBPROTOBUF libprotobuf)
 set(_REFLECTION grpc++_reflection)
-
 if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
-    # Set correct Homebrew install folder for Apple Silicon and Intel Macs
-    if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "arm64")
-        set(HOMEBREW_DEFAULT_PREFIX "/opt/homebrew")
-    else()
-        set(HOMEBREW_DEFAULT_PREFIX "/usr/local")
-    endif()
-
-    link_directories("${HOMEBREW_DEFAULT_PREFIX}/lib")
-    include_directories("${HOMEBREW_DEFAULT_PREFIX}/include")
+    link_directories("/opt/homebrew/lib")
+    include_directories("/opt/homebrew/include")
 endif()

 find_package(absl CONFIG REQUIRED)
@@ -44,7 +36,7 @@ include_directories(${Protobuf_INCLUDE_DIRS})
 message(STATUS "Using protobuf version ${Protobuf_VERSION} | Protobuf_INCLUDE_DIRS: ${Protobuf_INCLUDE_DIRS} | CMAKE_CURRENT_BINARY_DIR: ${CMAKE_CURRENT_BINARY_DIR}")

 # Proto file
-get_filename_component(hw_proto "../../../../../../backend/backend.proto" ABSOLUTE)
+get_filename_component(hw_proto "../../../../../../pkg/grpc/proto/backend.proto" ABSOLUTE)
 get_filename_component(hw_proto_path "${hw_proto}" PATH)

 # Generated sources
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=
+LLAMA_VERSION?=d9b33fe95bd257b36c84ee5769cc048230067d6f

 CMAKE_ARGS?=
 BUILD_TYPE?=
@@ -21,9 +21,6 @@ endif

 llama.cpp:
 	git clone --recurse-submodules https://github.com/ggerganov/llama.cpp llama.cpp
-	if [ -z "$(LLAMA_VERSION)" ]; then \
-		exit 1; \
-	fi
 	cd llama.cpp && git checkout -b build $(LLAMA_VERSION) && git submodule update --init --recursive --depth 1

 llama.cpp/examples/grpc-server:
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
--- a/backend/go/image/tinydream/tinydream.go
+++ b/backend/go/image/tinydream/tinydream.go
@@ -1,32 +0,0 @@
-package main
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	"github.com/go-skynet/LocalAI/pkg/tinydream"
-)
-
-type Image struct {
-	base.SingleThread
-	tinydream *tinydream.TinyDream
-}
-
-func (image *Image) Load(opts *pb.ModelOptions) error {
-	var err error
-	// Note: the Model here is a path to a directory containing the model files
-	image.tinydream, err = tinydream.New(opts.ModelFile)
-	return err
-}
-
-func (image *Image) GenerateImage(opts *pb.GenerateImageRequest) error {
-	return image.tinydream.GenerateImage(
-		int(opts.Height),
-		int(opts.Width),
-		int(opts.Step),
-		int(opts.Seed),
-		opts.PositivePrompt,
-		opts.NegativePrompt,
-		opts.Dst)
-}
--- a/backend/go/llm/langchain/main.go
+++ b/backend/go/llm/langchain/main.go
@@ -1,21 +0,0 @@
-package main
-
-// Note: this is started internally by LocalAI and a server is allocated for each model
-
-import (
-	"flag"
-
-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-func main() {
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &LLM{}); err != nil {
-		panic(err)
-	}
-}
--- a/backend/go/llm/rwkv/main.go
+++ b/backend/go/llm/rwkv/main.go
@@ -1,21 +0,0 @@
-package main
-
-// Note: this is started internally by LocalAI and a server is allocated for each model
-
-import (
-	"flag"
-
-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-func main() {
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &LLM{}); err != nil {
-		panic(err)
-	}
-}
--- a/backend/go/transcribe/main.go
+++ b/backend/go/transcribe/main.go
@@ -1,21 +0,0 @@
-package main
-
-// Note: this is started internally by LocalAI and a server is allocated for each model
-
-import (
-	"flag"
-
-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-func main() {
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &Whisper{}); err != nil {
-		panic(err)
-	}
-}
--- a/backend/go/tts/main.go
+++ b/backend/go/tts/main.go
@@ -1,21 +0,0 @@
-package main
-
-// Note: this is started internally by LocalAI and a server is allocated for each model
-
-import (
-	"flag"
-
-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-func main() {
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &Piper{}); err != nil {
-		panic(err)
-	}
-}
--- a/backend/python/autogptq/Makefile
+++ b/backend/python/autogptq/Makefile
@@ -1,4 +0,0 @@
-.PHONY: autogptq
-autogptq:
-	$(MAKE) -C ../common-env/transformers
-
--- a/backend/python/autogptq/backend_pb2.py
+++ b/backend/python/autogptq/backend_pb2.py
--- a/backend/python/bark/Makefile
+++ b/backend/python/bark/Makefile
@@ -1,15 +0,0 @@
-.PHONY: ttsbark
-ttsbark:
-	$(MAKE) -C ../common-env/transformers
-
-.PHONY: run
-run:
-	@echo "Running bark..."
-	bash run.sh
-	@echo "bark run."
-
-.PHONY: test
-test:
-	@echo "Testing bark..."
-	bash test.sh
-	@echo "bark tested."
--- a/backend/python/bark/backend_pb2.py
+++ b/backend/python/bark/backend_pb2.py
--- a/backend/python/bark/test.py
+++ b/backend/python/bark/test.py
@@ -1,81 +0,0 @@
-"""
-A test script to test the gRPC service
-"""
-import unittest
-import subprocess
-import time
-import backend_pb2
-import backend_pb2_grpc
-
-import grpc
-
-
-class TestBackendServicer(unittest.TestCase):
-    """
-    TestBackendServicer is the class that tests the gRPC service
-    """
-    def setUp(self):
-        """
-        This method sets up the gRPC service by starting the server
-        """
-        self.service = subprocess.Popen(["python3", "ttsbark.py", "--addr", "localhost:50051"])
-        time.sleep(10)
-
-    def tearDown(self) -> None:
-        """
-        This method tears down the gRPC service by terminating the server
-        """
-        self.service.terminate()
-        self.service.wait()
-
-    def test_server_startup(self):
-        """
-        This method tests if the server starts up successfully
-        """
-        try:
-            self.setUp()
-            with grpc.insecure_channel("localhost:50051") as channel:
-                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.Health(backend_pb2.HealthMessage())
-                self.assertEqual(response.message, b'OK')
-        except Exception as err:
-            print(err)
-            self.fail("Server failed to start")
-        finally:
-            self.tearDown()
-
-    def test_load_model(self):
-        """
-        This method tests if the model is loaded successfully
-        """
-        try:
-            self.setUp()
-            with grpc.insecure_channel("localhost:50051") as channel:
-                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="v2/en_speaker_4"))
-                self.assertTrue(response.success)
-                self.assertEqual(response.message, "Model loaded successfully")
-        except Exception as err:
-            print(err)
-            self.fail("LoadModel service failed")
-        finally:
-            self.tearDown()
-
-    def test_tts(self):
-        """
-        This method tests if the embeddings are generated successfully
-        """
-        try:
-            self.setUp()
-            with grpc.insecure_channel("localhost:50051") as channel:
-                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="v2/en_speaker_4"))
-                self.assertTrue(response.success)
-                tts_request = backend_pb2.TTSRequest(text="80s TV news production music hit for tonight's biggest story")
-                tts_response = stub.TTS(tts_request)
-                self.assertIsNotNone(tts_response)
-        except Exception as err:
-            print(err)
-            self.fail("TTS service failed")
-        finally:
-            self.tearDown()
--- a/backend/python/bark/test.sh
+++ b/backend/python/bark/test.sh
@@ -1,11 +0,0 @@
-#!/bin/bash
-##
-## A bash script wrapper that runs the bark server with conda
-
-# Activate conda environment
-source activate transformers
-
-# get the directory where the bash script is located
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-python -m unittest $DIR/test.py
--- a/backend/python/common-env/transformers/Makefile
+++ b/backend/python/common-env/transformers/Makefile
@@ -1,10 +0,0 @@
-CONDA_ENV_PATH = "transformers.yml"
-
-ifeq ($(BUILD_TYPE), cublas)
-	CONDA_ENV_PATH = "transformers-nvidia.yml"
-endif
-
-.PHONY: transformers
-transformers:
-	@echo "Installing $(CONDA_ENV_PATH)..."
-	bash install.sh $(CONDA_ENV_PATH)
--- a/backend/python/common-env/transformers/install.sh
+++ b/backend/python/common-env/transformers/install.sh
@@ -1,24 +0,0 @@
-#!/bin/bash
-set -ex
-
-# Check if environment exist
-conda_env_exists(){
-    ! conda list --name "${@}" >/dev/null 2>/dev/null
-}
-
-if conda_env_exists "transformers" ; then
-    echo "Creating virtual environment..."
-    conda env create --name transformers --file $1
-    echo "Virtual environment created."
-else 
-    echo "Virtual environment already exists."
-fi
-
-if [ "$PIP_CACHE_PURGE" = true ] ; then
-    export PATH=$PATH:/opt/conda/bin
-
-    # Activate conda environment
-    source activate transformers
-
-    pip cache purge
-fi
--- a/backend/python/coqui/Makefile
+++ b/backend/python/coqui/Makefile
@@ -1,15 +0,0 @@
-.PHONY: coqui
-coqui:
-	$(MAKE) -C ../common-env/transformers
-
-.PHONY: run
-run:
-	@echo "Running coqui..."
-	bash run.sh
-	@echo "coqui run."
-
-.PHONY: test
-test:
-	@echo "Testing coqui..."
-	bash test.sh
-	@echo "coqui tested."
--- a/backend/python/coqui/README.md
+++ b/backend/python/coqui/README.md
@@ -1,11 +0,0 @@
-# Creating a separate environment for ttsbark project
-
-```
-make coqui
-```
-
-# Testing the gRPC server
-
-```
-make test
-```
--- a/backend/python/coqui/backend_pb2.py
+++ b/backend/python/coqui/backend_pb2.py
--- a/backend/python/coqui/coqui_server.py
+++ b/backend/python/coqui/coqui_server.py
@@ -1,98 +0,0 @@
-#!/usr/bin/env python3
-"""
-This is an extra gRPC server of LocalAI for Bark TTS
-"""
-from concurrent import futures
-import time
-import argparse
-import signal
-import sys
-import os
-import backend_pb2
-import backend_pb2_grpc
-
-import torch
-from TTS.api import TTS
-
-import grpc
-
-
-_ONE_DAY_IN_SECONDS = 60 * 60 * 24
-
-# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
-MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
-COQUI_LANGUAGE = os.environ.get('COQUI_LANGUAGE', None)
-
-# Implement the BackendServicer class with the service methods
-class BackendServicer(backend_pb2_grpc.BackendServicer):
-    """
-    BackendServicer is the class that implements the gRPC service
-    """
-    def Health(self, request, context):
-        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
-    def LoadModel(self, request, context):
-
-        # Get device
-        device = "cuda" if request.CUDA else "cpu"
-
-        if not torch.cuda.is_available() and request.CUDA:
-            return backend_pb2.Result(success=False, message="CUDA is not available")
-
-        self.AudioPath = None
-        # List available 🐸TTS models
-        print(TTS().list_models())
-        if os.path.isabs(request.AudioPath):
-            self.AudioPath = request.AudioPath
-        elif request.AudioPath and request.ModelFile != "" and not os.path.isabs(request.AudioPath):
-            # get base path of modelFile
-            modelFileBase = os.path.dirname(request.ModelFile)
-            # modify LoraAdapter to be relative to modelFileBase
-            self.AudioPath = os.path.join(modelFileBase, request.AudioPath)
-
-        try:
-            print("Preparing models, please wait", file=sys.stderr)
-            self.tts = TTS(request.Model).to(device)
-        except Exception as err:
-            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
-        # Implement your logic here for the LoadModel service
-        # Replace this with your desired response
-        return backend_pb2.Result(message="Model loaded successfully", success=True)
-
-    def TTS(self, request, context):
-        try:
-            self.tts.tts_to_file(text=request.text, speaker_wav=self.AudioPath, language=COQUI_LANGUAGE, file_path=request.dst)
-        except Exception as err:
-            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
-        return backend_pb2.Result(success=True)
-
-def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
-    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
-    server.add_insecure_port(address)
-    server.start()
-    print("Server started. Listening on: " + address, file=sys.stderr)
-
-    # Define the signal handler function
-    def signal_handler(sig, frame):
-        print("Received termination signal. Shutting down...")
-        server.stop(0)
-        sys.exit(0)
-
-    # Set the signal handlers for SIGINT and SIGTERM
-    signal.signal(signal.SIGINT, signal_handler)
-    signal.signal(signal.SIGTERM, signal_handler)
-
-    try:
-        while True:
-            time.sleep(_ONE_DAY_IN_SECONDS)
-    except KeyboardInterrupt:
-        server.stop(0)
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Run the gRPC server.")
-    parser.add_argument(
-        "--addr", default="localhost:50051", help="The address to bind the server to."
-    )
-    args = parser.parse_args()
-
-    serve(args.addr)
--- a/backend/python/coqui/test.py
+++ b/backend/python/coqui/test.py
@@ -1,82 +0,0 @@
-"""
-A test script to test the gRPC service
-"""
-import unittest
-import subprocess
-import time
-import backend_pb2
-import backend_pb2_grpc
-
-import grpc
-
-
-class TestBackendServicer(unittest.TestCase):
-    """
-    TestBackendServicer is the class that tests the gRPC service
-    """
-    def setUp(self):
-        """
-        This method sets up the gRPC service by starting the server
-        """
-        self.service = subprocess.Popen(["python3", "coqui_server.py", "--addr", "localhost:50051"])
-        time.sleep(10)
-
-    def tearDown(self) -> None:
-        """
-        This method tears down the gRPC service by terminating the server
-        """
-        self.service.terminate()
-        self.service.wait()
-
-    def test_server_startup(self):
-        """
-        This method tests if the server starts up successfully
-        """
-        try:
-            self.setUp()
-            with grpc.insecure_channel("localhost:50051") as channel:
-                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.Health(backend_pb2.HealthMessage())
-                self.assertEqual(response.message, b'OK')
-        except Exception as err:
-            print(err)
-            self.fail("Server failed to start")
-        finally:
-            self.tearDown()
-
-    def test_load_model(self):
-        """
-        This method tests if the model is loaded successfully
-        """
-        try:
-            self.setUp()
-            with grpc.insecure_channel("localhost:50051") as channel:
-                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="tts_models/en/vctk/vits"))
-                print(response)
-                self.assertTrue(response.success)
-                self.assertEqual(response.message, "Model loaded successfully")
-        except Exception as err:
-            print(err)
-            self.fail("LoadModel service failed")
-        finally:
-            self.tearDown()
-
-    def test_tts(self):
-        """
-        This method tests if the embeddings are generated successfully
-        """
-        try:
-            self.setUp()
-            with grpc.insecure_channel("localhost:50051") as channel:
-                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="tts_models/en/vctk/vits"))
-                self.assertTrue(response.success)
-                tts_request = backend_pb2.TTSRequest(text="80s TV news production music hit for tonight's biggest story")
-                tts_response = stub.TTS(tts_request)
-                self.assertIsNotNone(tts_response)
-        except Exception as err:
-            print(err)
-            self.fail("TTS service failed")
-        finally:
-            self.tearDown()
--- a/backend/python/diffusers/backend_pb2.py
+++ b/backend/python/diffusers/backend_pb2.py
--- a/backend/python/diffusers/test.py
+++ b/backend/python/diffusers/test.py
@@ -1,84 +0,0 @@
-"""
-A test script to test the gRPC service
-"""
-import unittest
-import subprocess
-import time
-import backend_pb2
-import backend_pb2_grpc
-
-import grpc
-
-
-class TestBackendServicer(unittest.TestCase):
-    """
-    TestBackendServicer is the class that tests the gRPC service
-    """
-    def setUp(self):
-        """
-        This method sets up the gRPC service by starting the server
-        """
-        self.service = subprocess.Popen(["python3", "backend_diffusers.py", "--addr", "localhost:50051"])
-
-    def tearDown(self) -> None:
-        """
-        This method tears down the gRPC service by terminating the server
-        """
-        self.service.kill()
-        self.service.wait()
-
-    def test_server_startup(self):
-        """
-        This method tests if the server starts up successfully
-        """
-        time.sleep(10)
-        try:
-            self.setUp()
-            with grpc.insecure_channel("localhost:50051") as channel:
-                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.Health(backend_pb2.HealthMessage())
-                self.assertEqual(response.message, b'OK')
-        except Exception as err:
-            print(err)
-            self.fail("Server failed to start")
-        finally:
-            self.tearDown()
-
-    def test_load_model(self):
-        """
-        This method tests if the model is loaded successfully
-        """
-        time.sleep(10)
-        try:
-            self.setUp()
-            with grpc.insecure_channel("localhost:50051") as channel:
-                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="runwayml/stable-diffusion-v1-5"))
-                self.assertTrue(response.success)
-                self.assertEqual(response.message, "Model loaded successfully")
-        except Exception as err:
-            print(err)
-            self.fail("LoadModel service failed")
-        finally:
-            self.tearDown()
-
-    def test(self):
-        """
-        This method tests if the backend can generate images
-        """
-        time.sleep(10)
-        try:
-            self.setUp()
-            with grpc.insecure_channel("localhost:50051") as channel:
-                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="runwayml/stable-diffusion-v1-5"))
-                print(response.message)
-                self.assertTrue(response.success)
-                image_req = backend_pb2.GenerateImageRequest(positive_prompt="cat", width=16,height=16, dst="test.jpg")
-                re = stub.GenerateImage(image_req)
-                self.assertTrue(re.success)
-        except Exception as err:
-            print(err)
-            self.fail("Image gen service failed")
-        finally:
-            self.tearDown()
--- a/backend/python/exllama/Makefile
+++ b/backend/python/exllama/Makefile
@@ -1,10 +0,0 @@
-.PHONY: exllama
-exllama:
-	$(MAKE) -C ../common-env/transformers
-	bash install.sh
-
-.PHONY: run
-run:
-	@echo "Running exllama..."
-	bash run.sh
-	@echo "exllama run."
--- a/backend/python/exllama/backend_pb2.py
+++ b/backend/python/exllama/backend_pb2.py
--- a/backend/python/exllama/install.sh
+++ b/backend/python/exllama/install.sh
@@ -1,19 +0,0 @@
-#!/bin/bash
-
-##
-## A bash script installs the required dependencies of VALL-E-X and prepares the environment
-export PATH=$PATH:/opt/conda/bin
-
-# Activate conda environment
-source activate transformers
-
-echo $CONDA_PREFIX
-
-
-git clone https://github.com/turboderp/exllama $CONDA_PREFIX/exllama && pushd $CONDA_PREFIX/exllama && pip install -r requirements.txt && popd
-
-cp -rfv $CONDA_PREFIX/exllama/* ./
-
-if [ "$PIP_CACHE_PURGE" = true ] ; then
-    pip cache purge
-fi
--- a/backend/python/exllama2/Makefile
+++ b/backend/python/exllama2/Makefile
@@ -1,10 +0,0 @@
-.PHONY: exllama2
-exllama2:
-	$(MAKE) -C ../common-env/transformers
-	bash install.sh
-
-.PHONY: run
-run:
-	@echo "Running exllama2..."
-	bash run.sh
-	@echo "exllama2 run."
--- a/backend/python/exllama2/backend_pb2.py
+++ b/backend/python/exllama2/backend_pb2.py
--- a/backend/python/exllama2/exllama2_backend.py
+++ b/backend/python/exllama2/exllama2_backend.py
@@ -1,138 +0,0 @@
-#!/usr/bin/env python3
-import grpc
-from concurrent import futures
-import time
-import backend_pb2
-import backend_pb2_grpc
-import argparse
-import signal
-import sys
-import os
-import glob
-
-from pathlib import Path
-import torch
-import torch.nn.functional as F
-from torch import version as torch_version
-
-
-from exllamav2.generator import (
-    ExLlamaV2BaseGenerator,
-    ExLlamaV2Sampler
-)
-
-
-from exllamav2 import (
-    ExLlamaV2,
-    ExLlamaV2Config,
-    ExLlamaV2Cache,
-    ExLlamaV2Cache_8bit,
-    ExLlamaV2Tokenizer,
-    model_init,
-)
-
-
-_ONE_DAY_IN_SECONDS = 60 * 60 * 24
-
-# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
-MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
-
-# Implement the BackendServicer class with the service methods
-class BackendServicer(backend_pb2_grpc.BackendServicer):
-    def Health(self, request, context):
-        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
-
-    def LoadModel(self, request, context):
-        try:
-            model_directory = request.ModelFile
-
-            config = ExLlamaV2Config()
-            config.model_dir = model_directory
-            config.prepare()
-
-            model = ExLlamaV2(config)
-
-            cache = ExLlamaV2Cache(model, lazy=True)
-            model.load_autosplit(cache)
-
-            tokenizer = ExLlamaV2Tokenizer(config)
-
-            # Initialize generator
-
-            generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
-
-            self.generator = generator
-
-            generator.warmup()
-            self.model = model
-            self.tokenizer = tokenizer
-            self.cache = cache
-        except Exception as err:
-            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
-        return backend_pb2.Result(message="Model loaded successfully", success=True)
-
-    def Predict(self, request, context):
-
-        penalty = 1.15
-        if request.Penalty != 0.0:
-            penalty = request.Penalty
-
-        settings = ExLlamaV2Sampler.Settings()
-        settings.temperature = request.Temperature
-        settings.top_k = request.TopK
-        settings.top_p = request.TopP
-        settings.token_repetition_penalty = penalty
-        settings.disallow_tokens(self.tokenizer, [self.tokenizer.eos_token_id])
-        tokens = 512
-
-        if request.Tokens != 0:
-            tokens = request.Tokens
-        output = self.generator.generate_simple(
-            request.Prompt, settings, tokens)
-
-        # Remove prompt from response if present
-        if request.Prompt in output:
-            output = output.replace(request.Prompt, "")
-
-        return backend_pb2.Result(message=bytes(output, encoding='utf-8'))
-
-    def PredictStream(self, request, context):
-        # Implement PredictStream RPC
-        # for reply in some_data_generator():
-        #    yield reply
-        # Not implemented yet
-        return self.Predict(request, context)
-
-
-def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
-    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
-    server.add_insecure_port(address)
-    server.start()
-    print("Server started. Listening on: " + address, file=sys.stderr)
-
-    # Define the signal handler function
-    def signal_handler(sig, frame):
-        print("Received termination signal. Shutting down...")
-        server.stop(0)
-        sys.exit(0)
-
-    # Set the signal handlers for SIGINT and SIGTERM
-    signal.signal(signal.SIGINT, signal_handler)
-    signal.signal(signal.SIGTERM, signal_handler)
-
-    try:
-        while True:
-            time.sleep(_ONE_DAY_IN_SECONDS)
-    except KeyboardInterrupt:
-        server.stop(0)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Run the gRPC server.")
-    parser.add_argument(
-        "--addr", default="localhost:50051", help="The address to bind the server to."
-    )
-    args = parser.parse_args()
-
-    serve(args.addr)
--- a/backend/python/exllama2/install.sh
+++ b/backend/python/exllama2/install.sh
@@ -1,18 +0,0 @@
-#!/bin/bash
-
-##
-## A bash script installs the required dependencies of VALL-E-X and prepares the environment
-export PATH=$PATH:/opt/conda/bin
-
-# Activate conda environment
-source activate transformers
-
-echo $CONDA_PREFIX
-
-git clone https://github.com/turboderp/exllamav2 $CONDA_PREFIX/exllamav2 && pushd $CONDA_PREFIX/exllamav2 && pip install -r requirements.txt && popd
-
-cp -rfv $CONDA_PREFIX/exllamav2/* ./  
-
-if [ "$PIP_CACHE_PURGE" = true ] ; then
-    pip cache purge
-fi
--- a/backend/python/exllama2/run.sh
+++ b/backend/python/exllama2/run.sh
@@ -1,16 +0,0 @@
-#!/bin/bash
-
-##
-## A bash script wrapper that runs the exllama server with conda
-
-export PATH=$PATH:/opt/conda/bin
-
-# Activate conda environment
-source activate transformers
-
-# get the directory where the bash script is located
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-cd $DIR
-
-python $DIR/exllama2_backend.py $@
--- a/backend/python/petals/Makefile
+++ b/backend/python/petals/Makefile
@@ -1,17 +0,0 @@
-.PHONY: petals
-petals:
-	@echo "Creating virtual environment..."
-	@conda env create --name petals --file petals.yml
-	@echo "Virtual environment created."
-
-.PHONY: run
-run:
-	@echo "Running petals..."
-	bash run.sh
-	@echo "petals run."
-
-.PHONY: test
-test:
-	@echo "Testing petals..."
-	bash test.sh
-	@echo "petals tested."
--- a/backend/python/petals/backend_pb2.py
+++ b/backend/python/petals/backend_pb2.py
--- a/backend/python/petals/backend_petals.py
+++ b/backend/python/petals/backend_petals.py
@@ -1,140 +0,0 @@
-#!/usr/bin/env python3
-from concurrent import futures
-import time
-import argparse
-import signal
-import sys
-import os
-
-import backend_pb2
-import backend_pb2_grpc
-
-import grpc
-import torch
-from transformers import AutoTokenizer
-from petals import AutoDistributedModelForCausalLM
-
-_ONE_DAY_IN_SECONDS = 60 * 60 * 24
-
-# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
-MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
-
-# Implement the BackendServicer class with the service methods
-class BackendServicer(backend_pb2_grpc.BackendServicer):
-    """
-    A gRPC servicer that implements the Backend service defined in backend.proto.
-    """
-    def Health(self, request, context):
-        """
-        Returns a health check message.
-
-        Args:
-            request: The health check request.
-            context: The gRPC context.
-
-        Returns:
-            backend_pb2.Reply: The health check reply.
-        """
-        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
-
-    def LoadModel(self, request, context):
-        """
-        Loads a language model.
-
-        Args:
-            request: The load model request.
-            context: The gRPC context.
-
-        Returns:
-            backend_pb2.Result: The load model result.
-        """
-        try:
-            self.tokenizer = AutoTokenizer.from_pretrained(request.Model, use_fast=False, add_bos_token=False)
-            self.model = AutoDistributedModelForCausalLM.from_pretrained(request.Model)
-            self.cuda = False
-            if request.CUDA:
-                self.model = self.model.cuda()
-                self.cuda = True
-
-        except Exception as err:
-            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
-        return backend_pb2.Result(message="Model loaded successfully", success=True)
-
-    def Predict(self, request, context):
-        """
-        Generates text based on the given prompt and sampling parameters.
-
-        Args:
-            request: The predict request.
-            context: The gRPC context.
-
-        Returns:
-            backend_pb2.Result: The predict result.
-        """
-
-        inputs = self.tokenizer(request.Prompt, return_tensors="pt")["input_ids"]
-        if self.cuda:
-            inputs = inputs.cuda()
- 
-        if request.Tokens == 0:
-            # Max to max value if tokens are not specified
-            request.Tokens = 8192
-
-        # TODO: kwargs and map all parameters
-        outputs = self.model.generate(inputs, max_new_tokens=request.Tokens)
-
-        generated_text = self.tokenizer.decode(outputs[0])
-        # Remove prompt from response if present
-        if request.Prompt in generated_text:
-            generated_text = generated_text.replace(request.Prompt, "")
-
-        return backend_pb2.Result(message=bytes(generated_text, encoding='utf-8'))
-
-    def PredictStream(self, request, context):
-        """
-        Generates text based on the given prompt and sampling parameters, and streams the results.
-
-        Args:
-            request: The predict stream request.
-            context: The gRPC context.
-
-        Returns:
-            backend_pb2.Result: The predict stream result.
-        """
-        # Implement PredictStream RPC
-        #for reply in some_data_generator():
-        #    yield reply
-        # Not implemented yet
-        return self.Predict(request, context)
-
-def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
-    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
-    server.add_insecure_port(address)
-    server.start()
-    print("Server started. Listening on: " + address, file=sys.stderr)
-
-    # Define the signal handler function
-    def signal_handler(sig, frame):
-        print("Received termination signal. Shutting down...")
-        server.stop(0)
-        sys.exit(0)
-
-    # Set the signal handlers for SIGINT and SIGTERM
-    signal.signal(signal.SIGINT, signal_handler)
-    signal.signal(signal.SIGTERM, signal_handler)
-
-    try:
-        while True:
-            time.sleep(_ONE_DAY_IN_SECONDS)
-    except KeyboardInterrupt:
-        server.stop(0)
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Run the gRPC server.")
-    parser.add_argument(
-        "--addr", default="localhost:50051", help="The address to bind the server to."
-    )
-    args = parser.parse_args()
-
-    serve(args.addr)
--- a/backend/python/petals/petals.yml
+++ b/backend/python/petals/petals.yml
@@ -1,30 +0,0 @@
-name: petals
-channels:
-  - defaults
-dependencies:
-  # - _libgcc_mutex=0.1=main
-  # - _openmp_mutex=5.1=1_gnu
-  # - bzip2=1.0.8=h7b6447c_0
-  # - ca-certificates=2023.08.22=h06a4308_0
-  # - ld_impl_linux-64=2.38=h1181459_1
-  # - libffi=3.4.4=h6a678d5_0
-  # - libgcc-ng=11.2.0=h1234567_1
-  # - libgomp=11.2.0=h1234567_1
-  # - libstdcxx-ng=11.2.0=h1234567_1
-  # - libuuid=1.41.5=h5eee18b_0
-  # - ncurses=6.4=h6a678d5_0
-  # - openssl=3.0.11=h7f8727e_2
-  # - pip=23.2.1=py311h06a4308_0
-  - python=3.11.5=h955ad1f_0
-  # - readline=8.2=h5eee18b_0
-  # - setuptools=68.0.0=py311h06a4308_0
-  # - sqlite=3.41.2=h5eee18b_0
-  # - tk=8.6.12=h1ccaba5_0
-  # - tzdata=2023c=h04d1e81_0
-  # - wheel=0.41.2=py311h06a4308_0
-  # - xz=5.4.2=h5eee18b_0
-  # - zlib=1.2.13=h5eee18b_0
-  - pip:
-      - torch==2.1.0
-      - git+https://github.com/bigscience-workshop/petals
-prefix: /opt/conda/envs/petals
--- a/backend/python/petals/run.sh
+++ b/backend/python/petals/run.sh
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-##
-## A bash script wrapper that runs the exllama server with conda
-
-export PATH=$PATH:/opt/conda/bin
-
-CONDA_ENV=petals
-
-# Activate conda environment
-# if source is available use it, or use conda
-#
-if [ -f /opt/conda/bin/activate ]; then
-    source activate $CONDA_ENV
-else
-    eval "$(conda shell.bash hook)"
-    conda activate $CONDA_ENV
-fi
-
-# get the directory where the bash script is located
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-python $DIR/backend_petals.py $@
--- a/backend/python/petals/test.sh
+++ b/backend/python/petals/test.sh
@@ -1,20 +0,0 @@
-#!/bin/bash
-##
-## A bash script wrapper that runs the transformers server with conda
-
-# Activate conda environment
-CONDA_ENV=petals
-# Activate conda environment
-# if source is available use it, or use conda
-#
-if [ -f /opt/conda/bin/activate ]; then
-    source activate $CONDA_ENV
-else
-    eval "$(conda shell.bash hook)"
-    conda activate $CONDA_ENV
-fi
-
-# get the directory where the bash script is located
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-python -m unittest $DIR/test_petals.py
--- a/backend/python/sentencetransformers/Makefile
+++ b/backend/python/sentencetransformers/Makefile
@@ -1,17 +0,0 @@
-.PHONY: sentencetransformers
-sentencetransformers:
-	$(MAKE) -C ../common-env/transformers
-
-
-.PHONY: run
-run:
-	@echo "Running sentencetransformers..."
-	bash run.sh
-	@echo "sentencetransformers run."
-
-# It is not working well by using command line. It only6 works with IDE like VSCode.
-.PHONY: test
-test:
-	@echo "Testing sentencetransformers..."
-	bash test.sh
-	@echo "sentencetransformers tested."
--- a/backend/python/sentencetransformers/README.md
+++ b/backend/python/sentencetransformers/README.md
@@ -1,5 +0,0 @@
-# Creating a separate environment for the sentencetransformers project
-
-```
-make sentencetransformers
-```
--- a/backend/python/sentencetransformers/backend_pb2.py
+++ b/backend/python/sentencetransformers/backend_pb2.py
--- a/backend/python/sentencetransformers/backend_pb2_grpc.py
+++ b/backend/python/sentencetransformers/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/sentencetransformers/run.sh
+++ b/backend/python/sentencetransformers/run.sh
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-##
-## A bash script wrapper that runs the sentencetransformers server with conda
-
-export PATH=$PATH:/opt/conda/bin
-
-# Activate conda environment
-source activate transformers
-
-# get the directory where the bash script is located
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-python $DIR/sentencetransformers.py $@
--- a/backend/python/sentencetransformers/test.sh
+++ b/backend/python/sentencetransformers/test.sh
@@ -1,11 +0,0 @@
-#!/bin/bash
-##
-## A bash script wrapper that runs the sentencetransformers server with conda
-
-# Activate conda environment
-source activate transformers
-
-# get the directory where the bash script is located
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-python -m unittest $DIR/test_sentencetransformers.py
--- a/backend/python/transformers-musicgen/Makefile
+++ b/backend/python/transformers-musicgen/Makefile
@@ -1,16 +0,0 @@
-
-.PHONY: transformers-musicgen
-transformers-musicgen:
-	$(MAKE) -C ../common-env/transformers
-
-.PHONY: run
-run:
-	@echo "Running transformers..."
-	bash run.sh
-	@echo "transformers run."
-
-.PHONY: test
-test:
-	@echo "Testing transformers..."
-	bash test.sh
-	@echo "transformers tested."
--- a/backend/python/transformers-musicgen/README.md
+++ b/backend/python/transformers-musicgen/README.md
@@ -1,5 +0,0 @@
-# Creating a separate environment for the transformers project
-
-```
-make transformers-musicgen
-```
--- a/backend/python/transformers-musicgen/backend_pb2.py
+++ b/backend/python/transformers-musicgen/backend_pb2.py
--- a/backend/python/transformers-musicgen/backend_pb2_grpc.py
+++ b/backend/python/transformers-musicgen/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/transformers-musicgen/run.sh
+++ b/backend/python/transformers-musicgen/run.sh
@@ -1,16 +0,0 @@
-#!/bin/bash
-
-##
-## A bash script wrapper that runs the transformers-musicgen server with conda
-
-echo "Launching gRPC server for transformers-musicgen"
-
-export PATH=$PATH:/opt/conda/bin
-
-# Activate conda environment
-source activate transformers-musicgen
-
-# get the directory where the bash script is located
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-python $DIR/transformers_server.py $@
--- a/backend/python/transformers-musicgen/test.sh
+++ b/backend/python/transformers-musicgen/test.sh
@@ -1,11 +0,0 @@
-#!/bin/bash
-##
-## A bash script wrapper that runs the transformers server with conda
-
-# Activate conda environment
-source activate transformers
-
-# get the directory where the bash script is located
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-python -m unittest $DIR/test_transformers.py
--- a/backend/python/transformers-musicgen/test_transformers.py
+++ b/backend/python/transformers-musicgen/test_transformers.py
@@ -1,81 +0,0 @@
-"""
-A test script to test the gRPC service
-"""
-import unittest
-import subprocess
-import time
-import backend_pb2
-import backend_pb2_grpc
-
-import grpc
-
-
-class TestBackendServicer(unittest.TestCase):
-    """
-    TestBackendServicer is the class that tests the gRPC service
-    """
-    def setUp(self):
-        """
-        This method sets up the gRPC service by starting the server
-        """
-        self.service = subprocess.Popen(["python3", "transformers_server.py", "--addr", "localhost:50051"])
-        time.sleep(10)
-
-    def tearDown(self) -> None:
-        """
-        This method tears down the gRPC service by terminating the server
-        """
-        self.service.terminate()
-        self.service.wait()
-
-    def test_server_startup(self):
-        """
-        This method tests if the server starts up successfully
-        """
-        try:
-            self.setUp()
-            with grpc.insecure_channel("localhost:50051") as channel:
-                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.Health(backend_pb2.HealthMessage())
-                self.assertEqual(response.message, b'OK')
-        except Exception as err:
-            print(err)
-            self.fail("Server failed to start")
-        finally:
-            self.tearDown()
-
-    def test_load_model(self):
-        """
-        This method tests if the model is loaded successfully
-        """
-        try:
-            self.setUp()
-            with grpc.insecure_channel("localhost:50051") as channel:
-                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/musicgen-small"))
-                self.assertTrue(response.success)
-                self.assertEqual(response.message, "Model loaded successfully")
-        except Exception as err:
-            print(err)
-            self.fail("LoadModel service failed")
-        finally:
-            self.tearDown()
-
-    def test_tts(self):
-        """
-        This method tests if the embeddings are generated successfully
-        """
-        try:
-            self.setUp()
-            with grpc.insecure_channel("localhost:50051") as channel:
-                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/musicgen-small"))
-                self.assertTrue(response.success)
-                tts_request = backend_pb2.TTSRequest(text="80s TV news production music hit for tonight's biggest story")
-                tts_response = stub.TTS(tts_request)
-                self.assertIsNotNone(tts_response)
-        except Exception as err:
-            print(err)
-            self.fail("TTS service failed")
-        finally:
-            self.tearDown()
--- a/backend/python/transformers-musicgen/transformers_server.py
+++ b/backend/python/transformers-musicgen/transformers_server.py
@@ -1,122 +0,0 @@
-#!/usr/bin/env python3
-"""
-Extra gRPC server for MusicgenForConditionalGeneration models.
-"""
-from concurrent import futures
-
-import argparse
-import signal
-import sys
-import os
-
-import time
-import backend_pb2
-import backend_pb2_grpc
-
-import grpc
-
-from scipy.io.wavfile import write as write_wav
-from transformers import AutoProcessor, MusicgenForConditionalGeneration
-
-_ONE_DAY_IN_SECONDS = 60 * 60 * 24
-
-# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
-MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
-
-# Implement the BackendServicer class with the service methods
-class BackendServicer(backend_pb2_grpc.BackendServicer):
-    """
-    A gRPC servicer for the backend service.
-
-    This class implements the gRPC methods for the backend service, including Health, LoadModel, and Embedding.
-    """
-    def Health(self, request, context):
-        """
-        A gRPC method that returns the health status of the backend service.
-
-        Args:
-            request: A HealthRequest object that contains the request parameters.
-            context: A grpc.ServicerContext object that provides information about the RPC.
-
-        Returns:
-            A Reply object that contains the health status of the backend service.
-        """
-        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
-
-    def LoadModel(self, request, context):
-        """
-        A gRPC method that loads a model into memory.
-
-        Args:
-            request: A LoadModelRequest object that contains the request parameters.
-            context: A grpc.ServicerContext object that provides information about the RPC.
-
-        Returns:
-            A Result object that contains the result of the LoadModel operation.
-        """
-        model_name = request.Model
-        try:
-            self.processor = AutoProcessor.from_pretrained(model_name)
-            self.model = MusicgenForConditionalGeneration.from_pretrained(model_name)
-        except Exception as err:
-            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
-
-        return backend_pb2.Result(message="Model loaded successfully", success=True)
-
-    def TTS(self, request, context):
-        model_name = request.model
-        if model_name == "":
-            return backend_pb2.Result(success=False, message="request.model is required")
-        try:
-            self.processor = AutoProcessor.from_pretrained(model_name)
-            self.model = MusicgenForConditionalGeneration.from_pretrained(model_name)
-            inputs = self.processor(
-                text=[request.text],
-                padding=True,
-                return_tensors="pt",
-            )
-            tokens = 256
-            # TODO get tokens from request?
-            audio_values = self.model.generate(**inputs, max_new_tokens=tokens)
-            print("[transformers-musicgen] TTS generated!", file=sys.stderr)
-            sampling_rate = self.model.config.audio_encoder.sampling_rate
-            write_wav(request.dst, rate=sampling_rate, data=audio_values[0, 0].numpy())
-            print("[transformers-musicgen] TTS saved to", request.dst, file=sys.stderr)
-            print("[transformers-musicgen] TTS for", file=sys.stderr)
-            print(request, file=sys.stderr)
-        except Exception as err:
-            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
-        return backend_pb2.Result(success=True)
-
-
-def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
-    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
-    server.add_insecure_port(address)
-    server.start()
-    print("[transformers-musicgen] Server started. Listening on: " + address, file=sys.stderr)
-
-    # Define the signal handler function
-    def signal_handler(sig, frame):
-        print("[transformers-musicgen] Received termination signal. Shutting down...")
-        server.stop(0)
-        sys.exit(0)
-
-    # Set the signal handlers for SIGINT and SIGTERM
-    signal.signal(signal.SIGINT, signal_handler)
-    signal.signal(signal.SIGTERM, signal_handler)
-
-    try:
-        while True:
-            time.sleep(_ONE_DAY_IN_SECONDS)
-    except KeyboardInterrupt:
-        server.stop(0)
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Run the gRPC server.")
-    parser.add_argument(
-        "--addr", default="localhost:50051", help="The address to bind the server to."
-    )
-    args = parser.parse_args()
-    print(f"[transformers-musicgen] startup: {args}", file=sys.stderr)
-    serve(args.addr)
--- a/backend/python/transformers/Makefile
+++ b/backend/python/transformers/Makefile
@@ -1,16 +0,0 @@
-.PHONY: transformers
-transformers:
-	$(MAKE) -C ../common-env/transformers
-
-.PHONY: run
-run:
-	@echo "Running transformers..."
-	bash run.sh
-	@echo "transformers run."
-
-# It is not working well by using command line. It only6 works with IDE like VSCode.
-.PHONY: test
-test:
-	@echo "Testing transformers..."
-	bash test.sh
-	@echo "transformers tested."
--- a/backend/python/transformers/README.md
+++ b/backend/python/transformers/README.md
@@ -1,5 +0,0 @@
-# Creating a separate environment for the transformers project
-
-```
-make transformers
-```
--- a/backend/python/transformers/backend_pb2.py
+++ b/backend/python/transformers/backend_pb2.py
--- a/backend/python/transformers/backend_pb2_grpc.py
+++ b/backend/python/transformers/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/transformers/run.sh
+++ b/backend/python/transformers/run.sh
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-##
-## A bash script wrapper that runs the transformers server with conda
-
-export PATH=$PATH:/opt/conda/bin
-
-# Activate conda environment
-source activate transformers
-
-# get the directory where the bash script is located
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-python $DIR/transformers_server.py $@
--- a/backend/python/transformers/test.sh
+++ b/backend/python/transformers/test.sh
@@ -1,11 +0,0 @@
-#!/bin/bash
-##
-## A bash script wrapper that runs the transformers server with conda
-
-# Activate conda environment
-source activate transformers
-
-# get the directory where the bash script is located
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-python -m unittest $DIR/test_transformers_server.py
--- a/backend/python/transformers/test_transformers_server.py
+++ b/backend/python/transformers/test_transformers_server.py
@@ -1,84 +0,0 @@
-"""
-A test script to test the gRPC service
-"""
-import unittest
-import subprocess
-import time
-import backend_pb2
-import backend_pb2_grpc
-
-import grpc
-
-
-class TestBackendServicer(unittest.TestCase):
-    """
-    TestBackendServicer is the class that tests the gRPC service
-    """
-    def setUp(self):
-        """
-        This method sets up the gRPC service by starting the server
-        """
-        self.service = subprocess.Popen(["python3", "transformers_server.py", "--addr", "localhost:50051"])
-
-    def tearDown(self) -> None:
-        """
-        This method tears down the gRPC service by terminating the server
-        """
-        self.service.kill()
-        self.service.wait()
-
-    def test_server_startup(self):
-        """
-        This method tests if the server starts up successfully
-        """
-        time.sleep(10)
-        try:
-            self.setUp()
-            with grpc.insecure_channel("localhost:50051") as channel:
-                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.Health(backend_pb2.HealthMessage())
-                self.assertEqual(response.message, b'OK')
-        except Exception as err:
-            print(err)
-            self.fail("Server failed to start")
-        finally:
-            self.tearDown()
-
-    def test_load_model(self):
-        """
-        This method tests if the model is loaded successfully
-        """
-        time.sleep(10)
-        try:
-            self.setUp()
-            with grpc.insecure_channel("localhost:50051") as channel:
-                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="bert-base-cased"))
-                self.assertTrue(response.success)
-                self.assertEqual(response.message, "Model loaded successfully")
-        except Exception as err:
-            print(err)
-            self.fail("LoadModel service failed")
-        finally:
-            self.tearDown()
-
-    def test_embedding(self):
-        """
-        This method tests if the embeddings are generated successfully
-        """
-        time.sleep(10)
-        try:
-            self.setUp()
-            with grpc.insecure_channel("localhost:50051") as channel:
-                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="bert-base-cased"))
-                print(response.message)
-                self.assertTrue(response.success)
-                embedding_request = backend_pb2.PredictOptions(Embeddings="This is a test sentence.")
-                embedding_response = stub.Embedding(embedding_request)
-                self.assertIsNotNone(embedding_response.embeddings)
-        except Exception as err:
-            print(err)
-            self.fail("Embedding service failed")
-        finally:
-            self.tearDown()
--- a/backend/python/transformers/transformers_server.py
+++ b/backend/python/transformers/transformers_server.py
@@ -1,147 +0,0 @@
-#!/usr/bin/env python3
-"""
-Extra gRPC server for HuggingFace AutoModel models.
-"""
-from concurrent import futures
-
-import argparse
-import signal
-import sys
-import os
-
-import time
-import backend_pb2
-import backend_pb2_grpc
-
-import grpc
-import torch
-
-from transformers import AutoTokenizer, AutoModel
-
-_ONE_DAY_IN_SECONDS = 60 * 60 * 24
-
-# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
-MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
-
-
-def mean_pooling(model_output, attention_mask):
-    """
-    Mean pooling to get sentence embeddings. See:
-    https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1
-    """
-    token_embeddings = model_output[0]
-    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) # Sum columns
-    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-    return sum_embeddings / sum_mask
-
-# Implement the BackendServicer class with the service methods
-class BackendServicer(backend_pb2_grpc.BackendServicer):
-    """
-    A gRPC servicer for the backend service.
-
-    This class implements the gRPC methods for the backend service, including Health, LoadModel, and Embedding.
-    """
-    def Health(self, request, context):
-        """
-        A gRPC method that returns the health status of the backend service.
-
-        Args:
-            request: A HealthRequest object that contains the request parameters.
-            context: A grpc.ServicerContext object that provides information about the RPC.
-
-        Returns:
-            A Reply object that contains the health status of the backend service.
-        """
-        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
-
-    def LoadModel(self, request, context):
-        """
-        A gRPC method that loads a model into memory.
-
-        Args:
-            request: A LoadModelRequest object that contains the request parameters.
-            context: A grpc.ServicerContext object that provides information about the RPC.
-
-        Returns:
-            A Result object that contains the result of the LoadModel operation.
-        """
-        model_name = request.Model
-        try:
-            self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True) # trust_remote_code is needed to use the encode method with embeddings models like jinai-v2
-            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-            if request.CUDA:
-                try:
-                    # TODO: also tensorflow, make configurable
-                    import torch.cuda
-                    if torch.cuda.is_available():
-                        print("Loading model", model_name, "to CUDA.", file=sys.stderr)
-                        self.model = self.model.to("cuda")
-                except Exception as err:
-                    print("Not using CUDA:", err, file=sys.stderr)
-        except Exception as err:
-            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
-        # Implement your logic here for the LoadModel service
-        # Replace this with your desired response
-        return backend_pb2.Result(message="Model loaded successfully", success=True)
-
-    def Embedding(self, request, context):
-        """
-        A gRPC method that calculates embeddings for a given sentence.
-
-        Args:
-            request: An EmbeddingRequest object that contains the request parameters.
-            context: A grpc.ServicerContext object that provides information about the RPC.
-
-        Returns:
-            An EmbeddingResult object that contains the calculated embeddings.
-        """
-
-        # Tokenize input
-        max_length = 512
-        if request.Tokens != 0:
-            max_length = request.Tokens
-        encoded_input = self.tokenizer(request.Embeddings, padding=True, truncation=True, max_length=max_length, return_tensors="pt")    
-
-        # Create word embeddings
-        model_output = self.model(**encoded_input)
-
-        # Pool to get sentence embeddings; i.e. generate one 1024 vector for the entire sentence
-        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']).detach().numpy()
-        print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
-        print("Embeddings:", sentence_embeddings, file=sys.stderr)
-        return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings)
-
-
-def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
-    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
-    server.add_insecure_port(address)
-    server.start()
-    print("Server started. Listening on: " + address, file=sys.stderr)
-
-    # Define the signal handler function
-    def signal_handler(sig, frame):
-        print("Received termination signal. Shutting down...")
-        server.stop(0)
-        sys.exit(0)
-
-    # Set the signal handlers for SIGINT and SIGTERM
-    signal.signal(signal.SIGINT, signal_handler)
-    signal.signal(signal.SIGTERM, signal_handler)
-
-    try:
-        while True:
-            time.sleep(_ONE_DAY_IN_SECONDS)
-    except KeyboardInterrupt:
-        server.stop(0)
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Run the gRPC server.")
-    parser.add_argument(
-        "--addr", default="localhost:50051", help="The address to bind the server to."
-    )
-    args = parser.parse_args()
-
-    serve(args.addr)
--- a/backend/python/vall-e-x/Makefile
+++ b/backend/python/vall-e-x/Makefile
@@ -1,16 +0,0 @@
-.PHONY: ttsvalle
-ttsvalle:
-	$(MAKE) -C ../common-env/transformers
-	bash install.sh
-
-.PHONY: run
-run:
-	@echo "Running ttsvalle..."
-	bash run.sh
-	@echo "ttsvalle run."
-
-.PHONY: test
-test:
-	@echo "Testing valle..."
-	bash test.sh
-	@echo "valle tested."
--- a/backend/python/vall-e-x/backend_pb2.py
+++ b/backend/python/vall-e-x/backend_pb2.py
--- a/backend/python/vall-e-x/backend_pb2_grpc.py
+++ b/backend/python/vall-e-x/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/vall-e-x/install.sh
+++ b/backend/python/vall-e-x/install.sh
@@ -1,19 +0,0 @@
-#!/bin/bash
-
-##
-## A bash script installs the required dependencies of VALL-E-X and prepares the environment
-export PATH=$PATH:/opt/conda/bin
-export SHA=3faaf8ccadb154d63b38070caf518ce9309ea0f4
-
-# Activate conda environment
-source activate transformers
-
-echo $CONDA_PREFIX
-
-git clone https://github.com/Plachtaa/VALL-E-X.git $CONDA_PREFIX/vall-e-x && pushd $CONDA_PREFIX/vall-e-x && git checkout -b build $SHA && pip install -r requirements.txt && popd
-
-cp -rfv $CONDA_PREFIX/vall-e-x/* ./
-
-if [ "$PIP_CACHE_PURGE" = true ] ; then
-    pip cache purge
-fi
--- a/backend/python/vall-e-x/test.py
+++ b/backend/python/vall-e-x/test.py
@@ -1,81 +0,0 @@
-"""
-A test script to test the gRPC service
-"""
-import unittest
-import subprocess
-import time
-import backend_pb2
-import backend_pb2_grpc
-
-import grpc
-
-
-class TestBackendServicer(unittest.TestCase):
-    """
-    TestBackendServicer is the class that tests the gRPC service
-    """
-    def setUp(self):
-        """
-        This method sets up the gRPC service by starting the server
-        """
-        self.service = subprocess.Popen(["python3", "ttsvalle.py", "--addr", "localhost:50051"])
-        time.sleep(10)
-
-    def tearDown(self) -> None:
-        """
-        This method tears down the gRPC service by terminating the server
-        """
-        self.service.terminate()
-        self.service.wait()
-
-    def test_server_startup(self):
-        """
-        This method tests if the server starts up successfully
-        """
-        try:
-            self.setUp()
-            with grpc.insecure_channel("localhost:50051") as channel:
-                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.Health(backend_pb2.HealthMessage())
-                self.assertEqual(response.message, b'OK')
-        except Exception as err:
-            print(err)
-            self.fail("Server failed to start")
-        finally:
-            self.tearDown()
-
-    def test_load_model(self):
-        """
-        This method tests if the model is loaded successfully
-        """
-        try:
-            self.setUp()
-            with grpc.insecure_channel("localhost:50051") as channel:
-                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="dingzhen"))
-                self.assertTrue(response.success)
-                self.assertEqual(response.message, "Model loaded successfully")
-        except Exception as err:
-            print(err)
-            self.fail("LoadModel service failed")
-        finally:
-            self.tearDown()
-
-    def test_tts(self):
-        """
-        This method tests if the embeddings are generated successfully
-        """
-        try:
-            self.setUp()
-            with grpc.insecure_channel("localhost:50051") as channel:
-                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="dingzhen"))
-                self.assertTrue(response.success)
-                tts_request = backend_pb2.TTSRequest(text="80s TV news production music hit for tonight's biggest story")
-                tts_response = stub.TTS(tts_request)
-                self.assertIsNotNone(tts_response)
-        except Exception as err:
-            print(err)
-            self.fail("TTS service failed")
-        finally:
-            self.tearDown()
--- a/backend/python/vall-e-x/test.sh
+++ b/backend/python/vall-e-x/test.sh
@@ -1,11 +0,0 @@
-#!/bin/bash
-##
-## A bash script wrapper that runs the ttsvalle server with conda
-
-# Activate conda environment
-source activate transformers
-
-# get the directory where the bash script is located
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-python -m unittest $DIR/test.py
--- a/backend/python/vllm/Makefile
+++ b/backend/python/vllm/Makefile
@@ -1,15 +0,0 @@
-.PHONY: vllm
-vllm:
-	$(MAKE) -C ../common-env/transformers
-
-.PHONY: run
-run:
-	@echo "Running vllm..."
-	bash run.sh
-	@echo "vllm run."
-
-.PHONY: test
-test:
-	@echo "Testing vllm..."
-	bash test.sh
-	@echo "vllm tested."
--- a/backend/python/vllm/backend_pb2.py
+++ b/backend/python/vllm/backend_pb2.py
--- a/backend/python/vllm/backend_pb2_grpc.py
+++ b/backend/python/vllm/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/vllm/run.sh
+++ b/backend/python/vllm/run.sh
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-##
-## A bash script wrapper that runs the diffusers server with conda
-
-export PATH=$PATH:/opt/conda/bin
-
-# Activate conda environment
-source activate transformers
-
-# get the directory where the bash script is located
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-python $DIR/backend_vllm.py $@
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Ettore Di Giacinto	a8e91345e2	fixup: create piper libdir also when not built Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-11-12 22:17:11 +01:00
Ettore Di Giacinto	ea4ade6b60	Merge branch 'master' into enable_gpu	2023-11-11 19:20:36 +01:00
Ettore Di Giacinto	796d0c99aa	tests: re-enable GPU tests Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-11-06 19:10:35 +01:00