feat(ui): support multiline and style ul (#2226)

* feat(ui/chat): handle multiline in the input field Signed-off-by: mudler <mudler@localai.io> * feat(ui/chat): correctly display multiline messages Signed-off-by: mudler <mudler@localai.io> * feat(ui/chat): add list style Signed-off-by: mudler <mudler@localai.io> --------- Signed-off-by: mudler <mudler@localai.io>
feat(aio): switch to llama3-based for LLM (#2225)
2026-02-03 03:02:38 -05:00 · 2024-05-03 00:43:02 +02:00 · 2024-05-03 00:41:45 +02:00 · 2024-05-02 21:23:40 +00:00 · 2024-05-02 21:14:10 +02:00 · 2024-05-02 18:31:13 +02:00
264 changed files with 11028 additions and 12244 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,6 +1,11 @@
 .idea
+.github
+.vscode
 models
 examples/chatbot-ui/models
 examples/rwkv/models
 examples/**/models
-Dockerfile*
+Dockerfile*
+
+# SonarQube
+.scannerwork
--- a/.env
+++ b/.env
@@ -1,33 +1,33 @@
 ## Set number of threads.
 ## Note: prefer the number of physical cores. Overbooking the CPU degrades performance notably.
-# THREADS=14
+# LOCALAI_THREADS=14

 ## Specify a different bind address (defaults to ":8080")
-# ADDRESS=127.0.0.1:8080
+# LOCALAI_ADDRESS=127.0.0.1:8080

 ## Default models context size
-# CONTEXT_SIZE=512
+# LOCALAI_CONTEXT_SIZE=512
 #
 ## Define galleries.
 ## models will to install will be visible in `/models/available`
-# GALLERIES=[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}]
+# LOCALAI_GALLERIES=[{"name":"localai", "url":"github:mudler/LocalAI/gallery/index.yaml@master"}]

 ## CORS settings
-# CORS=true
-# CORS_ALLOW_ORIGINS=*
+# LOCALAI_CORS=true
+# LOCALAI_CORS_ALLOW_ORIGINS=*

 ## Default path for models
 #
-# MODELS_PATH=/models
+# LOCALAI_MODELS_PATH=/models

 ## Enable debug mode
-# DEBUG=true
+# LOCALAI_LOG_LEVEL=debug

 ## Disables COMPEL (Diffusers)
 # COMPEL=0

 ## Enable/Disable single backend (useful if only one GPU is available)
-# SINGLE_ACTIVE_BACKEND=true
+# LOCALAI_SINGLE_ACTIVE_BACKEND=true

 ## Specify a build type. Available: cublas, openblas, clblas.
 ## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
@@ -46,13 +46,13 @@
 # GO_TAGS=stablediffusion

 ## Path where to store generated images
-# IMAGE_PATH=/tmp
+# LOCALAI_IMAGE_PATH=/tmp/generated/images

 ## Specify a default upload limit in MB (whisper)
-# UPLOAD_LIMIT
+# LOCALAI_UPLOAD_LIMIT=15

 ## List of external GRPC backends (note on the container image this variable is already set to use extra backends available in extra/)
-# EXTERNAL_GRPC_BACKENDS=my-backend:127.0.0.1:9000,my-backend2:/usr/bin/backend.py
+# LOCALAI_EXTERNAL_GRPC_BACKENDS=my-backend:127.0.0.1:9000,my-backend2:/usr/bin/backend.py

 ### Advanced settings ###
 ### Those are not really used by LocalAI, but from components in the stack ###
@@ -72,18 +72,18 @@
 # LLAMACPP_PARALLEL=1

 ### Enable to run parallel requests
-# PARALLEL_REQUESTS=true
+# LOCALAI_PARALLEL_REQUESTS=true

 ### Watchdog settings
 ###
 # Enables watchdog to kill backends that are inactive for too much time
-# WATCHDOG_IDLE=true
-#
-# Enables watchdog to kill backends that are busy for too much time
-# WATCHDOG_BUSY=true
+# LOCALAI_WATCHDOG_IDLE=true
 #
 # Time in duration format (e.g. 1h30m) after which a backend is considered idle
-# WATCHDOG_IDLE_TIMEOUT=5m
+# LOCALAI_WATCHDOG_IDLE_TIMEOUT=5m
+#
+# Enables watchdog to kill backends that are busy for too much time
+# LOCALAI_WATCHDOG_BUSY=true
 #
 # Time in duration format (e.g. 1h30m) after which a backend is considered busy
-# WATCHDOG_BUSY_TIMEOUT=5m
+# LOCALAI_WATCHDOG_BUSY_TIMEOUT=5m
--- a/.github/bump_docs.sh
+++ b/.github/bump_docs.sh
@@ -2,6 +2,6 @@
 set -xe
 REPO=$1

-LATEST_TAG=$(curl -s "https://api.github.com/repos/$REPO/releases/latest" | jq -r '.name')
+LATEST_TAG=$(curl -s "https://api.github.com/repos/$REPO/releases/latest" | jq -r '.tag_name')

 cat <<< $(jq ".version = \"$LATEST_TAG\"" docs/data/version.json) > docs/data/version.json
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -0,0 +1,25 @@
+# https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
+version: 2
+updates:
+  - package-ecosystem: "gomod"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "github-actions"
+    # Workflow files stored in the default location of `.github/workflows`. (You don't need to specify `/.github/workflows` for `directory`. You can use `directory: "/"`.)
+    directory: "/"
+    schedule:
+      # Check for updates to GitHub Actions once a week
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    # Python (pip) dependency manifests are looked up from the repository root.
+    directory: "/"
+    schedule:
+      # Check for updates to pip dependencies once a week
+      interval: "weekly"
+  - package-ecosystem: "docker"
+    # Dockerfiles are looked up from the repository root.
+    directory: "/"
+    schedule:
+      # Check for updates to Docker base images once a week
+      interval: "weekly"
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -8,6 +8,11 @@ kind/documentation:
  - changed-files:
    - any-glob-to-any-file: '*.md'

+area/ai-model:
+- any:
+  - changed-files:
+    - any-glob-to-any-file: 'gallery/*'
+
 examples:
 - any:
  - changed-files:
@@ -16,4 +21,4 @@ examples:
 ci:
 - any:
  - changed-files:
-    - any-glob-to-any-file: '.github/*'
+    - any-glob-to-any-file: '.github/*'
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -49,7 +49,7 @@ jobs:
        run: |
          bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }}
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v5
+        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
--- a/.github/workflows/bump_docs.yaml
+++ b/.github/workflows/bump_docs.yaml
@@ -17,7 +17,7 @@ jobs:
        run: |
          bash .github/bump_docs.sh ${{ matrix.repository }}
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v5
+        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
--- a/.github/workflows/dependabot_auto.yml
+++ b/.github/workflows/dependabot_auto.yml
@@ -0,0 +1,43 @@
+name: Dependabot auto-merge
+on:
+- pull_request_target
+
+permissions:
+  contents: write
+  pull-requests: write
+  packages: read
+
+jobs:
+  dependabot:
+    runs-on: ubuntu-latest
+    if: ${{ github.actor == 'dependabot[bot]' }}
+    steps:
+      - name: Dependabot metadata
+        id: metadata
+        uses: dependabot/fetch-metadata@v2.1.0
+        with:
+          github-token: "${{ secrets.GITHUB_TOKEN }}"
+          skip-commit-verification: true
+
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Approve a PR if not already approved
+        run: |
+          gh pr checkout "$PR_URL"
+            if [ "$(gh pr status --json reviewDecision -q .currentBranch.reviewDecision)" != "APPROVED" ];
+          then
+            gh pr review --approve "$PR_URL"
+          else
+            echo "PR already approved.";
+          fi
+        env:
+          PR_URL: ${{github.event.pull_request.html_url}}
+          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
+
+      - name: Enable auto-merge for Dependabot PRs
+        if: ${{ contains(github.event.pull_request.title, 'bump')}}
+        run: gh pr merge --auto --squash "$PR_URL"
+        env:
+          PR_URL: ${{github.event.pull_request.html_url}}
+          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
--- a/.github/workflows/generate_grpc_cache.yaml
+++ b/.github/workflows/generate_grpc_cache.yaml
@@ -0,0 +1,94 @@
+name: 'generate and publish GRPC docker caches'
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - master
+
+concurrency:
+  group: grpc-cache-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
+jobs:
+  generate_caches:
+    strategy:
+      matrix:
+        include:
+          - grpc-base-image: ubuntu:22.04
+            runs-on: 'ubuntu-latest'
+            platforms: 'linux/amd64'
+    runs-on: ${{matrix.runs-on}}
+    steps:
+      - name: Release space from worker
+        if: matrix.runs-on == 'ubuntu-latest'
+        run: |
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          df -h
+          echo
+          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+          sudo rm -rf /usr/local/lib/android
+          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+          sudo rm -rf /usr/share/dotnet
+          sudo apt-get remove -y '^mono-.*' || true
+          sudo apt-get remove -y '^ghc-.*' || true
+          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+          sudo apt-get remove -y 'php.*' || true
+          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+          sudo apt-get remove -y '^google-.*' || true
+          sudo apt-get remove -y azure-cli || true
+          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+          sudo apt-get remove -y '^gfortran-.*' || true
+          sudo apt-get remove -y microsoft-edge-stable || true
+          sudo apt-get remove -y firefox || true
+          sudo apt-get remove -y powershell || true
+          sudo apt-get remove -y r-base-core || true
+          sudo apt-get autoremove -y
+          sudo apt-get clean
+          echo
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          sudo rm -rfv build || true
+          sudo rm -rf /usr/share/dotnet || true
+          sudo rm -rf /opt/ghc || true
+          sudo rm -rf "/usr/local/share/boost" || true
+          sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
+          df -h
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@master
+        with:
+          platforms: all
+
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@master
+
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Cache GRPC
+        uses: docker/build-push-action@v5
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          # The build-args MUST be an EXACT match between the image cache and other workflow steps that want to use that cache.
+          # This means that even the MAKEFLAGS have to be an EXACT match.
+          # If the build-args are not an EXACT match, it will result in a cache miss, which will require GRPC to be built from scratch.
+          build-args: |
+            GRPC_BASE_IMAGE=${{ matrix.grpc-base-image }}
+            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
+            GRPC_VERSION=v1.63.0
+          context: .
+          file: ./Dockerfile
+          cache-to: type=gha,ignore-error=true
+          cache-from: type=gha
+          target: grpc
+          platforms: ${{ matrix.platforms }}
+          push: false
--- a/.github/workflows/image-pr.yml
+++ b/.github/workflows/image-pr.yml
@@ -22,6 +22,7 @@ jobs:
      platforms: ${{ matrix.platforms }}
      runs-on: ${{ matrix.runs-on }}
      base-image: ${{ matrix.base-image }}
+      grpc-base-image: ${{ matrix.grpc-base-image }}
      makeflags: ${{ matrix.makeflags }}
    secrets:
      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
@@ -61,12 +62,14 @@ jobs:
            ffmpeg: 'false'
            image-type: 'extras'
            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
+            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            grpc-base-image: "ubuntu:22.04"
            tag-suffix: 'sycl-f16-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
@@ -85,6 +88,7 @@ jobs:
      platforms: ${{ matrix.platforms }}
      runs-on: ${{ matrix.runs-on }}
      base-image: ${{ matrix.base-image }}
+      grpc-base-image: ${{ matrix.grpc-base-image }}
      makeflags: ${{ matrix.makeflags }}
    secrets:
      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
@@ -102,11 +106,12 @@ jobs:
            image-type: 'core'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
-            makeflags: "--jobs=5 --output-sync=target"
+            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            grpc-base-image: "ubuntu:22.04"
            tag-suffix: 'sycl-f16-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
@@ -122,4 +127,4 @@ jobs:
            image-type: 'core'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
-            makeflags: "--jobs=5 --output-sync=target"
+            makeflags: "--jobs=4 --output-sync=target"
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -26,6 +26,7 @@ jobs:
      platforms: ${{ matrix.platforms }}
      runs-on: ${{ matrix.runs-on }}
      base-image: ${{ matrix.base-image }}
+      grpc-base-image: ${{ matrix.grpc-base-image }}
      aio: ${{ matrix.aio }}
      makeflags: ${{ matrix.makeflags }}
      latest-image: ${{ matrix.latest-image }}
@@ -129,6 +130,7 @@ jobs:
            image-type: 'extras'
            aio: "-aio-gpu-hipblas"
            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
+            grpc-base-image: "ubuntu:22.04"
            latest-image: 'latest-gpu-hipblas'
            latest-image-aio: 'latest-aio-gpu-hipblas'
            runs-on: 'arc-runner-set'
@@ -140,12 +142,14 @@ jobs:
            ffmpeg: 'false'
            image-type: 'extras'
            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
+            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
-            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f16-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
@@ -157,7 +161,8 @@ jobs:
          - build-type: 'sycl_f32'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
-            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f32-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
@@ -170,7 +175,8 @@ jobs:
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f16-core'
            ffmpeg: 'false'
            image-type: 'core'
@@ -179,7 +185,8 @@ jobs:
          - build-type: 'sycl_f32'
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f32-core'
            ffmpeg: 'false'
            image-type: 'core'
@@ -188,7 +195,8 @@ jobs:
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f16-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
@@ -197,7 +205,8 @@ jobs:
          - build-type: 'sycl_f32'
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f32-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
@@ -210,6 +219,7 @@ jobs:
            ffmpeg: 'true'
            image-type: 'core'
            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
+            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'hipblas'
@@ -219,6 +229,7 @@ jobs:
            ffmpeg: 'false'
            image-type: 'core'
            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
+            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
  
@@ -236,6 +247,7 @@ jobs:
      runs-on: ${{ matrix.runs-on }}
      aio: ${{ matrix.aio }}
      base-image: ${{ matrix.base-image }}
+      grpc-base-image: ${{ matrix.grpc-base-image }}
      makeflags: ${{ matrix.makeflags }}
      latest-image: ${{ matrix.latest-image }}
      latest-image-aio: ${{ matrix.latest-image-aio }}
@@ -258,7 +270,7 @@ jobs:
            aio: "-aio-cpu"
            latest-image: 'latest-cpu'
            latest-image-aio: 'latest-aio-cpu'
-            makeflags: "--jobs=5 --output-sync=target"
+            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
@@ -269,7 +281,7 @@ jobs:
            image-type: 'core'
            base-image: "ubuntu:22.04"
            runs-on: 'ubuntu-latest'
-            makeflags: "--jobs=5 --output-sync=target"
+            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "1"
@@ -280,7 +292,7 @@ jobs:
            image-type: 'core'
            base-image: "ubuntu:22.04"
            runs-on: 'ubuntu-latest'
-            makeflags: "--jobs=5 --output-sync=target"
+            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
@@ -291,7 +303,7 @@ jobs:
            image-type: 'core'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
-            makeflags: "--jobs=5 --output-sync=target"
+            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "1"
@@ -302,4 +314,4 @@ jobs:
            image-type: 'core'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
-            makeflags: "--jobs=5 --output-sync=target"
+            makeflags: "--jobs=4 --output-sync=target"
--- a/.github/workflows/image_build.yml
+++ b/.github/workflows/image_build.yml
@@ -6,6 +6,10 @@ on:
    inputs:
      base-image:
        description: 'Base image'
+        required: true
+        type: string
+      grpc-base-image:
+        description: 'GRPC Base image, must be a compatible image with base-image'
        required: false
        default: ''
        type: string
@@ -57,7 +61,7 @@ on:
      makeflags:
        description: 'Make Flags'
        required: false
-        default: '--jobs=3 --output-sync=target'
+        default: '--jobs=4 --output-sync=target'
        type: string
      aio:
        description: 'AIO Image Name'
@@ -197,29 +201,14 @@ jobs:
          username: ${{ secrets.quayUsername }}
          password: ${{ secrets.quayPassword }}

-      - name: Cache GRPC
-        uses: docker/build-push-action@v5
-        with:
-          builder: ${{ steps.buildx.outputs.name }}
-          build-args: |
-            IMAGE_TYPE=${{ inputs.image-type }}
-            BASE_IMAGE=${{ inputs.base-image }}
-            MAKEFLAGS=${{ inputs.makeflags }}
-            GRPC_VERSION=v1.58.0
-          context: .
-          file: ./Dockerfile
-          cache-from: type=gha
-          cache-to: type=gha,ignore-error=true
-          target: grpc
-          platforms: ${{ inputs.platforms }}
-          push: false
-          tags: ${{ steps.meta.outputs.tags }}
-          labels: ${{ steps.meta.outputs.labels }}
-
      - name: Build and push
        uses: docker/build-push-action@v5
        with:
          builder: ${{ steps.buildx.outputs.name }}
+          # The build-args MUST be an EXACT match between the image cache and other workflow steps that want to use that cache.
+          # This means that even the MAKEFLAGS have to be an EXACT match.
+          # If the build-args are not an EXACT match, it will result in a cache miss, which will require GRPC to be built from scratch.
+          # This is why some build args like GRPC_VERSION and MAKEFLAGS are hardcoded
          build-args: |
            BUILD_TYPE=${{ inputs.build-type }}
            CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
@@ -227,6 +216,9 @@ jobs:
            FFMPEG=${{ inputs.ffmpeg }}
            IMAGE_TYPE=${{ inputs.image-type }}
            BASE_IMAGE=${{ inputs.base-image }}
+            GRPC_BASE_IMAGE=${{ inputs.grpc-base-image || inputs.base-image }}
+            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
+            GRPC_VERSION=v1.63.0
            MAKEFLAGS=${{ inputs.makeflags }}
          context: .
          file: ./Dockerfile
@@ -236,14 +228,6 @@ jobs:
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

-      - name: Inspect image
-        if: github.event_name != 'pull_request'
-        run: |
-          docker pull localai/localai:${{ steps.meta.outputs.version }}
-          docker image inspect localai/localai:${{ steps.meta.outputs.version }}
-          docker pull quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }}
-          docker image inspect quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }}
-
      - name: Build and push AIO image
        if: inputs.aio != ''
        uses: docker/build-push-action@v5
@@ -280,17 +264,21 @@ jobs:
        run: |
          docker pull localai/localai:${{ steps.meta.outputs.version }}
          docker tag localai/localai:${{ steps.meta.outputs.version }} localai/localai:${{ inputs.latest-image }}
+          docker push localai/localai:${{ inputs.latest-image }}
          docker pull quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }}
          docker tag quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }} quay.io/go-skynet/local-ai:${{ inputs.latest-image }}
+          docker push quay.io/go-skynet/local-ai:${{ inputs.latest-image }}
      - name: Latest AIO tag
        # run this on branches, when it is a tag and there is a latest-image defined
        if: github.event_name != 'pull_request' && inputs.latest-image-aio != ''  && github.ref_type == 'tag'
        run: |
          docker pull localai/localai:${{ steps.meta_aio_dockerhub.outputs.version }}
          docker tag localai/localai:${{ steps.meta_aio_dockerhub.outputs.version }} localai/localai:${{ inputs.latest-image-aio }}
+          docker push localai/localai:${{ inputs.latest-image-aio }}
          docker pull quay.io/go-skynet/local-ai:${{ steps.meta_aio.outputs.version }}
          docker tag quay.io/go-skynet/local-ai:${{ steps.meta_aio.outputs.version }} quay.io/go-skynet/local-ai:${{ inputs.latest-image-aio }}
-
+          docker push quay.io/go-skynet/local-ai:${{ inputs.latest-image-aio }}
+  
      - name: job summary
        run: |
          echo "Built image: ${{ steps.meta.outputs.labels }}" >> $GITHUB_STEP_SUMMARY
--- a/.github/workflows/localaibot_automerge.yml
+++ b/.github/workflows/localaibot_automerge.yml
@@ -0,0 +1,35 @@
+name: LocalAI-bot auto-merge
+on:
+- pull_request_target
+
+permissions:
+  contents: write
+  pull-requests: write
+  packages: read
+
+jobs:
+  dependabot:
+    runs-on: ubuntu-latest
+    if: ${{ github.actor == 'localai-bot' }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Approve a PR if not already approved
+        run: |
+          gh pr checkout "$PR_URL"
+            if [ "$(gh pr status --json reviewDecision -q .currentBranch.reviewDecision)" != "APPROVED" ];
+          then
+            gh pr review --approve "$PR_URL"
+          else
+            echo "PR already approved.";
+          fi
+        env:
+          PR_URL: ${{github.event.pull_request.html_url}}
+          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
+
+      - name: Enable auto-merge for LocalAIBot PRs
+        run: gh pr merge --auto --squash "$PR_URL"
+        env:
+          PR_URL: ${{github.event.pull_request.html_url}}
+          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -1,9 +1,11 @@
 name: Build and Release

-on: push
+on: 
+- push
+- pull_request

 env:
-  GRPC_VERSION: v1.58.0
+  GRPC_VERSION: v1.63.0

 permissions:
  contents: write
@@ -33,14 +35,14 @@ jobs:
        uses: actions/checkout@v4
        with:
          submodules: true
-      - uses: actions/setup-go@v4
+      - uses: actions/setup-go@v5
        with:
          go-version: '1.21.x'
          cache: false
      - name: Dependencies
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
+          sudo apt-get install build-essential ffmpeg protobuf-compiler
      - name: Install CUDA Dependencies
        if: ${{ matrix.build == 'cuda12' || matrix.build == 'cuda11' }}
        run: |
@@ -55,7 +57,7 @@ jobs:
          sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
      - name: Cache grpc
        id: cache-grpc
-        uses: actions/cache@v3
+        uses: actions/cache@v4
        with:
          path: grpc
          key: ${{ runner.os }}-grpc-${{ env.GRPC_VERSION }}
@@ -75,6 +77,9 @@ jobs:
          CMAKE_ARGS: "${{ matrix.defines }}"
          BUILD_ID: "${{ matrix.build }}"
        run: |
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
+          export PATH=$PATH:$GOPATH/bin
          if [ "${{ matrix.build }}" == "cuda12" ] || [ "${{ matrix.build }}" == "cuda11" ]; then
            export BUILD_TYPE=cublas
            export PATH=/usr/local/cuda/bin:$PATH
@@ -82,12 +87,12 @@ jobs:
          else
            STATIC=true make dist
          fi
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
        with:
-          name: ${{ matrix.build }}
+          name: LocalAI-linux-${{ matrix.build }}
          path: release/
      - name: Release
-        uses: softprops/action-gh-release@v1
+        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        with:
          files: |
@@ -100,27 +105,24 @@ jobs:
        uses: actions/checkout@v4
        with:
          submodules: true
-      - uses: actions/setup-go@v4
+      - uses: actions/setup-go@v5
        with:
          go-version: '1.21.x'
          cache: false
      - name: Dependencies
        run: |
-          sudo apt-get install -y --no-install-recommends libopencv-dev
+          sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
      - name: Build stablediffusion
        run: |
+          export PATH=$PATH:$GOPATH/bin
          make backend-assets/grpc/stablediffusion
          mkdir -p release && cp backend-assets/grpc/stablediffusion release
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
        with:
          name: stablediffusion
          path: release/
-      - name: Release
-        uses: softprops/action-gh-release@v1
-        if: startsWith(github.ref, 'refs/tags/')
-        with:
-          files: |
-            release/*

  build-macOS:
    strategy:
@@ -138,13 +140,15 @@ jobs:
        uses: actions/checkout@v4
        with:
          submodules: true
-      - uses: actions/setup-go@v4
+      - uses: actions/setup-go@v5
        with:
          go-version: '1.21.x'
          cache: false
      - name: Dependencies
        run: |
          brew install protobuf grpc
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
      - name: Build
        id: build
        env:
@@ -153,13 +157,61 @@ jobs:
        run: |
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
+          export PATH=$PATH:$GOPATH/bin
          make dist
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
        with:
-          name: ${{ matrix.build }}
+          name: LocalAI-MacOS-${{ matrix.build }}
          path: release/
      - name: Release
-        uses: softprops/action-gh-release@v1
+        uses: softprops/action-gh-release@v2
+        if: startsWith(github.ref, 'refs/tags/')
+        with:
+          files: |
+            release/*
+
+
+  build-macOS-arm64:
+    strategy:
+      matrix:
+        include:
+          - build: 'avx2'
+            defines: ''
+          - build: 'avx'
+            defines: '-DLLAMA_AVX2=OFF'
+          - build: 'avx512'
+            defines: '-DLLAMA_AVX512=ON'
+    runs-on: macos-14
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.21.x'
+          cache: false
+      - name: Dependencies
+        run: |
+          brew install protobuf grpc
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
+      - name: Build
+        id: build
+        env:
+          CMAKE_ARGS: "${{ matrix.defines }}"
+          BUILD_ID: "${{ matrix.build }}"
+        run: |
+          export C_INCLUDE_PATH=/usr/local/include
+          export CPLUS_INCLUDE_PATH=/usr/local/include
+          export PATH=$PATH:$GOPATH/bin
+          make dist
+      - uses: actions/upload-artifact@v4
+        with:
+          name: LocalAI-MacOS-arm64-${{ matrix.build }}
+          path: release/
+      - name: Release
+        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        with:
          files: |
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -14,14 +14,17 @@ jobs:
      GO111MODULE: on
    steps:
      - name: Checkout Source
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
+        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
+        if: ${{ github.actor != 'dependabot[bot]' }}
        uses: securego/gosec@master
        with:
          # we let the report trigger content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
      - name: Upload SARIF file
-        uses: github/codeql-action/upload-sarif@v2
+        if: ${{ github.actor != 'dependabot[bot]' }}
+        uses: github/codeql-action/upload-sarif@v3
        with:
          # Path to SARIF file relative to the root of the repository
-          sarif_file: results.sarif
+          sarif_file: results.sarif
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -32,8 +32,9 @@ jobs:
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools==1.63.0
          
          sudo rm -rfv /usr/bin/conda || true

@@ -61,8 +62,9 @@ jobs:
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools==1.63.0
          
          sudo rm -rfv /usr/bin/conda || true

@@ -72,6 +74,37 @@ jobs:
           make --jobs=5 --output-sync=target -C backend/python/sentencetransformers
           make --jobs=5 --output-sync=target -C backend/python/sentencetransformers test

+
+  tests-rerankers:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
+             sudo apt-get update && \
+             sudo apt-get install -y conda
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools==1.63.0
+          
+          sudo rm -rfv /usr/bin/conda || true
+
+      - name: Test rerankers
+        run: |
+           export PATH=$PATH:/opt/conda/bin
+           make --jobs=5 --output-sync=target -C backend/python/rerankers
+           make --jobs=5 --output-sync=target -C backend/python/rerankers test
+
  tests-diffusers:
    runs-on: ubuntu-latest
    steps:
@@ -90,8 +123,9 @@ jobs:
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools==1.63.0
          
          sudo rm -rfv /usr/bin/conda || true

@@ -101,6 +135,35 @@ jobs:
           make --jobs=5 --output-sync=target -C backend/python/diffusers
           make --jobs=5 --output-sync=target -C backend/python/diffusers test

+  tests-parler-tts:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
+             sudo apt-get update && \
+             sudo apt-get install -y conda
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools==1.63.0
+          
+          sudo rm -rfv /usr/bin/conda || true
+
+      - name: Test parler-tts
+        run: |
+           export PATH=$PATH:/opt/conda/bin
+           make --jobs=5 --output-sync=target -C backend/python/parler-tts
+           make --jobs=5 --output-sync=target -C backend/python/parler-tts test

  tests-transformers-musicgen:
    runs-on: ubuntu-latest
@@ -120,8 +183,9 @@ jobs:
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools==1.63.0
          
          sudo rm -rfv /usr/bin/conda || true

@@ -151,8 +215,9 @@ jobs:
  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
  #            sudo apt-get update && \
  #            sudo apt-get install -y conda
-  #         sudo apt-get install -y ca-certificates cmake curl patch
+  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip
  #         sudo apt-get install -y libopencv-dev
+  #         pip install --user grpcio-tools==1.63.0
          
  #         sudo rm -rfv /usr/bin/conda || true

@@ -222,8 +287,9 @@ jobs:
  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
  #            sudo apt-get update && \
  #            sudo apt-get install -y conda
-  #         sudo apt-get install -y ca-certificates cmake curl patch
+  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip
  #         sudo apt-get install -y libopencv-dev
+  #         pip install --user grpcio-tools==1.63.0
          
  #         sudo rm -rfv /usr/bin/conda || true

@@ -254,8 +320,9 @@ jobs:
  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
  #            sudo apt-get update && \
  #            sudo apt-get install -y conda
-  #         sudo apt-get install -y ca-certificates cmake curl patch
+  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip
  #         sudo apt-get install -y libopencv-dev
+  #         pip install --user grpcio-tools==1.63.0
  #         sudo rm -rfv /usr/bin/conda || true
  #     - name: Test vllm
  #       run: |
@@ -280,8 +347,9 @@ jobs:
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
-          sudo apt-get install -y libopencv-dev    
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools==1.63.0
          sudo rm -rfv /usr/bin/conda || true
      - name: Test vall-e-x
        run: |
@@ -307,7 +375,8 @@ jobs:
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch espeak espeak-ng          
+          sudo apt-get install -y ca-certificates cmake curl patch espeak espeak-ng python3-pip
+          pip install --user grpcio-tools==1.63.0
          sudo rm -rfv /usr/bin/conda || true

      - name: Test coqui
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -10,7 +10,7 @@ on:
      - '*'

 env:
-  GRPC_VERSION: v1.58.0
+  GRPC_VERSION: v1.63.0

 concurrency:
  group: ci-tests-${{ github.head_ref || github.ref }}-${{ github.repository }}
@@ -60,7 +60,7 @@ jobs:
        with: 
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
-        uses: actions/setup-go@v4
+        uses: actions/setup-go@v5
        with:
          go-version: ${{ matrix.go-version }}
          cache: false
@@ -70,17 +70,27 @@ jobs:
      - name: Dependencies
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
+          sudo apt-get install build-essential curl ffmpeg
          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+             gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y ca-certificates cmake patch python3-pip unzip
          sudo apt-get install -y libopencv-dev
-          
+
+          curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
+          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+          rm protoc.zip
+
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
+
+          # The python3-grpc-tools package in 22.04 is too old; pin to match GRPC_VERSION and the other CI jobs
+          pip install --user grpcio-tools==1.63.0
+
          sudo rm -rfv /usr/bin/conda || true
          PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers

@@ -89,10 +99,10 @@ jobs:
          GO_TAGS="tts" make -C sources/go-piper piper.o && \
          sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
          # Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
-          GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
+          PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
      - name: Cache grpc
        id: cache-grpc
-        uses: actions/cache@v3
+        uses: actions/cache@v4
        with:
          path: grpc
          key: ${{ runner.os }}-grpc-${{ env.GRPC_VERSION }}
@@ -108,11 +118,14 @@ jobs:
          cd grpc && cd cmake/build && sudo make --jobs 5 install
      - name: Test
        run: |
-          GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
+          PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3
-        timeout-minutes: 5
+        uses: mxschmitt/action-tmate@v3.18
+        with:
+          detached: true
+          connect-timeout-seconds: 180
+          limit-access-to-actor: true

  tests-aio-container:
    runs-on: ubuntu-latest
@@ -163,8 +176,11 @@ jobs:
            make run-e2e-aio
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3
-        timeout-minutes: 5
+        uses: mxschmitt/action-tmate@v3.18
+        with:
+          detached: true
+          connect-timeout-seconds: 180
+          limit-access-to-actor: true

  tests-apple:
    runs-on: macOS-14
@@ -177,7 +193,7 @@ jobs:
        with: 
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
-        uses: actions/setup-go@v4
+        uses: actions/setup-go@v5
        with:
          go-version: ${{ matrix.go-version }}
          cache: false
@@ -186,7 +202,8 @@ jobs:
        run: go version
      - name: Dependencies
        run: |
-          brew install protobuf grpc make
+          brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc
+          pip install --user grpcio-tools==1.63.0
      - name: Test
        run: |
          export C_INCLUDE_PATH=/usr/local/include
@@ -196,5 +213,8 @@ jobs:
          BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make --jobs 4 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3
-        timeout-minutes: 5
+        uses: mxschmitt/action-tmate@v3.18
+        with:
+          detached: true
+          connect-timeout-seconds: 180
+          limit-access-to-actor: true
--- a/.github/workflows/update_swagger.yaml
+++ b/.github/workflows/update_swagger.yaml
@@ -0,0 +1,31 @@
+name: Update swagger
+on:
+  schedule:
+    - cron: 0 20 * * *
+  workflow_dispatch:
+jobs:
+  swagger:
+    strategy:
+      fail-fast: false
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-go@v5
+        with:
+          go-version: 'stable'
+      - run: |
+          go install github.com/swaggo/swag/cmd/swag@latest
+      - name: Bump swagger 🔧
+        run: |
+          make swagger
+      - name: Create Pull Request
+        uses: peter-evans/create-pull-request@v6
+        with:
+          token: ${{ secrets.UPDATE_BOT_TOKEN }}
+          push-to-fork: ci-forks/LocalAI
+          commit-message: 'feat(swagger): update swagger'
+          title: 'feat(swagger): update swagger'
+          branch: "update/swagger"
+          body:  Update swagger
+          signoff: true
+
--- a/.github/workflows/yaml-check.yml
+++ b/.github/workflows/yaml-check.yml
@@ -0,0 +1,18 @@
+name: 'Yamllint GitHub Actions'
+on:
+  - pull_request
+jobs:
+  yamllint:
+    name: 'Yamllint'
+    runs-on: ubuntu-latest
+    steps:
+      - name: 'Checkout'
+        uses: actions/checkout@v4
+      - name: 'Yamllint'
+        uses: karancode/yamllint-github-action@master
+        with:
+          yamllint_file_or_dir: 'gallery'
+          yamllint_strict: false
+          yamllint_comment: true
+        env:
+          GITHUB_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.gitignore
+++ b/.gitignore
@@ -39,3 +39,11 @@ backend-assets/*
 !backend-assets/.keep
 prepare
 /ggml-metal.metal
+
+# Protobuf generated files
+*.pb.go
+*pb2.py
+*pb2_grpc.py
+
+# SonarQube
+.scannerwork
--- a/.yamllint
+++ b/.yamllint
@@ -0,0 +1,4 @@
+extends: default
+
+rules:
+    line-length: disable
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,4 +1,4 @@
-# Contributing to localAI
+# Contributing to LocalAI

 Thank you for your interest in contributing to LocalAI! We appreciate your time and effort in helping to improve our project. Before you get started, please take a moment to review these guidelines.

@@ -29,8 +29,9 @@ Thank you for your interest in contributing to LocalAI! We appreciate your time

 1. Clone the repository: `git clone https://github.com/go-skynet/LocalAI.git`
 2. Navigate to the project directory: `cd LocalAI`
-3. Install the required dependencies: `make prepare`
-4. Run LocalAI: `make run`
+3. Install the required dependencies ( see https://localai.io/basics/build/#build-localai-locally )
+4. Build LocalAI: `make build`
+5. Run LocalAI: `./local-ai`

 ## Contributing

@@ -59,14 +60,29 @@ If you find a bug, have a feature request, or encounter any issues, please check

 `make test` cannot handle all the model now. Please be sure to add a test case for the new features or the part was changed.

+### Running AIO tests
+
+All-In-One images have a set of tests that automatically verify that most of the endpoints work correctly. A typical flow is:
+
+```bash
+# Build the LocalAI docker image
+make DOCKER_IMAGE=local-ai docker
+
+# Build the corresponding AIO image
+BASE_IMAGE=local-ai DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
+
+# Run the AIO e2e tests
+LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio make run-e2e-aio
+```
+
 ## Documentation

- We are welcome the contribution of the documents, please open new PR in the official document repo [localai-website](https://github.com/go-skynet/localai-website)
-
+We welcome contributions to the documentation: please open a new PR or create a new issue. The documentation is available under `docs/` https://github.com/mudler/LocalAI/tree/master/docs
+ 
 ## Community and Communication

 - You can reach out via the Github issue tracker.
 - Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
 - Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)

---
+---
--- a/205
+++ b/205
@@ -1,30 +1,45 @@
 ARG IMAGE_TYPE=extras
 ARG BASE_IMAGE=ubuntu:22.04
+ARG GRPC_BASE_IMAGE=${BASE_IMAGE}

-# extras or core
-FROM ${BASE_IMAGE} as requirements-core
+# The requirements-core target is common to all images.  Nothing should be placed in requirements-core unless every single build will use it.
+FROM ${BASE_IMAGE} AS requirements-core

 USER root

 ARG GO_VERSION=1.21.7
-ARG BUILD_TYPE
-ARG CUDA_MAJOR_VERSION=11
-ARG CUDA_MINOR_VERSION=7
 ARG TARGETARCH
 ARG TARGETVARIANT

-ENV BUILD_TYPE=${BUILD_TYPE}
 ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh"
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"

 ARG GO_TAGS="stablediffusion tinydream tts"

 RUN apt-get update && \
-    apt-get install -y ca-certificates curl patch pip cmake git && apt-get clean
+    apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        cmake \
+        curl \
+        git \
+        python3-pip \
+        python-is-python3 \
+        unzip && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* && \
+    pip install --upgrade pip

 # Install Go
-RUN curl -L -s https://go.dev/dl/go$GO_VERSION.linux-$TARGETARCH.tar.gz | tar -C /usr/local -xz
-ENV PATH $PATH:/usr/local/go/bin
+RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
+ENV PATH $PATH:/root/go/bin:/usr/local/go/bin
+
+# Install grpc compilers
+RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@latest && \
+    go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
+
+# Install grpcio-tools (the version in 22.04 is too old)
+RUN pip install --user grpcio-tools

 COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
 RUN update-ca-certificates
@@ -33,16 +48,6 @@ RUN update-ca-certificates
 RUN echo "Target Architecture: $TARGETARCH"
 RUN echo "Target Variant: $TARGETVARIANT"

-# CuBLAS requirements
-RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
-    apt-get install -y software-properties-common && \
-    curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
-    dpkg -i cuda-keyring_1.1-1_all.deb && \
-    rm -f cuda-keyring_1.1-1_all.deb && \
-    apt-get update && \
-    apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}  && apt-get clean \
-    ; fi
-
 # Cuda
 ENV PATH /usr/local/cuda/bin:${PATH}

@@ -50,10 +55,12 @@ ENV PATH /usr/local/cuda/bin:${PATH}
 ENV PATH /opt/rocm/bin:${PATH}

 # OpenBLAS requirements and stable diffusion
-RUN apt-get install -y \
-    libopenblas-dev \
-    libopencv-dev \ 
-    && apt-get clean
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        libopenblas-dev \
+        libopencv-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*

 # Set up OpenCV
 RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
@@ -66,56 +73,114 @@ RUN test -n "$TARGETARCH" \
 ###################################
 ###################################

-FROM requirements-core as requirements-extras
+# The requirements-extras target is for any builds with IMAGE_TYPE=extras. Nothing should be placed in this target unless every IMAGE_TYPE=extras build will use it
+FROM requirements-core AS requirements-extras

-RUN curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends gpg && \
+    curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
    install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
    gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list && \
    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list && \
    apt-get update && \
-    apt-get install -y conda && apt-get clean
+    apt-get install -y --no-install-recommends \
+        conda && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*

 ENV PATH="/root/.cargo/bin:${PATH}"
-RUN apt-get install -y python3-pip && apt-get clean
-RUN pip install --upgrade pip

 RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
-RUN apt-get install -y espeak-ng espeak && apt-get clean
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        espeak-ng \
+        espeak && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*

-RUN if [ ! -e /usr/bin/python ]; then \
-	  ln -s /usr/bin/python3 /usr/bin/python \
+###################################
+###################################
+
+# The requirements-drivers target is for BUILD_TYPE specific items.  If you need to install something specific to CUDA, or specific to ROCM, it goes here.
+# This target will be built on top of requirements-core or requirements-extras as determined by the IMAGE_TYPE build-arg
+FROM requirements-${IMAGE_TYPE} AS requirements-drivers
+
+ARG BUILD_TYPE
+ARG CUDA_MAJOR_VERSION=11
+ARG CUDA_MINOR_VERSION=7
+
+ENV BUILD_TYPE=${BUILD_TYPE}
+
+# CuBLAS requirements
+RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
+        apt-get update && \
+        apt-get install -y  --no-install-recommends \
+            software-properties-common && \
+        curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
+        dpkg -i cuda-keyring_1.1-1_all.deb && \
+        rm -f cuda-keyring_1.1-1_all.deb && \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* \
+    ; fi
+
+# If we are building with clblas support, we need the libraries for the builds
+RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            libclblast-dev && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* \
    ; fi

 ###################################
 ###################################

-FROM ${BASE_IMAGE} as grpc
+# The grpc target does one thing, it builds and installs GRPC.  This is in its own layer so that it can be effectively cached by CI.
+# You probably don't need to change anything here, and if you do, make sure that CI is adjusted so that the cache continues to work.
+FROM ${GRPC_BASE_IMAGE} AS grpc

-ARG MAKEFLAGS
+# This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
+ARG GRPC_MAKEFLAGS="-j4 -Otarget"
 ARG GRPC_VERSION=v1.58.0

-ENV MAKEFLAGS=${MAKEFLAGS}
+ENV MAKEFLAGS=${GRPC_MAKEFLAGS}

 WORKDIR /build

 RUN apt-get update && \
-    apt-get install -y g++ cmake git && \
+    apt-get install -y --no-install-recommends \
+        ca-certificates \
+        build-essential \
+        cmake \
+        git && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

-RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc
-
-RUN cd grpc && \
-    mkdir -p cmake/build && \
-    cd cmake/build && \
-    cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF ../.. && \
-    make
+# We install GRPC to a different prefix here so that we can copy in only the build artifacts later
+# saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
+# and running make install in the target container
+RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+    mkdir -p /build/grpc/cmake/build && \
+    cd /build/grpc/cmake/build && \
+    cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX:PATH=/opt/grpc ../.. && \
+    make && \
+    make install && \
+    rm -rf /build

 ###################################
 ###################################

-FROM requirements-${IMAGE_TYPE} as builder
+# The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
+# Adjustments to the build process should likely be made here.
+FROM requirements-drivers AS builder

 ARG GO_TAGS="stablediffusion tts"
 ARG GRPC_BACKENDS
@@ -133,34 +198,37 @@ WORKDIR /build
 COPY . .
 COPY .git .
 RUN echo "GO_TAGS: $GO_TAGS"
+
 RUN make prepare

-# If we are building with clblas support, we need the libraries for the builds
-RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
-    apt-get update && \
-    apt-get install -y libclblast-dev && \
-    apt-get clean \
-    ; fi
+# We need protoc installed, and the version in 22.04 is too old.  We will create one as part of installing the GRPC build below
+# but that will also bring in a newer version of absl which stablediffusion cannot compile with.  This version of protoc is only
+# here so that we can generate the grpc code for the stablediffusion build
+RUN curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
+    unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+    rm protoc.zip

 # stablediffusion does not tolerate a newer version of abseil, build it first
 RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build

-COPY --from=grpc /build/grpc ./grpc/
-
-RUN cd /build/grpc/cmake/build && make install
+# Install the pre-built GRPC
+COPY --from=grpc /opt/grpc /usr/local

 # Rebuild with defaults backends
+WORKDIR /build
 RUN make build

 RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
-    mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ \
-    touch /build/sources/go-piper/piper-phonemize/pi/lib/keep \
+        mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ && \
+        touch /build/sources/go-piper/piper-phonemize/pi/lib/keep \
    ; fi

 ###################################
 ###################################

-FROM requirements-${IMAGE_TYPE}
+# This is the final target. The result of this target will be the image uploaded to the registry.
+# If you cannot find a more suitable place for an addition, this layer is a suitable place for it.
+FROM requirements-drivers

 ARG FFMPEG
 ARG BUILD_TYPE
@@ -181,14 +249,11 @@ ENV PIP_CACHE_PURGE=true

 # Add FFmpeg
 RUN if [ "${FFMPEG}" = "true" ]; then \
-    apt-get install -y ffmpeg && apt-get clean \
-    ; fi
-
-# Add OpenCL
-RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
-    apt-get update && \
-    apt-get install -y libclblast1 && \
-    apt-get clean \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            ffmpeg && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* \
    ; fi

 WORKDIR /build
@@ -200,9 +265,9 @@ WORKDIR /build
 COPY . .

 COPY --from=builder /build/sources ./sources/
-COPY --from=grpc /build/grpc ./grpc/
+COPY --from=grpc /opt/grpc /usr/local

-RUN make prepare-sources && cd /build/grpc/cmake/build && make install && rm -rf grpc
+RUN make prepare-sources

 # Copy the binary
 COPY --from=builder /build/local-ai ./
@@ -232,6 +297,9 @@ RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
    make -C backend/python/sentencetransformers \
    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+    make -C backend/python/rerankers \
+    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
    make -C backend/python/transformers \
    ; fi
@@ -250,6 +318,9 @@ RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
    make -C backend/python/transformers-musicgen \
    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+    make -C backend/python/parler-tts \
+    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
    make -C backend/python/coqui \
    ; fi
@@ -259,7 +330,7 @@ RUN mkdir -p /build/models

 # Define the health check command
 HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
-  CMD curl -f $HEALTHCHECK_ENDPOINT || exit 1
+  CMD curl -f ${HEALTHCHECK_ENDPOINT} || exit 1
  
 VOLUME /build/models
 EXPOSE 8080
--- a/240
+++ b/240
@@ -5,7 +5,7 @@ BINARY_NAME=local-ai

 # llama.cpp versions
 GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=cc4a95426d17417d3c83f12bdb514fbe8abe2a88
+CPPLLAMA_VERSION?=6ecf3189e00a1e8e737a78b6d10e1d7006e050a2

 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -16,7 +16,7 @@ RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
 RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6

 # whisper.cpp version
-WHISPER_CPP_VERSION?=13c22321d1ac758ce68a429c23104e234b440769
+WHISPER_CPP_VERSION?=8fac6455ffeb0a0950a84e790ddb74f7290d33c4

 # bert.cpp version
 BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
@@ -25,10 +25,10 @@ BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
 PIPER_VERSION?=9d0100873a7dbb0824dfea40e8cec70a1b110759

 # stablediffusion version
-STABLEDIFFUSION_VERSION?=362df9da29f882dbf09ade61972d16a1f53c3485
+STABLEDIFFUSION_VERSION?=4a3cd6aeae6f66ee57eae9a0075f8c58c3a6a38f

 # tinydream version
-TINYDREAM_VERSION?=22a12a4bc0ac5455856f28f3b771331a551a4293
+TINYDREAM_VERSION?=c04fa463ace9d9a6464313aa5f9cd0f953b6c057

 export BUILD_TYPE?=
 export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
@@ -99,7 +99,7 @@ endif
 ifeq ($(BUILD_TYPE),cublas)
 	CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
 	export LLAMA_CUBLAS=1
-	export WHISPER_CUBLAS=1
+	export WHISPER_CUDA=1
 	CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda
 endif

@@ -179,20 +179,20 @@ endif
 all: help

 ## BERT embeddings
-sources/go-bert:
-	git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp sources/go-bert
-	cd sources/go-bert && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1
+sources/go-bert.cpp:
+	git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp sources/go-bert.cpp
+	cd sources/go-bert.cpp && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1

-sources/go-bert/libgobert.a: sources/go-bert
-	$(MAKE) -C sources/go-bert libgobert.a
+sources/go-bert.cpp/libgobert.a: sources/go-bert.cpp
+	$(MAKE) -C sources/go-bert.cpp libgobert.a

-## go-llama-ggml
-sources/go-llama-ggml:
-	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama-ggml
-	cd sources/go-llama-ggml && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1
+## go-llama.cpp
+sources/go-llama.cpp:
+	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama.cpp
+	cd sources/go-llama.cpp && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1

-sources/go-llama-ggml/libbinding.a: sources/go-llama-ggml
-	$(MAKE) -C sources/go-llama-ggml BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
+sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
+	$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a

 ## go-piper
 sources/go-piper:
@@ -211,12 +211,12 @@ sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a: sources/gpt4all
 	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ libgpt4all.a

 ## RWKV
-sources/go-rwkv:
-	git clone --recurse-submodules $(RWKV_REPO) sources/go-rwkv
-	cd sources/go-rwkv && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1
+sources/go-rwkv.cpp:
+	git clone --recurse-submodules $(RWKV_REPO) sources/go-rwkv.cpp
+	cd sources/go-rwkv.cpp && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1

-sources/go-rwkv/librwkv.a: sources/go-rwkv
-	cd sources/go-rwkv && cd rwkv.cpp &&	cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF &&	cmake --build . && 	cp librwkv.a ..
+sources/go-rwkv.cpp/librwkv.a: sources/go-rwkv.cpp
+	cd sources/go-rwkv.cpp && cd rwkv.cpp &&	cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF &&	cmake --build . && 	cp librwkv.a ..

 ## stable diffusion
 sources/go-stable-diffusion:
@@ -236,23 +236,24 @@ sources/go-tiny-dream/libtinydream.a: sources/go-tiny-dream

 ## whisper
 sources/whisper.cpp:
-	git clone https://github.com/ggerganov/whisper.cpp.git sources/whisper.cpp
+	git clone https://github.com/ggerganov/whisper.cpp sources/whisper.cpp
 	cd sources/whisper.cpp && git checkout -b build $(WHISPER_CPP_VERSION) && git submodule update --init --recursive --depth 1

 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
-	cd sources/whisper.cpp && make libwhisper.a
+	cd sources/whisper.cpp && $(MAKE) libwhisper.a

-get-sources: sources/go-llama-ggml sources/gpt4all sources/go-piper sources/go-rwkv sources/whisper.cpp sources/go-bert sources/go-stable-diffusion sources/go-tiny-dream
+get-sources: sources/go-llama.cpp sources/gpt4all sources/go-piper sources/go-rwkv.cpp sources/whisper.cpp sources/go-bert.cpp sources/go-stable-diffusion sources/go-tiny-dream

 replace:
-	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv
+	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert.cpp
 	$(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream
 	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
 	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
 	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp

 dropreplace:
 	$(GOCMD) mod edit -dropreplace github.com/donomii/go-rwkv.cpp
@@ -271,12 +272,12 @@ prepare-sources: get-sources replace
 ## GENERIC
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
-	$(MAKE) -C sources/go-llama-ggml clean
+	$(MAKE) -C sources/go-llama.cpp clean
 	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ clean
-	$(MAKE) -C sources/go-rwkv clean
+	$(MAKE) -C sources/go-rwkv.cpp clean
 	$(MAKE) -C sources/whisper.cpp clean
 	$(MAKE) -C sources/go-stable-diffusion clean
-	$(MAKE) -C sources/go-bert clean
+	$(MAKE) -C sources/go-bert.cpp clean
 	$(MAKE) -C sources/go-piper clean
 	$(MAKE) -C sources/go-tiny-dream clean
 	$(MAKE) build
@@ -289,10 +290,12 @@ clean: ## Remove build related file
 	rm -rf ./sources
 	rm -rf $(BINARY_NAME)
 	rm -rf release/
-	rm -rf backend-assets
+	rm -rf backend-assets/*
 	$(MAKE) -C backend/cpp/grpc clean
 	$(MAKE) -C backend/cpp/llama clean
 	$(MAKE) dropreplace
+	$(MAKE) protogen-clean
+	rmdir pkg/grpc/proto || true

 clean-tests:
 	rm -rf test-models
@@ -416,30 +419,152 @@ help: ## Show this help.
 		else if (/^## .*$$/) {printf "  ${CYAN}%s${RESET}\n", substr($$1,4)} \
 		}' $(MAKEFILE_LIST)

+.PHONY: protogen
 protogen: protogen-go protogen-python

+.PHONY: protogen-clean
+protogen-clean: protogen-go-clean protogen-python-clean
+
+.PHONY: protogen-go
 protogen-go:
+	mkdir -p pkg/grpc/proto
 	protoc -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
    backend/backend.proto

-protogen-python:
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/sentencetransformers/ --grpc_python_out=backend/python/sentencetransformers/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/transformers/ --grpc_python_out=backend/python/transformers/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/transformers-musicgen/ --grpc_python_out=backend/python/transformers-musicgen/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/autogptq/ --grpc_python_out=backend/python/autogptq/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/exllama/ --grpc_python_out=backend/python/exllama/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/bark/ --grpc_python_out=backend/python/bark/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/diffusers/ --grpc_python_out=backend/python/diffusers/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/coqui/ --grpc_python_out=backend/python/coqui/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vall-e-x/ --grpc_python_out=backend/python/vall-e-x/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vllm/ --grpc_python_out=backend/python/vllm/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/petals/ --grpc_python_out=backend/python/petals/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/mamba/ --grpc_python_out=backend/python/mamba/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/exllama2/ --grpc_python_out=backend/python/exllama2/ backend/backend.proto
+.PHONY: protogen-go-clean
+protogen-go-clean:
+	$(RM) pkg/grpc/proto/backend.pb.go pkg/grpc/proto/backend_grpc.pb.go
+	$(RM) bin/*
+
+.PHONY: protogen-python
+protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen petals-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen
+
+.PHONY: protogen-python-clean
+protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean petals-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean
+
+.PHONY: autogptq-protogen
+autogptq-protogen:
+	$(MAKE) -C backend/python/autogptq protogen
+
+.PHONY: autogptq-protogen-clean
+autogptq-protogen-clean:
+	$(MAKE) -C backend/python/autogptq protogen-clean
+
+.PHONY: bark-protogen
+bark-protogen:
+	$(MAKE) -C backend/python/bark protogen
+
+.PHONY: bark-protogen-clean
+bark-protogen-clean:
+	$(MAKE) -C backend/python/bark protogen-clean
+
+.PHONY: coqui-protogen
+coqui-protogen:
+	$(MAKE) -C backend/python/coqui protogen
+
+.PHONY: coqui-protogen-clean
+coqui-protogen-clean:
+	$(MAKE) -C backend/python/coqui protogen-clean
+
+.PHONY: diffusers-protogen
+diffusers-protogen:
+	$(MAKE) -C backend/python/diffusers protogen
+
+.PHONY: diffusers-protogen-clean
+diffusers-protogen-clean:
+	$(MAKE) -C backend/python/diffusers protogen-clean
+
+.PHONY: exllama-protogen
+exllama-protogen:
+	$(MAKE) -C backend/python/exllama protogen
+
+.PHONY: exllama-protogen-clean
+exllama-protogen-clean:
+	$(MAKE) -C backend/python/exllama protogen-clean
+
+.PHONY: exllama2-protogen
+exllama2-protogen:
+	$(MAKE) -C backend/python/exllama2 protogen
+
+.PHONY: exllama2-protogen-clean
+exllama2-protogen-clean:
+	$(MAKE) -C backend/python/exllama2 protogen-clean
+
+.PHONY: mamba-protogen
+mamba-protogen:
+	$(MAKE) -C backend/python/mamba protogen
+
+.PHONY: mamba-protogen-clean
+mamba-protogen-clean:
+	$(MAKE) -C backend/python/mamba protogen-clean
+
+.PHONY: petals-protogen
+petals-protogen:
+	$(MAKE) -C backend/python/petals protogen
+
+.PHONY: petals-protogen-clean
+petals-protogen-clean:
+	$(MAKE) -C backend/python/petals protogen-clean
+
+.PHONY: rerankers-protogen
+rerankers-protogen:
+	$(MAKE) -C backend/python/rerankers protogen
+
+.PHONY: rerankers-protogen-clean
+rerankers-protogen-clean:
+	$(MAKE) -C backend/python/rerankers protogen-clean
+
+.PHONY: sentencetransformers-protogen
+sentencetransformers-protogen:
+	$(MAKE) -C backend/python/sentencetransformers protogen
+
+.PHONY: sentencetransformers-protogen-clean
+sentencetransformers-protogen-clean:
+	$(MAKE) -C backend/python/sentencetransformers protogen-clean
+
+.PHONY: transformers-protogen
+transformers-protogen:
+	$(MAKE) -C backend/python/transformers protogen
+
+.PHONY: transformers-protogen-clean
+transformers-protogen-clean:
+	$(MAKE) -C backend/python/transformers protogen-clean
+
+.PHONY: parler-tts-protogen
+parler-tts-protogen:
+	$(MAKE) -C backend/python/parler-tts protogen
+
+.PHONY: parler-tts-protogen-clean
+parler-tts-protogen-clean:
+	$(MAKE) -C backend/python/parler-tts protogen-clean
+
+.PHONY: transformers-musicgen-protogen
+transformers-musicgen-protogen:
+	$(MAKE) -C backend/python/transformers-musicgen protogen
+
+.PHONY: transformers-musicgen-protogen-clean
+transformers-musicgen-protogen-clean:
+	$(MAKE) -C backend/python/transformers-musicgen protogen-clean
+
+.PHONY: vall-e-x-protogen
+vall-e-x-protogen:
+	$(MAKE) -C backend/python/vall-e-x protogen
+
+.PHONY: vall-e-x-protogen-clean
+vall-e-x-protogen-clean:
+	$(MAKE) -C backend/python/vall-e-x protogen-clean
+
+.PHONY: vllm-protogen
+vllm-protogen:
+	$(MAKE) -C backend/python/vllm protogen
+
+.PHONY: vllm-protogen-clean
+vllm-protogen-clean:
+	$(MAKE) -C backend/python/vllm protogen-clean

 ## GRPC
 # Note: it is duplicated in the Dockerfile
-prepare-extra-conda-environments:
+prepare-extra-conda-environments: protogen-python
 	$(MAKE) -C backend/python/autogptq
 	$(MAKE) -C backend/python/bark
 	$(MAKE) -C backend/python/coqui
@@ -447,14 +572,16 @@ prepare-extra-conda-environments:
 	$(MAKE) -C backend/python/vllm
 	$(MAKE) -C backend/python/mamba
 	$(MAKE) -C backend/python/sentencetransformers
+	$(MAKE) -C backend/python/rerankers
 	$(MAKE) -C backend/python/transformers
 	$(MAKE) -C backend/python/transformers-musicgen
+	$(MAKE) -C backend/python/parler-tts
 	$(MAKE) -C backend/python/vall-e-x
 	$(MAKE) -C backend/python/exllama
 	$(MAKE) -C backend/python/petals
 	$(MAKE) -C backend/python/exllama2

-prepare-test-extra:
+prepare-test-extra: protogen-python
 	$(MAKE) -C backend/python/transformers
 	$(MAKE) -C backend/python/diffusers

@@ -478,11 +605,11 @@ backend-assets/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/
 	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
 	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true

-backend-assets/grpc: replace
+backend-assets/grpc: protogen-go replace
 	mkdir -p backend-assets/grpc

-backend-assets/grpc/bert-embeddings: sources/go-bert sources/go-bert/libgobert.a backend-assets/grpc
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert LIBRARY_PATH=$(CURDIR)/sources/go-bert \
+backend-assets/grpc/bert-embeddings: sources/go-bert.cpp sources/go-bert.cpp/libgobert.a backend-assets/grpc
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert.cpp LIBRARY_PATH=$(CURDIR)/sources/go-bert.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/

 backend-assets/grpc/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a backend-assets/gpt4all backend-assets/grpc
@@ -524,17 +651,16 @@ ifeq ($(BUILD_TYPE),metal)
 	cp backend/cpp/llama/llama.cpp/build/bin/default.metallib backend-assets/grpc/
 endif

-backend-assets/grpc/llama-ggml: sources/go-llama-ggml sources/go-llama-ggml/libbinding.a backend-assets/grpc
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama-ggml
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama-ggml LIBRARY_PATH=$(CURDIR)/sources/go-llama-ggml \
+backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/

 backend-assets/grpc/piper: sources/go-piper sources/go-piper/libpiper_binding.a backend-assets/grpc backend-assets/espeak-ng-data
 	CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-piper \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/

-backend-assets/grpc/rwkv: sources/go-rwkv sources/go-rwkv/librwkv.a backend-assets/grpc
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv LIBRARY_PATH=$(CURDIR)/sources/go-rwkv \
+backend-assets/grpc/rwkv: sources/go-rwkv.cpp sources/go-rwkv.cpp/librwkv.a backend-assets/grpc
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv.cpp LIBRARY_PATH=$(CURDIR)/sources/go-rwkv.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv

 backend-assets/grpc/stablediffusion: sources/go-stable-diffusion sources/go-stable-diffusion/libstablediffusion.a backend-assets/grpc
@@ -581,7 +707,7 @@ docker-aio-all:

 docker-image-intel:
 	docker build \
-		--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04 \
+		--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04 \
 		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 		--build-arg GO_TAGS="none" \
 		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -589,7 +715,7 @@ docker-image-intel:

 docker-image-intel-xpu:
 	docker build \
-		--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04 \
+		--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04 \
 		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 		--build-arg GO_TAGS="none" \
 		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -597,4 +723,4 @@ docker-image-intel-xpu:

 .PHONY: swagger
 swagger:
-	swag init -g core/http/api.go --output swagger
+	swag init -g core/http/app.go --output swagger
--- a/README.md
+++ b/README.md
@@ -44,25 +44,23 @@

 [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)

-**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that’s compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU.
+**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that’s compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and more) locally or on-prem with consumer-grade hardware, supporting multiple model families. It does not require a GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).

 ## 🔥🔥 Hot topics / Roadmap

 [Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

- Landing page: https://github.com/mudler/LocalAI/pull/1922
+- Reranker API: https://github.com/mudler/LocalAI/pull/2121
+- Gallery WebUI: https://github.com/mudler/LocalAI/pull/2104
+- llama3: https://github.com/mudler/LocalAI/discussions/2076
+- Parler-TTS: https://github.com/mudler/LocalAI/pull/2027
 - Openvino support: https://github.com/mudler/LocalAI/pull/1892
 - Vector store: https://github.com/mudler/LocalAI/pull/1795
 - All-in-one container image: https://github.com/mudler/LocalAI/issues/1855
- Parallel function calling: https://github.com/mudler/LocalAI/pull/1726 / Tools API support: https://github.com/mudler/LocalAI/pull/1715
- Upload file API: https://github.com/mudler/LocalAI/pull/1703
- ROCm container images: https://github.com/mudler/LocalAI/pull/1595 / Intel GPU support (sycl, transformers, diffusers): https://github.com/mudler/LocalAI/issues/1653
- Mamba support: https://github.com/mudler/LocalAI/pull/1589
- Start and share models with config file: https://github.com/mudler/LocalAI/pull/1522
- 🐸 Coqui: https://github.com/mudler/LocalAI/pull/1489
- Img2vid https://github.com/mudler/LocalAI/pull/1442

 Hot topics (looking for contributors):
+
+- WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
 - Backends v2: https://github.com/mudler/LocalAI/issues/1126
 - Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
 - Assistant API: https://github.com/mudler/LocalAI/issues/1273
@@ -93,7 +91,8 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
 - 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
 - ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
 - 🖼️ [Download Models directly from Huggingface ](https://localai.io/models/)
- 🆕 [Vision API](https://localai.io/features/gpt-vision/)
+- 🥽 [Vision API](https://localai.io/features/gpt-vision/)
+- 🆕 [Reranker API](https://localai.io/features/reranker/)

 ## 💻 Usage

--- a/aio/cpu/rerank.yaml
+++ b/aio/cpu/rerank.yaml
@@ -0,0 +1,27 @@
+name: jina-reranker-v1-base-en
+backend: rerankers
+parameters:
+  model: cross-encoder
+
+usage: |
+    You can test this model with curl like this:
+
+    curl http://localhost:8080/v1/rerank \
+      -H "Content-Type: application/json" \
+      -d '{
+      "model": "jina-reranker-v1-base-en",
+      "query": "Organic skincare products for sensitive skin",
+      "documents": [
+        "Eco-friendly kitchenware for modern homes",
+        "Biodegradable cleaning supplies for eco-conscious consumers",
+        "Organic cotton baby clothes for sensitive skin",
+        "Natural organic skincare range for sensitive skin",
+        "Tech gadgets for smart homes: 2024 edition",
+        "Sustainable gardening tools and compost solutions",
+        "Sensitive skin-friendly facial cleansers and toners",
+        "Organic food wraps and storage solutions",
+        "All-natural pet food for dogs with allergies",
+        "Yoga mats made from recycled materials"
+      ],
+      "top_n": 3
+    }'
--- a/aio/cpu/text-to-text.yaml
+++ b/aio/cpu/text-to-text.yaml
@@ -1,20 +1,27 @@
 name: gpt-4
 mmap: true
 parameters:
-  model: huggingface://NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/Hermes-2-Pro-Mistral-7B.Q2_K.gguf
+  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf

 template:
  chat_message: |
    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
-    {{- if .FunctionCall }}<tool_call>{{end}}
-    {{- if eq .RoleName "tool" }}<tool_result>{{end }}
-    {{- if .Content}}
-    {{.Content}}
+    {{- if .FunctionCall }}
+    <tool_call>
+    {{- else if eq .RoleName "tool" }}
+    <tool_response>
    {{- end }}
-    {{- if .FunctionCall}}{{toJson .FunctionCall}}{{end }}
-    {{- if .FunctionCall }}</tool_call>{{end }}
-    {{- if eq .RoleName "tool" }}</tool_result>{{end }}
-    <|im_end|>
+    {{- if .Content}}
+    {{.Content }}
+    {{- end }}
+    {{- if .FunctionCall}}
+    {{toJson .FunctionCall}}
+    {{- end }}
+    {{- if .FunctionCall }}
+    </tool_call>
+    {{- else if eq .RoleName "tool" }}
+    </tool_response>
+    {{- end }}<|im_end|>
  # https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
  function: |
    <|im_start|>system
@@ -29,8 +36,7 @@ template:
    For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
    <tool_call>
    {'arguments': <args-dict>, 'name': <function-name>}
-    </tool_call>
-    <|im_end|>
+    </tool_call><|im_end|>
    {{.Input -}}
    <|im_start|>assistant
    <tool_call>
--- a/aio/entrypoint.sh
+++ b/aio/entrypoint.sh
@@ -129,7 +129,7 @@ detect_gpu
 detect_gpu_size

 PROFILE="${PROFILE:-$GPU_SIZE}" # default to cpu
-export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"
+export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"

 check_vars

--- a/aio/gpu-8g/rerank.yaml
+++ b/aio/gpu-8g/rerank.yaml
@@ -0,0 +1,27 @@
+name: jina-reranker-v1-base-en
+backend: rerankers
+parameters:
+  model: cross-encoder
+
+usage: |
+    You can test this model with curl like this:
+
+    curl http://localhost:8080/v1/rerank \
+      -H "Content-Type: application/json" \
+      -d '{
+      "model": "jina-reranker-v1-base-en",
+      "query": "Organic skincare products for sensitive skin",
+      "documents": [
+        "Eco-friendly kitchenware for modern homes",
+        "Biodegradable cleaning supplies for eco-conscious consumers",
+        "Organic cotton baby clothes for sensitive skin",
+        "Natural organic skincare range for sensitive skin",
+        "Tech gadgets for smart homes: 2024 edition",
+        "Sustainable gardening tools and compost solutions",
+        "Sensitive skin-friendly facial cleansers and toners",
+        "Organic food wraps and storage solutions",
+        "All-natural pet food for dogs with allergies",
+        "Yoga mats made from recycled materials"
+      ],
+      "top_n": 3
+    }'
--- a/aio/gpu-8g/text-to-text.yaml
+++ b/aio/gpu-8g/text-to-text.yaml
@@ -1,20 +1,27 @@
 name: gpt-4
 mmap: true
 parameters:
-  model: huggingface://NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/Hermes-2-Pro-Mistral-7B.Q6_K.gguf
+  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf

 template:
  chat_message: |
    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
-    {{- if .FunctionCall }}<tool_call>{{end}}
-    {{- if eq .RoleName "tool" }}<tool_result>{{end }}
-    {{- if .Content}}
-    {{.Content}}
+    {{- if .FunctionCall }}
+    <tool_call>
+    {{- else if eq .RoleName "tool" }}
+    <tool_response>
    {{- end }}
-    {{- if .FunctionCall}}{{toJson .FunctionCall}}{{end }}
-    {{- if .FunctionCall }}</tool_call>{{end }}
-    {{- if eq .RoleName "tool" }}</tool_result>{{end }}
-    <|im_end|>
+    {{- if .Content}}
+    {{.Content }}
+    {{- end }}
+    {{- if .FunctionCall}}
+    {{toJson .FunctionCall}}
+    {{- end }}
+    {{- if .FunctionCall }}
+    </tool_call>
+    {{- else if eq .RoleName "tool" }}
+    </tool_response>
+    {{- end }}<|im_end|>
  # https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
  function: |
    <|im_start|>system
@@ -29,8 +36,7 @@ template:
    For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
    <tool_call>
    {'arguments': <args-dict>, 'name': <function-name>}
-    </tool_call>
-    <|im_end|>
+    </tool_call><|im_end|>
    {{.Input -}}
    <|im_start|>assistant
    <tool_call>
--- a/aio/intel/rerank.yaml
+++ b/aio/intel/rerank.yaml
@@ -0,0 +1,27 @@
+name: jina-reranker-v1-base-en
+backend: rerankers
+parameters:
+  model: cross-encoder
+
+usage: |
+    You can test this model with curl like this:
+
+    curl http://localhost:8080/v1/rerank \
+      -H "Content-Type: application/json" \
+      -d '{
+      "model": "jina-reranker-v1-base-en",
+      "query": "Organic skincare products for sensitive skin",
+      "documents": [
+        "Eco-friendly kitchenware for modern homes",
+        "Biodegradable cleaning supplies for eco-conscious consumers",
+        "Organic cotton baby clothes for sensitive skin",
+        "Natural organic skincare range for sensitive skin",
+        "Tech gadgets for smart homes: 2024 edition",
+        "Sustainable gardening tools and compost solutions",
+        "Sensitive skin-friendly facial cleansers and toners",
+        "Organic food wraps and storage solutions",
+        "All-natural pet food for dogs with allergies",
+        "Yoga mats made from recycled materials"
+      ],
+      "top_n": 3
+    }'
--- a/aio/intel/text-to-text.yaml
+++ b/aio/intel/text-to-text.yaml
@@ -2,20 +2,27 @@ name: gpt-4
 mmap: false
 f16: false
 parameters:
-  model: huggingface://NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/Hermes-2-Pro-Mistral-7B.Q6_K.gguf
+  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf

 template:
  chat_message: |
    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
-    {{- if .FunctionCall }}<tool_call>{{end}}
-    {{- if eq .RoleName "tool" }}<tool_result>{{end }}
-    {{- if .Content}}
-    {{.Content}}
+    {{- if .FunctionCall }}
+    <tool_call>
+    {{- else if eq .RoleName "tool" }}
+    <tool_response>
    {{- end }}
-    {{- if .FunctionCall}}{{toJson .FunctionCall}}{{end }}
-    {{- if .FunctionCall }}</tool_call>{{end }}
-    {{- if eq .RoleName "tool" }}</tool_result>{{end }}
-    <|im_end|>
+    {{- if .Content}}
+    {{.Content }}
+    {{- end }}
+    {{- if .FunctionCall}}
+    {{toJson .FunctionCall}}
+    {{- end }}
+    {{- if .FunctionCall }}
+    </tool_call>
+    {{- else if eq .RoleName "tool" }}
+    </tool_response>
+    {{- end }}<|im_end|>
  # https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
  function: |
    <|im_start|>system
@@ -30,8 +37,7 @@ template:
    For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
    <tool_call>
    {'arguments': <args-dict>, 'name': <function-name>}
-    </tool_call>
-    <|im_end|>
+    </tool_call><|im_end|>
    {{.Input -}}
    <|im_start|>assistant
    <tool_call>
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -23,6 +23,30 @@ service Backend {
  rpc StoresDelete(StoresDeleteOptions) returns (Result) {}
  rpc StoresGet(StoresGetOptions) returns (StoresGetResult) {}
  rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}
+
+  rpc Rerank(RerankRequest) returns (RerankResult) {} // unary call: one RerankRequest in, one RerankResult out
+}
+
+message RerankRequest { // input of Backend.Rerank
+  string query = 1; // NOTE(review): presumably the text the documents are scored against - confirm with the backend
+  repeated string documents = 2; // candidate texts
+  int32 top_n = 3; // NOTE(review): looks like the number of results to return - confirm
+}
+
+message RerankResult { // output of Backend.Rerank
+  Usage usage = 1; // token accounting, see message Usage
+  repeated DocumentResult results = 2; // see message DocumentResult
+}
+
+message Usage { // token counters attached to a RerankResult
+  int32 total_tokens = 1;
+  int32 prompt_tokens = 2;
+}
+
+message DocumentResult { // one element of RerankResult.results
+  int32 index = 1; // NOTE(review): presumably the position of the document in RerankRequest.documents - confirm
+  string text = 2;
+  float relevance_score = 3;
 }

 message StoresKey {
@@ -107,11 +131,15 @@ message PredictOptions {
  string NegativePrompt = 40;
  int32 NDraft = 41;
  repeated string Images = 42;
+  bool UseTokenizerTemplate = 43; // NOTE(review): name suggests the backend applies the tokenizer's own chat template to Messages - confirm
+  repeated Message Messages = 44; // role/content pairs, see message Message
 }

 // The response message containing the result
 message Reply {
  bytes message = 1;
+  int32 tokens = 2; // the llama.cpp gRPC server fills this from the "tokens_predicted" field of its result JSON
+  int32 prompt_tokens = 3; // the llama.cpp gRPC server fills this from the "tokens_evaluated" field of its result JSON
 }

 message ModelOptions {
@@ -173,6 +201,7 @@ message ModelOptions {
  bool   EnforceEager = 52;
  int32  SwapSpace = 53;
  int32  MaxModelLen = 54;
+  int32  TensorParallelSize = 55;

  string MMProj = 41;

@@ -256,3 +285,8 @@ message StatusResponse {
  State state = 1;
  MemoryUsageData memory = 2;
 }
+
+message Message { // element type of PredictOptions.Messages
+  string role = 1;
+  string content = 2;
+}
--- a/backend/backend_grpc.pb.go
+++ b/backend/backend_grpc.pb.go
@@ -1,457 +0,0 @@
-// Code generated by protoc-gen-go-grpc. DO NOT EDIT.
-// versions:
-// - protoc-gen-go-grpc v1.2.0
-// - protoc             v4.23.4
-// source: backend/backend.proto
-
-package proto
-
-import (
-	context "context"
-	grpc "google.golang.org/grpc"
-	codes "google.golang.org/grpc/codes"
-	status "google.golang.org/grpc/status"
-)
-
-// This is a compile-time assertion to ensure that this generated file
-// is compatible with the grpc package it is being compiled against.
-// Requires gRPC-Go v1.32.0 or later.
-const _ = grpc.SupportPackageIsVersion7
-
-// BackendClient is the client API for Backend service.
-//
-// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream.
-type BackendClient interface {
-	Health(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*Reply, error)
-	Predict(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*Reply, error)
-	LoadModel(ctx context.Context, in *ModelOptions, opts ...grpc.CallOption) (*Result, error)
-	PredictStream(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (Backend_PredictStreamClient, error)
-	Embedding(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*EmbeddingResult, error)
-	GenerateImage(ctx context.Context, in *GenerateImageRequest, opts ...grpc.CallOption) (*Result, error)
-	AudioTranscription(ctx context.Context, in *TranscriptRequest, opts ...grpc.CallOption) (*TranscriptResult, error)
-	TTS(ctx context.Context, in *TTSRequest, opts ...grpc.CallOption) (*Result, error)
-	TokenizeString(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*TokenizationResponse, error)
-	Status(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*StatusResponse, error)
-}
-
-type backendClient struct {
-	cc grpc.ClientConnInterface
-}
-
-func NewBackendClient(cc grpc.ClientConnInterface) BackendClient {
-	return &backendClient{cc}
-}
-
-func (c *backendClient) Health(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*Reply, error) {
-	out := new(Reply)
-	err := c.cc.Invoke(ctx, "/backend.Backend/Health", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) Predict(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*Reply, error) {
-	out := new(Reply)
-	err := c.cc.Invoke(ctx, "/backend.Backend/Predict", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) LoadModel(ctx context.Context, in *ModelOptions, opts ...grpc.CallOption) (*Result, error) {
-	out := new(Result)
-	err := c.cc.Invoke(ctx, "/backend.Backend/LoadModel", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) PredictStream(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (Backend_PredictStreamClient, error) {
-	stream, err := c.cc.NewStream(ctx, &Backend_ServiceDesc.Streams[0], "/backend.Backend/PredictStream", opts...)
-	if err != nil {
-		return nil, err
-	}
-	x := &backendPredictStreamClient{stream}
-	if err := x.ClientStream.SendMsg(in); err != nil {
-		return nil, err
-	}
-	if err := x.ClientStream.CloseSend(); err != nil {
-		return nil, err
-	}
-	return x, nil
-}
-
-type Backend_PredictStreamClient interface {
-	Recv() (*Reply, error)
-	grpc.ClientStream
-}
-
-type backendPredictStreamClient struct {
-	grpc.ClientStream
-}
-
-func (x *backendPredictStreamClient) Recv() (*Reply, error) {
-	m := new(Reply)
-	if err := x.ClientStream.RecvMsg(m); err != nil {
-		return nil, err
-	}
-	return m, nil
-}
-
-func (c *backendClient) Embedding(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*EmbeddingResult, error) {
-	out := new(EmbeddingResult)
-	err := c.cc.Invoke(ctx, "/backend.Backend/Embedding", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) GenerateImage(ctx context.Context, in *GenerateImageRequest, opts ...grpc.CallOption) (*Result, error) {
-	out := new(Result)
-	err := c.cc.Invoke(ctx, "/backend.Backend/GenerateImage", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) AudioTranscription(ctx context.Context, in *TranscriptRequest, opts ...grpc.CallOption) (*TranscriptResult, error) {
-	out := new(TranscriptResult)
-	err := c.cc.Invoke(ctx, "/backend.Backend/AudioTranscription", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) TTS(ctx context.Context, in *TTSRequest, opts ...grpc.CallOption) (*Result, error) {
-	out := new(Result)
-	err := c.cc.Invoke(ctx, "/backend.Backend/TTS", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) TokenizeString(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*TokenizationResponse, error) {
-	out := new(TokenizationResponse)
-	err := c.cc.Invoke(ctx, "/backend.Backend/TokenizeString", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) Status(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*StatusResponse, error) {
-	out := new(StatusResponse)
-	err := c.cc.Invoke(ctx, "/backend.Backend/Status", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-// BackendServer is the server API for Backend service.
-// All implementations must embed UnimplementedBackendServer
-// for forward compatibility
-type BackendServer interface {
-	Health(context.Context, *HealthMessage) (*Reply, error)
-	Predict(context.Context, *PredictOptions) (*Reply, error)
-	LoadModel(context.Context, *ModelOptions) (*Result, error)
-	PredictStream(*PredictOptions, Backend_PredictStreamServer) error
-	Embedding(context.Context, *PredictOptions) (*EmbeddingResult, error)
-	GenerateImage(context.Context, *GenerateImageRequest) (*Result, error)
-	AudioTranscription(context.Context, *TranscriptRequest) (*TranscriptResult, error)
-	TTS(context.Context, *TTSRequest) (*Result, error)
-	TokenizeString(context.Context, *PredictOptions) (*TokenizationResponse, error)
-	Status(context.Context, *HealthMessage) (*StatusResponse, error)
-	mustEmbedUnimplementedBackendServer()
-}
-
-// UnimplementedBackendServer must be embedded to have forward compatible implementations.
-type UnimplementedBackendServer struct {
-}
-
-func (UnimplementedBackendServer) Health(context.Context, *HealthMessage) (*Reply, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method Health not implemented")
-}
-func (UnimplementedBackendServer) Predict(context.Context, *PredictOptions) (*Reply, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method Predict not implemented")
-}
-func (UnimplementedBackendServer) LoadModel(context.Context, *ModelOptions) (*Result, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method LoadModel not implemented")
-}
-func (UnimplementedBackendServer) PredictStream(*PredictOptions, Backend_PredictStreamServer) error {
-	return status.Errorf(codes.Unimplemented, "method PredictStream not implemented")
-}
-func (UnimplementedBackendServer) Embedding(context.Context, *PredictOptions) (*EmbeddingResult, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method Embedding not implemented")
-}
-func (UnimplementedBackendServer) GenerateImage(context.Context, *GenerateImageRequest) (*Result, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method GenerateImage not implemented")
-}
-func (UnimplementedBackendServer) AudioTranscription(context.Context, *TranscriptRequest) (*TranscriptResult, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method AudioTranscription not implemented")
-}
-func (UnimplementedBackendServer) TTS(context.Context, *TTSRequest) (*Result, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method TTS not implemented")
-}
-func (UnimplementedBackendServer) TokenizeString(context.Context, *PredictOptions) (*TokenizationResponse, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method TokenizeString not implemented")
-}
-func (UnimplementedBackendServer) Status(context.Context, *HealthMessage) (*StatusResponse, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method Status not implemented")
-}
-func (UnimplementedBackendServer) mustEmbedUnimplementedBackendServer() {}
-
-// UnsafeBackendServer may be embedded to opt out of forward compatibility for this service.
-// Use of this interface is not recommended, as added methods to BackendServer will
-// result in compilation errors.
-type UnsafeBackendServer interface {
-	mustEmbedUnimplementedBackendServer()
-}
-
-func RegisterBackendServer(s grpc.ServiceRegistrar, srv BackendServer) {
-	s.RegisterService(&Backend_ServiceDesc, srv)
-}
-
-func _Backend_Health_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(HealthMessage)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).Health(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/Health",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).Health(ctx, req.(*HealthMessage))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_Predict_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(PredictOptions)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).Predict(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/Predict",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).Predict(ctx, req.(*PredictOptions))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_LoadModel_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(ModelOptions)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).LoadModel(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/LoadModel",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).LoadModel(ctx, req.(*ModelOptions))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_PredictStream_Handler(srv interface{}, stream grpc.ServerStream) error {
-	m := new(PredictOptions)
-	if err := stream.RecvMsg(m); err != nil {
-		return err
-	}
-	return srv.(BackendServer).PredictStream(m, &backendPredictStreamServer{stream})
-}
-
-type Backend_PredictStreamServer interface {
-	Send(*Reply) error
-	grpc.ServerStream
-}
-
-type backendPredictStreamServer struct {
-	grpc.ServerStream
-}
-
-func (x *backendPredictStreamServer) Send(m *Reply) error {
-	return x.ServerStream.SendMsg(m)
-}
-
-func _Backend_Embedding_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(PredictOptions)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).Embedding(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/Embedding",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).Embedding(ctx, req.(*PredictOptions))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_GenerateImage_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(GenerateImageRequest)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).GenerateImage(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/GenerateImage",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).GenerateImage(ctx, req.(*GenerateImageRequest))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_AudioTranscription_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(TranscriptRequest)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).AudioTranscription(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/AudioTranscription",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).AudioTranscription(ctx, req.(*TranscriptRequest))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_TTS_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(TTSRequest)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).TTS(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/TTS",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).TTS(ctx, req.(*TTSRequest))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_TokenizeString_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(PredictOptions)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).TokenizeString(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/TokenizeString",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).TokenizeString(ctx, req.(*PredictOptions))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_Status_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(HealthMessage)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).Status(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/Status",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).Status(ctx, req.(*HealthMessage))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-// Backend_ServiceDesc is the grpc.ServiceDesc for Backend service.
-// It's only intended for direct use with grpc.RegisterService,
-// and not to be introspected or modified (even as a copy)
-var Backend_ServiceDesc = grpc.ServiceDesc{
-	ServiceName: "backend.Backend",
-	HandlerType: (*BackendServer)(nil),
-	Methods: []grpc.MethodDesc{
-		{
-			MethodName: "Health",
-			Handler:    _Backend_Health_Handler,
-		},
-		{
-			MethodName: "Predict",
-			Handler:    _Backend_Predict_Handler,
-		},
-		{
-			MethodName: "LoadModel",
-			Handler:    _Backend_LoadModel_Handler,
-		},
-		{
-			MethodName: "Embedding",
-			Handler:    _Backend_Embedding_Handler,
-		},
-		{
-			MethodName: "GenerateImage",
-			Handler:    _Backend_GenerateImage_Handler,
-		},
-		{
-			MethodName: "AudioTranscription",
-			Handler:    _Backend_AudioTranscription_Handler,
-		},
-		{
-			MethodName: "TTS",
-			Handler:    _Backend_TTS_Handler,
-		},
-		{
-			MethodName: "TokenizeString",
-			Handler:    _Backend_TokenizeString_Handler,
-		},
-		{
-			MethodName: "Status",
-			Handler:    _Backend_Status_Handler,
-		},
-	},
-	Streams: []grpc.StreamDesc{
-		{
-			StreamName:    "PredictStream",
-			Handler:       _Backend_PredictStream_Handler,
-			ServerStreams: true,
-		},
-	},
-	Metadata: "backend/backend.proto",
-}
--- a/backend/cpp/grpc/Makefile
+++ b/backend/cpp/grpc/Makefile
@@ -5,7 +5,6 @@ SYSTEM ?= $(HOST_SYSTEM)
 TAG_LIB_GRPC?=v1.59.0
 GIT_REPO_LIB_GRPC?=https://github.com/grpc/grpc.git
 GIT_CLONE_DEPTH?=1
-NUM_BUILD_THREADS?=$(shell nproc --ignore=1)

 INSTALLED_PACKAGES=installed_packages
 GRPC_REPO=grpc_repo
@@ -52,7 +51,7 @@ $(GRPC_REPO):

 $(GRPC_BUILD): $(GRPC_REPO)
 	mkdir -p $(GRPC_BUILD)
-	cd $(GRPC_BUILD) && cmake $(CMAKE_ARGS) ../$(GRPC_REPO)/grpc && cmake --build . -- -j ${NUM_BUILD_THREADS} && cmake --build . --target install -- -j ${NUM_BUILD_THREADS}
+	cd $(GRPC_BUILD) && cmake $(CMAKE_ARGS) ../$(GRPC_REPO)/grpc && cmake --build . && cmake --build . --target install

 build: $(INSTALLED_PACKAGES)

--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -2332,6 +2332,10 @@ public:
                std::string completion_text = result.result_json.value("content", "");

                reply.set_message(completion_text);
+                int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0); // second argument is the fallback used when the key is missing
+                reply.set_tokens(tokens_predicted); // Reply.tokens in backend.proto
+                int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0); // same fallback of 0
+                reply.set_prompt_tokens(tokens_evaluated); // Reply.prompt_tokens in backend.proto

                // Send the reply
                writer->Write(reply);
@@ -2357,6 +2361,10 @@ public:
        task_result result = llama.queue_results.recv(task_id);
        if (!result.error && result.stop) {
            completion_text = result.result_json.value("content", "");
+            int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0); // second argument is the fallback used when the key is missing
+            int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0); // same fallback of 0
+            reply->set_prompt_tokens(tokens_evaluated); // Reply.prompt_tokens in backend.proto
+            reply->set_tokens(tokens_predicted); // Reply.tokens in backend.proto
            reply->set_message(completion_text);
        }
        else
--- a/backend/go/transcribe/transcript.go
+++ b/backend/go/transcribe/transcript.go
@@ -11,8 +11,8 @@ import (
 	"github.com/go-skynet/LocalAI/core/schema"
 )

-func runCommand(command []string) (string, error) {
-	cmd := exec.Command(command[0], command[1:]...)
+func ffmpegCommand(args []string) (string, error) { // runs ffmpeg with args; returns its combined stdout+stderr and the exec error
+	cmd := exec.Command("ffmpeg", args...) // The binary name is hard-coded (not caller-supplied) so a security scanner can see that only ffmpeg is ever executed.
 	cmd.Env = os.Environ()
 	out, err := cmd.CombinedOutput()
 	return string(out), err
@@ -21,16 +21,16 @@ func runCommand(command []string) (string, error) {
 // AudioToWav converts audio to wav for transcribe.
 // TODO: use https://github.com/mccoyst/ogg?
 func audioToWav(src, dst string) error {
-    command := []string{"ffmpeg", "-i", src, "-format", "s16le", "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", dst}
-	out, err := runCommand(command)
+	commandArgs := []string{"-i", src, "-format", "s16le", "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", dst}
+	out, err := ffmpegCommand(commandArgs)
 	if err != nil {
 		return fmt.Errorf("error: %w out: %s", err, out)
 	}
 	return nil
 }

-func Transcript(model whisper.Model, audiopath, language string, threads uint) (schema.Result, error) {
-	res := schema.Result{}
+func Transcript(model whisper.Model, audiopath, language string, threads uint) (schema.TranscriptionResult, error) {
+	res := schema.TranscriptionResult{}

 	dir, err := os.MkdirTemp("", "whisper")
 	if err != nil {
--- a/backend/go/transcribe/whisper.go
+++ b/backend/go/transcribe/whisper.go
@@ -21,6 +21,6 @@ func (sd *Whisper) Load(opts *pb.ModelOptions) error {
 	return err
 }

-func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (schema.Result, error) {
+func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (schema.TranscriptionResult, error) {
 	return Transcript(sd.whisper, opts.Dst, opts.Language, uint(opts.Threads))
 }
--- a/backend/python/autogptq/Makefile
+++ b/backend/python/autogptq/Makefile
@@ -1,4 +1,16 @@
 .PHONY: autogptq
-autogptq:
+autogptq: protogen
 	$(MAKE) -C ../common-env/transformers
 
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+# One protoc run writes both stubs. A pattern rule with several targets is a
+# single grouped rule in GNU Make (the stem is just "."), so `make -j` runs
+# protoc once; the .proto prerequisite regenerates stale stubs when it changes.
+backend_pb2_grpc%py backend_pb2%py: ../../backend%proto
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/autogptq/autogptq.py
+++ b/backend/python/autogptq/autogptq.py
@@ -39,7 +39,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                self.model_name = "Qwen-VL-Chat"
                model = AutoModelForCausalLM.from_pretrained(model_path, 
                    trust_remote_code=request.TrustRemoteCode,
-                    use_triton=request.UseTriton,
                    device_map="auto").eval()
            else:
                model = AutoGPTQForCausalLM.from_quantized(model_path,
--- a/backend/python/autogptq/autogptq.yml
+++ b/backend/python/autogptq/autogptq.yml
@@ -41,7 +41,7 @@ dependencies:
      - filelock==3.12.4
      - frozenlist==1.4.0
      - fsspec==2023.6.0
-      - grpcio==1.59.0
+      - grpcio==1.63.0
      - huggingface-hub==0.16.4
      - idna==3.4
      - jinja2==3.1.2
--- a/backend/python/autogptq/backend_pb2.py
+++ b/backend/python/autogptq/backend_pb2.py
--- a/backend/python/autogptq/backend_pb2_grpc.py
+++ b/backend/python/autogptq/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/bark/Makefile
+++ b/backend/python/bark/Makefile
@@ -1,15 +1,37 @@
 .PHONY: ttsbark
-ttsbark:
+ttsbark: protogen
 	$(MAKE) -C ../common-env/transformers

 .PHONY: run
-run:
+run: protogen
 	@echo "Running bark..."
 	bash run.sh
 	@echo "bark run."

 .PHONY: test
-test:
+test: protogen
 	@echo "Testing bark..."
 	bash test.sh
 	@echo "bark tested."
+
+# protogen: gathers both generated gRPC stub files under one phony name.
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+# protogen-clean: delete the generated stubs so the next build recreates them.
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+# A single protoc run writes both stubs, so only backend_pb2.py owns the
+# recipe. Naming both files as targets of one rule declares two independent
+# rules, which lets `make -j` start two protoc runs writing the same files.
+backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
+
+# Produced by the backend_pb2.py recipe. If only this stub went missing, drop
+# backend_pb2.py and rebuild it; the final touch keeps this file the newer one.
+backend_pb2_grpc.py: backend_pb2.py
+	@test -f $@ || $(RM) $<
+	@test -f $@ || $(MAKE) $<
+	@test -f $@ && touch $@
--- a/backend/python/bark/backend_pb2.py
+++ b/backend/python/bark/backend_pb2.py
--- a/backend/python/bark/backend_pb2_grpc.py
+++ b/backend/python/bark/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/common-env/transformers/install.sh
+++ b/backend/python/common-env/transformers/install.sh
@@ -2,6 +2,7 @@
 set -ex

 SKIP_CONDA=${SKIP_CONDA:-0}
+REQUIREMENTS_FILE=$1

 # Check if environment exist
 conda_env_exists(){
@@ -14,7 +15,7 @@ else
    export PATH=$PATH:/opt/conda/bin
    if conda_env_exists "transformers" ; then
        echo "Creating virtual environment..."
-        conda env create --name transformers --file $1
+        conda env create --name transformers --file "$REQUIREMENTS_FILE"
        echo "Virtual environment created."
    else 
        echo "Virtual environment already exists."
@@ -25,14 +26,19 @@ if [ -d "/opt/intel" ]; then
    # Intel GPU: If the directory exists, we assume we are using the intel image
    # (no conda env)
    # https://github.com/intel/intel-extension-for-pytorch/issues/538
-    pip install intel-extension-for-transformers datasets sentencepiece tiktoken neural_speed optimum[openvino]
+    pip install torch==2.1.0.post0 torchvision==0.16.0.post0 torchaudio==2.1.0.post0 intel-extension-for-pytorch==2.1.20+xpu oneccl_bind_pt==2.1.200+xpu intel-extension-for-transformers datasets sentencepiece tiktoken neural_speed optimum[openvino] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+fi
+
+# Unless conda is skipped, activate the environment first so the pip
+# commands below (FlashAttention install, cache purge) run inside it
+if [ "$SKIP_CONDA" -eq 0 ]; then
+    source activate transformers
+fi
+if [[ "$REQUIREMENTS_FILE" =~ -nvidia\.yml$ ]]; then
+    #TODO: FlashAttention is supported on nvidia and ROCm, but ROCm install can't be done this easily
+    pip install flash-attn --no-build-isolation
 fi

 if [ "$PIP_CACHE_PURGE" = true ] ; then
-    if [ $SKIP_CONDA -eq 0 ]; then
-        # Activate conda environment
-        source activate transformers
-    fi
-
    pip cache purge
 fi
--- a/backend/python/common-env/transformers/transformers-nvidia.yml
+++ b/backend/python/common-env/transformers/transformers-nvidia.yml
@@ -47,7 +47,7 @@ dependencies:
      - frozenlist==1.4.0
      - fsspec==2023.6.0
      - funcy==2.0
-      - grpcio==1.59.0
+      - grpcio==1.63.0
      - huggingface-hub
      - idna==3.4
      - jinja2==3.1.2
@@ -116,8 +116,10 @@ dependencies:
      - sudachipy
      - sudachidict_core
      - vocos
-      - vllm==0.3.2
+      - vllm>=0.4.0  # lower bound only; NOTE(review): confirm newer vllm releases still resolve against the xformers==0.0.23.post1 pin below
      - transformers>=4.38.2  # Updated Version
      - transformers_stream_generator==0.0.5
      - xformers==0.0.23.post1  
+      - rerankers[transformers]  # unpinned; TODO(review): consider pinning like most entries in this list
+      - pydantic  # unpinned; TODO(review): consider pinning like most entries in this list
 prefix: /opt/conda/envs/transformers
--- a/backend/python/common-env/transformers/transformers-rocm.yml
+++ b/backend/python/common-env/transformers/transformers-rocm.yml
@@ -48,7 +48,7 @@ dependencies:
      - frozenlist==1.4.0
      - fsspec==2023.6.0
      - funcy==2.0
-      - grpcio==1.59.0
+      - grpcio==1.63.0
      - huggingface-hub
      - idna==3.4
      - jinja2==3.1.2
@@ -104,8 +104,10 @@ dependencies:
      - sudachipy
      - sudachidict_core
      - vocos
-      - vllm==0.3.2
+      - vllm>=0.4.0  # lower bound only; NOTE(review): confirm newer vllm releases still resolve against the xformers==0.0.23.post1 pin below
      - transformers>=4.38.2  # Updated Version
      - transformers_stream_generator==0.0.5
      - xformers==0.0.23.post1
+      - rerankers[transformers]  # unpinned; TODO(review): consider pinning like most entries in this list
+      - pydantic  # unpinned; TODO(review): consider pinning like most entries in this list
 prefix: /opt/conda/envs/transformers
--- a/backend/python/common-env/transformers/transformers.yml
+++ b/backend/python/common-env/transformers/transformers.yml
@@ -47,7 +47,7 @@ dependencies:
      - frozenlist==1.4.0
      - fsspec==2023.6.0
      - funcy==2.0
-      - grpcio==1.59.0
+      - grpcio==1.63.0
      - huggingface-hub
      - humanfriendly==10.0
      - idna==3.4
@@ -60,9 +60,10 @@ dependencies:
      - networkx
      - numpy==1.26.0
      - onnx==1.15.0
-      - openvino==2024.0.0
-      - openvino-telemetry==2023.2.1
-      - optimum[openvino]==1.17.1
+      - openvino==2024.1.0
+      - openvino-telemetry==2024.1.0
+      - optimum[openvino]==1.19.1
+      - optimum-intel==1.16.1  # NOTE(review): pinned separately from optimum[openvino] above; confirm the two pins are compatible
      - packaging==23.2
      - pandas
      - peft==0.5.0
@@ -108,8 +109,10 @@ dependencies:
      - sudachipy
      - sudachidict_core
      - vocos
-      - vllm==0.3.2
+      - vllm>=0.4.0  # lower bound only; NOTE(review): confirm newer vllm releases still resolve against the xformers==0.0.23.post1 pin below
      - transformers>=4.38.2  # Updated Version
      - transformers_stream_generator==0.0.5
-      - xformers==0.0.23.post1  
+      - xformers==0.0.23.post1
+      - rerankers[transformers]  # unpinned; TODO(review): consider pinning like most entries in this list
+      - pydantic  # unpinned; TODO(review): consider pinning like most entries in this list
 prefix: /opt/conda/envs/transformers
--- a/backend/python/coqui/Makefile
+++ b/backend/python/coqui/Makefile
@@ -1,15 +1,28 @@
 .PHONY: coqui
-coqui:
+coqui: protogen
 	$(MAKE) -C ../common-env/transformers

 .PHONY: run
-run:
+run: protogen
 	@echo "Running coqui..."
 	bash run.sh
 	@echo "coqui run."

 .PHONY: test
-test:
+test: protogen
 	@echo "Testing coqui..."
 	bash test.sh
 	@echo "coqui tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+# Generate both gRPC stubs from the shared proto with a single protoc run.
+# A multi-target pattern rule is one grouped rule, so `make -j` runs protoc
+# only once, and the stubs are rebuilt whenever ../../backend.proto changes.
+%_pb2_grpc.py %_pb2.py: ../../%.proto
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. $(<F)
--- a/backend/python/coqui/backend_pb2.py
+++ b/backend/python/coqui/backend_pb2.py
--- a/backend/python/coqui/backend_pb2_grpc.py
+++ b/backend/python/coqui/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/diffusers/Makefile
+++ b/backend/python/diffusers/Makefile
@@ -12,15 +12,29 @@ export SKIP_CONDA=1
 endif

 .PHONY: diffusers
-diffusers:
+diffusers: protogen
 	@echo "Installing $(CONDA_ENV_PATH)..."
 	bash install.sh $(CONDA_ENV_PATH)

 .PHONY: run
-run:
+run: protogen
 	@echo "Running diffusers..."
 	bash run.sh
 	@echo "Diffusers run."

-test:
+.PHONY: test
+test: protogen
 	bash test.sh
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+# Generate both gRPC stubs from the shared proto with a single protoc run.
+# A multi-target pattern rule is one grouped rule, so `make -j` runs protoc
+# only once, and the stubs are rebuilt whenever ../../backend.proto changes.
+%_pb2_grpc.py %_pb2.py: ../../%.proto
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. $(<F)
--- a/backend/python/diffusers/backend_pb2.py
+++ b/backend/python/diffusers/backend_pb2.py
--- a/backend/python/diffusers/backend_pb2_grpc.py
+++ b/backend/python/diffusers/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/diffusers/diffusers-rocm.yml
+++ b/backend/python/diffusers/diffusers-rocm.yml
@@ -34,7 +34,7 @@ dependencies:
      - diffusers==0.24.0
      - filelock==3.12.4
      - fsspec==2023.9.2
-      - grpcio==1.59.0
+      - grpcio==1.63.0
      - huggingface-hub>=0.19.4
      - idna==3.4
      - importlib-metadata==6.8.0
@@ -61,4 +61,5 @@ dependencies:
      - urllib3==2.0.6
      - zipp==3.17.0
      - torch
+      - opencv-python
 prefix: /opt/conda/envs/diffusers
--- a/backend/python/diffusers/diffusers.yml
+++ b/backend/python/diffusers/diffusers.yml
@@ -32,7 +32,7 @@ dependencies:
      - diffusers==0.24.0
      - filelock==3.12.4
      - fsspec==2023.9.2
-      - grpcio==1.59.0
+      - grpcio==1.63.0
      - huggingface-hub>=0.19.4
      - idna==3.4
      - importlib-metadata==6.8.0
@@ -71,4 +71,5 @@ dependencies:
      - typing-extensions==4.8.0
      - urllib3==2.0.6
      - zipp==3.17.0
+      - opencv-python
 prefix: /opt/conda/envs/diffusers
--- a/backend/python/diffusers/install.sh
+++ b/backend/python/diffusers/install.sh
@@ -31,8 +31,8 @@ if [ -d "/opt/intel" ]; then
                --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
    
    pip install google-api-python-client \
-                grpcio \
-                grpcio-tools \
+                grpcio==1.63.0 \
+                grpcio-tools==1.63.0 \
                diffusers==0.24.0 \
                transformers>=4.25.1 \
                accelerate \
--- a/backend/python/exllama/Makefile
+++ b/backend/python/exllama/Makefile
@@ -1,11 +1,21 @@
 export CONDA_ENV_PATH = "exllama.yml"

 .PHONY: exllama
-exllama:
+exllama: protogen
 	bash install.sh ${CONDA_ENV_PATH}

 .PHONY: run
-run:
+run: protogen
 	@echo "Running exllama..."
 	bash run.sh
 	@echo "exllama run."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/exllama/backend_pb2.py
+++ b/backend/python/exllama/backend_pb2.py
--- a/backend/python/exllama/backend_pb2_grpc.py
+++ b/backend/python/exllama/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/exllama/exllama.yml
+++ b/backend/python/exllama/exllama.yml
@@ -27,7 +27,7 @@ dependencies:
  - pip:
      - filelock==3.12.4
      - fsspec==2023.9.2
-      - grpcio==1.59.0
+      - grpcio==1.63.0
      - jinja2==3.1.2
      - markupsafe==2.1.3
      - mpmath==1.3.0
--- a/backend/python/exllama2/Makefile
+++ b/backend/python/exllama2/Makefile
@@ -1,10 +1,20 @@
 .PHONY: exllama2
-exllama2:
+exllama2: protogen
 	$(MAKE) -C ../common-env/transformers
 	bash install.sh

 .PHONY: run
-run:
+run: protogen
 	@echo "Running exllama2..."
 	bash run.sh
 	@echo "exllama2 run."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/exllama2/backend_pb2.py
+++ b/backend/python/exllama2/backend_pb2.py
--- a/backend/python/exllama2/backend_pb2_grpc.py
+++ b/backend/python/exllama2/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/exllama2/exllama2.yml
+++ b/backend/python/exllama2/exllama2.yml
@@ -27,7 +27,7 @@ dependencies:
  - pip:
      - filelock==3.12.4
      - fsspec==2023.9.2
-      - grpcio==1.59.0
+      - grpcio==1.63.0
      - markupsafe==2.1.3
      - mpmath==1.3.0
      - networkx==3.1
--- a/backend/python/mamba/Makefile
+++ b/backend/python/mamba/Makefile
@@ -1,16 +1,26 @@
 .PHONY: mamba
-mamba:
+mamba: protogen
 	$(MAKE) -C ../common-env/transformers
 	bash install.sh

 .PHONY: run
-run:
+run: protogen
 	@echo "Running mamba..."
 	bash run.sh
 	@echo "mamba run."

 .PHONY: test
-test:
+test: protogen
 	@echo "Testing mamba..."
 	bash test.sh
-	@echo "mamba tested."
+	@echo "mamba tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/mamba/backend_pb2.py
+++ b/backend/python/mamba/backend_pb2.py
--- a/backend/python/mamba/backend_pb2_grpc.py
+++ b/backend/python/mamba/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/parler-tts/Makefile
+++ b/backend/python/parler-tts/Makefile
@@ -0,0 +1,46 @@
+# Conda environment file handed to install.sh; cublas builds switch to the NVIDIA variant.
+export CONDA_ENV_PATH = "parler.yml"
+SKIP_CONDA?=0
+ifeq ($(BUILD_TYPE), cublas)
+export CONDA_ENV_PATH = "parler-nvidia.yml"
+endif
+
+# Intel GPU are supposed to have dependencies installed in the main python
+# environment, so we skip conda installation for SYCL builds.
+# https://github.com/intel/intel-extension-for-pytorch/issues/538
+ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+export SKIP_CONDA=1
+endif
+
+# Install the backend's environment through install.sh (first target, hence the default goal).
+.PHONY: parler-tts
+parler-tts: protogen
+	@echo "Installing $(CONDA_ENV_PATH)..."
+	bash install.sh $(CONDA_ENV_PATH)
+
+# Start the gRPC server through run.sh.
+.PHONY: run
+run: protogen
+	@echo "Running parler-tts..."
+	bash run.sh
+	@echo "parler-tts run."
+
+# Run the backend's unit tests through test.sh.
+.PHONY: test
+test: protogen
+	@echo "Testing parler-tts..."
+	bash test.sh
+	@echo "parler-tts tested."
+
+# Generate the Python gRPC stubs when they are missing.
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+# Remove the generated stubs.
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+# One protoc run over backend.proto (resolved via -I../..) writes both stub files.
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/parler-tts/install.sh
+++ b/backend/python/parler-tts/install.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# Creates the "parler" conda environment from the environment file given as $1
+# (unless SKIP_CONDA=1) and patches protobuf's builder.py inside it.
+set -ex
+
+SKIP_CONDA=${SKIP_CONDA:-0}
+
+# Succeeds when a conda environment with the given name already exists.
+conda_env_exists(){
+    conda list --name "${@}" >/dev/null 2>/dev/null
+}
+
+if [ "$SKIP_CONDA" -eq 1 ]; then
+    echo "Skipping conda environment installation"
+else
+    export PATH=$PATH:/opt/conda/bin
+    if ! conda_env_exists "parler" ; then
+        echo "Creating virtual environment..."
+        conda env create --name parler --file "${1:?usage: install.sh <conda-env-file>}"
+        echo "Virtual environment created."
+    else
+        echo "Virtual environment already exists."
+    fi
+fi
+
+if [ "$SKIP_CONDA" -ne 1 ]; then
+    # Activate conda environment
+    source activate parler
+    # https://github.com/descriptinc/audiotools/issues/101
+    # incompatible protobuf versions.
+    # --fail makes curl exit non-zero on an HTTP error instead of saving the
+    # error page over builder.py; with `set -e` that aborts the install.
+    curl --fail -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o "$CONDA_PREFIX/lib/python3.11/site-packages/google/protobuf/internal/builder.py"
+fi
+
+if [ "$PIP_CACHE_PURGE" = true ] ; then
+    if [ "$SKIP_CONDA" -ne 1 ]; then
+        # Activate conda environment
+        source activate parler
+    fi
+
+    pip cache purge
+fi
--- a/backend/python/parler-tts/parler-nvidia.yml
+++ b/backend/python/parler-tts/parler-nvidia.yml
@@ -0,0 +1,48 @@
+name: parler
+channels:
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - bzip2=1.0.8=h7b6447c_0
+  - ca-certificates=2023.08.22=h06a4308_0
+  - ld_impl_linux-64=2.38=h1181459_1
+  - libffi=3.4.4=h6a678d5_0
+  - libgcc-ng=11.2.0=h1234567_1
+  - libgomp=11.2.0=h1234567_1
+  - libstdcxx-ng=11.2.0=h1234567_1
+  - libuuid=1.41.5=h5eee18b_0
+  - ncurses=6.4=h6a678d5_0
+  - openssl=3.0.11=h7f8727e_2
+  - pip=23.2.1=py311h06a4308_0
+  - python=3.11.5=h955ad1f_0
+  - readline=8.2=h5eee18b_0
+  - setuptools=68.0.0=py311h06a4308_0
+  - sqlite=3.41.2=h5eee18b_0
+  - tk=8.6.12=h1ccaba5_0
+  - tzdata=2023c=h04d1e81_0
+  - wheel=0.41.2=py311h06a4308_0
+  - xz=5.4.2=h5eee18b_0
+  - zlib=1.2.13=h5eee18b_0
+  - pip:
+      - accelerate>=0.11.0
+      - grpcio==1.63.0
+      - numpy==1.26.0
+      - nvidia-cublas-cu12==12.1.3.1
+      - nvidia-cuda-cupti-cu12==12.1.105
+      - nvidia-cuda-nvrtc-cu12==12.1.105
+      - nvidia-cuda-runtime-cu12==12.1.105
+      - nvidia-cudnn-cu12==8.9.2.26
+      - nvidia-cufft-cu12==11.0.2.54
+      - nvidia-curand-cu12==10.3.2.106
+      - nvidia-cusolver-cu12==11.4.5.107
+      - nvidia-cusparse-cu12==12.1.0.106
+      - nvidia-nccl-cu12==2.18.1
+      - nvidia-nvjitlink-cu12==12.2.140
+      - nvidia-nvtx-cu12==12.1.105
+      - torch==2.1.0
+      - transformers>=4.34.0
+      - descript-audio-codec
+      - sentencepiece
+      - git+https://github.com/huggingface/parler-tts.git@10016fb0300c0dc31a0fb70e26f3affee7b62f16
+prefix: /opt/conda/envs/parler
--- a/backend/python/parler-tts/parler.yml
+++ b/backend/python/parler-tts/parler.yml
@@ -0,0 +1,36 @@
+name: parler
+channels:
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - bzip2=1.0.8=h7b6447c_0
+  - ca-certificates=2023.08.22=h06a4308_0
+  - ld_impl_linux-64=2.38=h1181459_1
+  - libffi=3.4.4=h6a678d5_0
+  - libgcc-ng=11.2.0=h1234567_1
+  - libgomp=11.2.0=h1234567_1
+  - libstdcxx-ng=11.2.0=h1234567_1
+  - libuuid=1.41.5=h5eee18b_0
+  - ncurses=6.4=h6a678d5_0
+  - openssl=3.0.11=h7f8727e_2
+  - pip=23.2.1=py311h06a4308_0
+  - python=3.11.5=h955ad1f_0
+  - readline=8.2=h5eee18b_0
+  - setuptools=68.0.0=py311h06a4308_0
+  - sqlite=3.41.2=h5eee18b_0
+  - tk=8.6.12=h1ccaba5_0
+  - tzdata=2023c=h04d1e81_0
+  - wheel=0.41.2=py311h06a4308_0
+  - xz=5.4.2=h5eee18b_0
+  - zlib=1.2.13=h5eee18b_0
+  - pip:
+      - accelerate>=0.11.0
+      - numpy==1.26.0
+      - grpcio==1.63.0
+      - torch==2.1.0
+      - transformers>=4.34.0
+      - descript-audio-codec
+      - sentencepiece
+      - git+https://github.com/huggingface/parler-tts.git@10016fb0300c0dc31a0fb70e26f3affee7b62f16
+prefix: /opt/conda/envs/parler
--- a/backend/python/parler-tts/parler_tts_server.py
+++ b/backend/python/parler-tts/parler_tts_server.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+"""
+Extra gRPC server for Parler-TTS (ParlerTTSForConditionalGeneration) models.
+"""
+from concurrent import futures
+
+import argparse
+import signal
+import sys
+import os
+
+import time
+import backend_pb2
+import backend_pb2_grpc
+
+import grpc
+
+from scipy.io.wavfile import write as write_wav
+
+from parler_tts import ParlerTTSForConditionalGeneration
+from transformers import AutoTokenizer
+import soundfile as sf  
+import torch
+
+_ONE_DAY_IN_SECONDS = 60 * 60 * 24
+
+# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
+MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
+
+# Implement the BackendServicer class with the service methods
+class BackendServicer(backend_pb2_grpc.BackendServicer):
+    """
+    gRPC servicer for the Parler-TTS backend.
+
+    Overrides the Health, LoadModel and TTS methods of the generated base servicer.
+    """
+    def Health(self, request, context):
+        """
+        Report that the backend process is up.
+
+        Args:
+            request: The health-check message; its content is not read.
+            context: The grpc.ServicerContext of the call; not used here.
+
+        Returns:
+            A Reply whose message is the bytes b"OK".
+        """
+        return backend_pb2.Reply(message="OK".encode('utf-8'))
+
+    def LoadModel(self, request, context):
+        """
+        Load the model and tokenizer named by request.Model.
+
+        Args:
+            request: A message whose Model field is handed to from_pretrained.
+            context: The grpc.ServicerContext of the call; not used here.
+
+        Returns:
+            A Result: success=True after loading, success=False with the error text otherwise.
+        """
+        target = "cuda:0" if torch.cuda.is_available() else "cpu"
+        checkpoint = request.Model
+        try:
+            self.model = ParlerTTSForConditionalGeneration.from_pretrained(checkpoint).to(target)
+            self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+
+        return backend_pb2.Result(message="Model loaded successfully", success=True)
+
+    def TTS(self, request, context):
+        """Synthesize request.text into the audio file request.dst; request.voice describes the speaker."""
+        description = request.voice
+        if description == "":
+            description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
+        if request.model == "":
+            return backend_pb2.Result(success=False, message="request.model is required")
+        try:
+            target = "cuda:0" if torch.cuda.is_available() else "cpu"
+            # input_ids carry the voice description, prompt_input_ids the text to be spoken.
+            input_ids = self.tokenizer(description, return_tensors="pt").input_ids.to(target)
+            prompt_input_ids = self.tokenizer(request.text, return_tensors="pt").input_ids.to(target)
+            generation = self.model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+            waveform = generation.cpu().numpy().squeeze()
+            print("[parler-tts] TTS generated!", file=sys.stderr)
+            sf.write(request.dst, waveform, self.model.config.sampling_rate)
+            print("[parler-tts] TTS saved to", request.dst, file=sys.stderr)
+            print("[parler-tts] TTS for", file=sys.stderr)
+            print(request, file=sys.stderr)
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+        return backend_pb2.Result(success=True)
+
+
+def serve(address):
+    """Start the gRPC server on `address` and keep the process alive until a SIGINT/SIGTERM handler shuts it down."""
+    pool = futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)
+    server = grpc.server(pool)
+    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
+    server.add_insecure_port(address)
+    server.start()
+    print("[parler-tts] Server started. Listening on: " + address, file=sys.stderr)
+
+    def _shutdown(sig, frame):
+        print("[parler-tts] Received termination signal. Shutting down...")
+        server.stop(0)
+        sys.exit(0)
+
+    for signum in (signal.SIGINT, signal.SIGTERM):
+        signal.signal(signum, _shutdown)
+
+    try:
+        while True:
+            time.sleep(_ONE_DAY_IN_SECONDS)
+    except KeyboardInterrupt:
+        server.stop(0)
+
+if __name__ == "__main__":
+    # CLI entry point: read --addr and hand it to serve().
+    cli = argparse.ArgumentParser(description="Run the gRPC server.")
+    cli.add_argument("--addr", default="localhost:50051",
+                     help="The address to bind the server to.")
+    args = cli.parse_args()
+    print(f"[parler-tts] startup: {args}", file=sys.stderr)
+    serve(args.addr)
--- a/backend/python/parler-tts/run.sh
+++ b/backend/python/parler-tts/run.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+##
+## A bash script wrapper that runs the parler-tts server with conda
+
+echo "Launching gRPC server for parler-tts"
+
+export PATH=$PATH:/opt/conda/bin
+
+# Activate conda environment
+source activate parler
+
+# get the directory where the bash script is located
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+# Quote the script path and "$@" so spaces in the path or in an argument survive word splitting
+python "$DIR/parler_tts_server.py" "$@"
--- a/backend/python/parler-tts/test.sh
+++ b/backend/python/parler-tts/test.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+##
+## A bash script wrapper that runs the parler-tts unit tests with conda
+
+# Make the conda tools reachable, as run.sh does
+export PATH=$PATH:/opt/conda/bin
+
+# Activate conda environment
+source activate parler
+
+# get the directory where the bash script is located
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+# Quoted so a checkout path containing spaces still resolves
+python -m unittest "$DIR/test_parler.py"
--- a/backend/python/parler-tts/test_parler.py
+++ b/backend/python/parler-tts/test_parler.py
@@ -0,0 +1,81 @@
+"""
+A test script to test the gRPC service
+"""
+import unittest
+import subprocess
+import time
+import backend_pb2
+import backend_pb2_grpc
+
+import grpc
+
+
+class TestBackendServicer(unittest.TestCase):
+    """
+    TestBackendServicer is the class that tests the gRPC service
+
+    unittest itself runs setUp() before and tearDown() after every test
+    method, so the tests below never call them directly: a second setUp()
+    would replace self.service and leave the first server process running.
+    """
+    def setUp(self):
+        """
+        This method sets up the gRPC service by starting the server
+        """
+        self.service = subprocess.Popen(["python3", "parler_tts_server.py", "--addr", "localhost:50051"])
+        # Fixed pause, presumably to let the server start listening before the tests connect.
+        time.sleep(10)
+
+    def tearDown(self) -> None:
+        """
+        This method tears down the gRPC service by terminating the server
+        """
+        self.service.terminate()
+        self.service.wait()
+
+    def test_server_startup(self):
+        """
+        This method tests if the server starts up successfully
+        """
+        try:
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.Health(backend_pb2.HealthMessage())
+                self.assertEqual(response.message, b'OK')
+        except Exception as err:
+            print(err)
+            self.fail("Server failed to start")
+
+    def test_load_model(self):
+        """
+        This method tests if the model is loaded successfully
+        """
+        try:
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="parler-tts/parler_tts_mini_v0.1"))
+                self.assertTrue(response.success)
+                self.assertEqual(response.message, "Model loaded successfully")
+        except Exception as err:
+            print(err)
+            self.fail("LoadModel service failed")
+
+    def test_tts(self):
+        """
+        This method tests if a TTS request is answered after the model is loaded
+        """
+        try:
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="parler-tts/parler_tts_mini_v0.1"))
+                self.assertTrue(response.success)
+                # NOTE(review): this request sets neither model nor dst, and the
+                # server's TTS handler returns success=False when model is empty,
+                # so asserting a non-None reply only proves the RPC round-trips.
+                # TODO: pass model/dst and assert tts_response.success.
+                tts_request = backend_pb2.TTSRequest(text="Hey, how are you doing today?")
+                tts_response = stub.TTS(tts_request)
+                self.assertIsNotNone(tts_response)
+        except Exception as err:
+            print(err)
+            self.fail("TTS service failed")
--- a/backend/python/petals/Makefile
+++ b/backend/python/petals/Makefile
@@ -1,17 +1,27 @@
 .PHONY: petals
-petals:
+petals: protogen
 	@echo "Creating virtual environment..."
 	bash install.sh "petals.yml"
 	@echo "Virtual environment created."

 .PHONY: run
-run:
+run: protogen
 	@echo "Running petals..."
 	bash run.sh
 	@echo "petals run."

 .PHONY: test
-test:
+test: protogen
 	@echo "Testing petals..."
 	bash test.sh
 	@echo "petals tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/petals/backend_pb2.py
+++ b/backend/python/petals/backend_pb2.py
--- a/backend/python/petals/backend_pb2_grpc.py
+++ b/backend/python/petals/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/rerankers/Makefile
+++ b/backend/python/rerankers/Makefile
@@ -0,0 +1,27 @@
+.PHONY: rerankers
+rerankers: protogen
+	$(MAKE) -C ../common-env/transformers
+
+
+.PHONY: run
+run: protogen
+	@echo "Running rerankers..."
+	bash run.sh
+	@echo "rerankers run."
+
+# Running the tests from the command line does not work well; they only work from an IDE such as VSCode.
+.PHONY: test
+test: protogen
+	@echo "Testing rerankers..."
+	bash test.sh
+	@echo "rerankers tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/rerankers/README.md
+++ b/backend/python/rerankers/README.md
@@ -0,0 +1,5 @@
+# Creating a separate environment for the reranker project
+
+```
+make rerankers
+```
--- a/backend/python/rerankers/reranker.py
+++ b/backend/python/rerankers/reranker.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+"""
+Extra gRPC server for Rerankers models.
+"""
+from concurrent import futures
+
+import argparse
+import signal
+import sys
+import os
+
+import time
+import backend_pb2
+import backend_pb2_grpc
+
+import grpc
+
+from rerankers import Reranker
+
+_ONE_DAY_IN_SECONDS = 60 * 60 * 24
+
+# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
+MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
+
+# Implement the BackendServicer class with the service methods
+class BackendServicer(backend_pb2_grpc.BackendServicer):
+    """
+    A gRPC servicer for the rerankers backend.
+
+    Implements the Health, LoadModel and Rerank methods of the backend service.
+    """
+    def Health(self, request, context):
+        """
+        Report that the backend process is alive.
+
+        Args:
+            request: A HealthMessage object (its contents are not read).
+            context: A grpc.ServicerContext object that provides information about the RPC.
+
+        Returns:
+            A Reply object whose message is the bytes b"OK".
+        """
+        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
+
+    def LoadModel(self, request, context):
+        """
+        Instantiate a Reranker for the requested model and keep it on the servicer.
+
+        Args:
+            request: A ModelOptions object; Model is the model name, Type (optional) is
+                passed as model_type and PipelineType (optional) is reused as the language.
+            context: A grpc.ServicerContext object that provides information about the RPC.
+
+        Returns:
+            A Result object with success=True on success, or success=False and the error.
+        """
+        model_name = request.Model
+        try:
+            kwargs = {}
+            if request.Type != "":
+                kwargs['model_type'] = request.Type
+            if request.PipelineType != "": # Reuse the PipelineType field for language
+                kwargs['lang'] = request.PipelineType
+            # Record the model name only after the model itself has loaded successfully
+            self.model = Reranker(model_name, **kwargs)
+            self.model_name = model_name
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+
+        return backend_pb2.Result(message="Model loaded successfully", success=True)
+
+    def Rerank(self, request, context):
+        """
+        Rank request.documents against request.query with the loaded model.
+
+        Returns a RerankResult holding the ranked documents (cut to request.top_n
+        when it is greater than zero) and a whitespace-token based usage estimate.
+        """
+        documents = list(request.documents)
+        ranked_results = self.model.rank(query=request.query, docs=documents, doc_ids=list(range(len(documents))))
+        # top_n <= 0 (unset) keeps everything; assumes results are ordered best-first - TODO confirm
+        ranked = ranked_results.results[:request.top_n] if request.top_n > 0 else ranked_results.results
+        results = [
+            backend_pb2.DocumentResult(index=res.doc_id, text=res.text, relevance_score=res.score)
+            for res in ranked
+        ]
+        # TODO: Implement the usage calculation with reranker
+        prompt_tokens = len(request.query.split())
+        total_tokens = sum(len(doc.split()) for doc in documents) + prompt_tokens
+        usage = backend_pb2.Usage(total_tokens=total_tokens, prompt_tokens=prompt_tokens)
+        return backend_pb2.RerankResult(usage=usage, results=results)
+
+def serve(address):
+    """Start the gRPC backend on `address` and block until SIGINT or SIGTERM arrives."""
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
+    server.add_insecure_port(address)
+    server.start()
+    print("Server started. Listening on: " + address, file=sys.stderr)
+
+    def signal_handler(sig, frame):
+        print("Received termination signal. Shutting down...")
+        server.stop(0)
+        sys.exit(0)
+
+    # Route both SIGINT and SIGTERM to the same shutdown handler
+    for signum in (signal.SIGINT, signal.SIGTERM):
+        signal.signal(signum, signal_handler)
+
+    try:
+        while True:
+            time.sleep(_ONE_DAY_IN_SECONDS)
+    except KeyboardInterrupt:
+        server.stop(0)
+
+if __name__ == "__main__":
+    # Command-line entry point: read the bind address from --addr and
+    # serve on it until the process is told to terminate.
+    cli = argparse.ArgumentParser(description="Run the gRPC server.")
+    cli.add_argument("--addr", default="localhost:50051", help="The address to bind the server to.")
+    options = cli.parse_args()
+
+    serve(options.addr)
--- a/backend/python/rerankers/run.sh
+++ b/backend/python/rerankers/run.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+## A bash script wrapper that runs the reranker server with conda
+
+export PATH=$PATH:/opt/conda/bin
+
+# Activate conda environment
+source activate transformers
+
+# get the directory where the bash script is located
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+# Quote the script path and forward the arguments verbatim so whitespace is not word-split
+python "$DIR/reranker.py" "$@"
--- a/backend/python/rerankers/test.sh
+++ b/backend/python/rerankers/test.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+##
+## A bash script wrapper that runs the reranker unit tests with conda
+
+# Activate conda environment
+source activate transformers
+
+# get the directory where the bash script is located
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+python -m unittest "$DIR/test_reranker.py"
--- a/backend/python/rerankers/test_reranker.py
+++ b/backend/python/rerankers/test_reranker.py
@@ -0,0 +1,90 @@
+"""
+A test script to test the gRPC service
+"""
+import unittest
+import subprocess
+import time
+import backend_pb2
+import backend_pb2_grpc
+
+import grpc
+
+
+class TestBackendServicer(unittest.TestCase):
+    """
+    TestBackendServicer is the class that tests the gRPC service
+
+    unittest calls setUp() before and tearDown() after every test method, so the
+    tests must not call them again: doing so would spawn a second server on the
+    same address and leave the first process running after the test.
+    """
+    def setUp(self):
+        """
+        This method sets up the gRPC service by starting the server
+        """
+        self.service = subprocess.Popen(["python3", "reranker.py", "--addr", "localhost:50051"])
+        # Give the server process time to start before the test connects
+        time.sleep(10)
+
+    def tearDown(self) -> None:
+        """
+        This method tears down the gRPC service by terminating the server
+        """
+        # kill() then wait() so the process is reaped before the next test starts
+        self.service.kill()
+        self.service.wait()
+
+    def test_server_startup(self):
+        """
+        This method tests if the server starts up successfully
+        """
+        try:
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.Health(backend_pb2.HealthMessage())
+                self.assertEqual(response.message, b'OK')
+        except Exception as err:
+            print(err)
+            self.fail("Server failed to start")
+
+    def test_load_model(self):
+        """
+        This method tests if the model is loaded successfully
+        """
+        try:
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="cross-encoder"))
+                self.assertTrue(response.success)
+                self.assertEqual(response.message, "Model loaded successfully")
+        except Exception as err:
+            print(err)
+            self.fail("LoadModel service failed")
+
+    def test_rerank(self):
+        """
+        This method tests if the documents are reranked successfully
+        and returned in order of relevance to the query
+        """
+        try:
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                request = backend_pb2.RerankRequest(
+                    query="I love you",
+                    documents=["I hate you", "I really like you"],
+                    top_n=2
+                )
+                # The model has to be loaded before Rerank can be called
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="cross-encoder"))
+                self.assertTrue(response.success)
+
+                rerank_response = stub.Rerank(request)
+                print(rerank_response.results[0])
+                self.assertIsNotNone(rerank_response.results)
+                self.assertEqual(len(rerank_response.results), 2)
+                # The closest match to the query is expected first
+                self.assertEqual(rerank_response.results[0].text, "I really like you")
+                self.assertEqual(rerank_response.results[1].text, "I hate you")
+        except Exception as err:
+            print(err)
+            self.fail("Reranker service failed")
--- a/backend/python/sentencetransformers/Makefile
+++ b/backend/python/sentencetransformers/Makefile
@@ -1,17 +1,27 @@
 .PHONY: sentencetransformers
-sentencetransformers:
+sentencetransformers: protogen
 	$(MAKE) -C ../common-env/transformers


 .PHONY: run
-run:
+run: protogen
 	@echo "Running sentencetransformers..."
 	bash run.sh
 	@echo "sentencetransformers run."

 # It is not working well by using command line. It only6 works with IDE like VSCode.
 .PHONY: test
-test:
+test: protogen
 	@echo "Testing sentencetransformers..."
 	bash test.sh
 	@echo "sentencetransformers tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/sentencetransformers/backend_pb2.py
+++ b/backend/python/sentencetransformers/backend_pb2.py
--- a/backend/python/sentencetransformers/backend_pb2_grpc.py
+++ b/backend/python/sentencetransformers/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/transformers-musicgen/Makefile
+++ b/backend/python/transformers-musicgen/Makefile
@@ -1,16 +1,25 @@
-
 .PHONY: transformers-musicgen
-transformers-musicgen:
+transformers-musicgen: protogen
 	$(MAKE) -C ../common-env/transformers

 .PHONY: run
-run:
+run: protogen
 	@echo "Running transformers..."
 	bash run.sh
 	@echo "transformers run."

 .PHONY: test
-test:
+test: protogen
 	@echo "Testing transformers..."
 	bash test.sh
 	@echo "transformers tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/transformers-musicgen/backend_pb2.py
+++ b/backend/python/transformers-musicgen/backend_pb2.py
--- a/backend/python/transformers-musicgen/backend_pb2_grpc.py
+++ b/backend/python/transformers-musicgen/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/transformers-musicgen/run.sh
+++ b/backend/python/transformers-musicgen/run.sh
@@ -8,7 +8,7 @@ echo "Launching gRPC server for transformers-musicgen"
 export PATH=$PATH:/opt/conda/bin

 # Activate conda environment
-source activate transformers-musicgen
+source activate transformers

 # get the directory where the bash script is located
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
--- a/backend/python/transformers/Makefile
+++ b/backend/python/transformers/Makefile
@@ -1,16 +1,29 @@
 .PHONY: transformers
-transformers:
+transformers: protogen
 	$(MAKE) -C ../common-env/transformers

 .PHONY: run
-run:
+run: protogen
 	@echo "Running transformers..."
 	bash run.sh
 	@echo "transformers run."

 # It is not working well by using command line. It only6 works with IDE like VSCode.
 .PHONY: test
-test:
+test: protogen
 	@echo "Testing transformers..."
 	bash test.sh
 	@echo "transformers tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+# A single protoc run emits both stubs. GNU Make treats a pattern rule with
+# several targets as one grouped rule, so the recipe runs once even under
+# `make -j` (an explicit `a b:` rule would define two independent rules).
+%_pb2_grpc.py %_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. $*.proto
--- a/backend/python/transformers/backend_pb2.py
+++ b/backend/python/transformers/backend_pb2.py
--- a/backend/python/transformers/backend_pb2_grpc.py
+++ b/backend/python/transformers/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/transformers/transformers_server.py
+++ b/backend/python/transformers/transformers_server.py
@@ -89,8 +89,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        quantization = None

        if self.CUDA:
-            if request.Device:
-                device_map=request.Device
+            if request.MainGPU:
+                device_map=request.MainGPU
            else:
                device_map="cuda:0"
            if request.Quantization == "bnb_4bit":
@@ -143,12 +143,37 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                from optimum.intel.openvino import OVModelForCausalLM
                from openvino.runtime import Core

-                if "GPU" in Core().available_devices:
-                    device_map="GPU"
+                if request.MainGPU:
+                    device_map=request.MainGPU
                else:
-                    device_map="CPU"
+                    device_map="AUTO"
+                    devices = Core().available_devices
+                    if "GPU" in " ".join(devices):
+                        device_map="AUTO:GPU"
+
                self.model = OVModelForCausalLM.from_pretrained(model_name, 
-                                                                compile=True, 
+                                                                compile=True,
+                                                                trust_remote_code=request.TrustRemoteCode,
+                                                                ov_config={"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT","GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"}, 
+                                                                device=device_map)
+                self.OV = True
+            elif request.Type == "OVModelForFeatureExtraction":
+                from optimum.intel.openvino import OVModelForFeatureExtraction
+                from openvino.runtime import Core
+
+                if request.MainGPU:
+                    device_map=request.MainGPU
+                else:
+                    device_map="AUTO"
+                    devices = Core().available_devices
+                    if "GPU" in " ".join(devices):
+                        device_map="AUTO:GPU"
+
+                self.model = OVModelForFeatureExtraction.from_pretrained(model_name, 
+                                                                compile=True,
+                                                                trust_remote_code=request.TrustRemoteCode,
+                                                                ov_config={"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT", "GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"}, 
+                                                                export=True,
                                                                device=device_map)
                self.OV = True
            else:
@@ -158,6 +183,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                                                       quantization_config=quantization, 
                                                       device_map=device_map, 
                                                       torch_dtype=compute)
+            if request.ContextSize > 0:
+                self.max_tokens = request.ContextSize
+            else:
+                self.max_tokens = self.model.config.max_position_embeddings
+ 
            self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_safetensors=True)
            self.XPU = False

@@ -212,12 +242,27 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        set_seed(request.Seed)
        if request.TopP == 0:
            request.TopP = 0.9
+        
+        if request.TopK == 0:
+            request.TopK = 40
+
+        prompt = request.Prompt
+        if not request.Prompt and request.UseTokenizerTemplate and request.Messages:    
+            prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
+
+        eos_token_id = self.tokenizer.eos_token_id
+        if request.StopPrompts:
+            eos_token_id = []
+            for word in request.StopPrompts:
+                eos_token_id.append(self.tokenizer.convert_tokens_to_ids(word))
+
+        inputs = self.tokenizer(prompt, return_tensors="pt")

-        max_tokens = 200
        if request.Tokens > 0:
            max_tokens = request.Tokens
+        else:
+            max_tokens = self.max_tokens - inputs["input_ids"].size()[inputs["input_ids"].dim()-1]

-        inputs = self.tokenizer(request.Prompt, return_tensors="pt")
        if self.CUDA:
            inputs = inputs.to("cuda")
        if XPU and self.OV == False:
@@ -235,7 +280,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                        top_k=request.TopK, 
                        do_sample=True,
                        attention_mask=inputs["attention_mask"],
-                        eos_token_id=self.tokenizer.eos_token_id,
+                        eos_token_id=eos_token_id,
                        pad_token_id=self.tokenizer.eos_token_id,
                        streamer=streamer)
            thread=Thread(target=self.model.generate, kwargs=config)
@@ -264,7 +309,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                        top_k=request.TopK, 
                        do_sample=True,
                        attention_mask=inputs["attention_mask"],
-                        eos_token_id=self.tokenizer.eos_token_id,
+                        eos_token_id=eos_token_id,
                        pad_token_id=self.tokenizer.eos_token_id)
            generated_text = self.tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]

@@ -334,4 +379,4 @@ if __name__ == "__main__":
    )
    args = parser.parse_args()

-    asyncio.run(serve(args.addr))
+    asyncio.run(serve(args.addr))
--- a/backend/python/vall-e-x/Makefile
+++ b/backend/python/vall-e-x/Makefile
@@ -3,18 +3,28 @@ export SKIP_CONDA=1
 endif

 .PHONY: ttsvalle
-ttsvalle:
+ttsvalle: protogen
 	$(MAKE) -C ../common-env/transformers
 	bash install.sh

 .PHONY: run
-run:
+run: protogen
 	@echo "Running ttsvalle..."
 	bash run.sh
 	@echo "ttsvalle run."

 .PHONY: test
-test:
+test: protogen
 	@echo "Testing valle..."
 	bash test.sh
 	@echo "valle tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/vall-e-x/backend_pb2.py
+++ b/backend/python/vall-e-x/backend_pb2.py
--- a/Show More
+++ b/Show More