fix: newline in virtual.yaml

Stupid one line fix, but it will fix CI

Signed-off-by: Dave <dave@gray101.com>
feat(swagger): update swagger (#2128 )
2026-05-24 16:51:44 -04:00 · 2024-04-25 10:39:07 -04:00 · 2024-04-25 16:10:08 +02:00 · 2024-04-25 16:06:18 +02:00 · 2024-04-25 16:05:02 +02:00 · 2024-04-25 15:57:06 +02:00
204 changed files with 6746 additions and 11238 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,6 +1,11 @@
 .idea
+.github
+.vscode
 models
 examples/chatbot-ui/models
 examples/rwkv/models
 examples/**/models
-Dockerfile*
+Dockerfile*
+
+# SonarQube
+.scannerwork
--- a/.env
+++ b/.env
@@ -1,33 +1,33 @@
 ## Set number of threads.
 ## Note: prefer the number of physical cores. Overbooking the CPU degrades performance notably.
-# THREADS=14
+# LOCALAI_THREADS=14

 ## Specify a different bind address (defaults to ":8080")
-# ADDRESS=127.0.0.1:8080
+# LOCALAI_ADDRESS=127.0.0.1:8080

 ## Default models context size
-# CONTEXT_SIZE=512
+# LOCALAI_CONTEXT_SIZE=512
 #
 ## Define galleries.
 ## models will to install will be visible in `/models/available`
-# GALLERIES=[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}]
+# LOCALAI_GALLERIES=[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}]

 ## CORS settings
-# CORS=true
-# CORS_ALLOW_ORIGINS=*
+# LOCALAI_CORS=true
+# LOCALAI_CORS_ALLOW_ORIGINS=*

 ## Default path for models
 #
-# MODELS_PATH=/models
+# LOCALAI_MODELS_PATH=/models

 ## Enable debug mode
-# DEBUG=true
+# LOCALAI_LOG_LEVEL=debug

 ## Disables COMPEL (Diffusers)
 # COMPEL=0

 ## Enable/Disable single backend (useful if only one GPU is available)
-# SINGLE_ACTIVE_BACKEND=true
+# LOCALAI_SINGLE_ACTIVE_BACKEND=true

 ## Specify a build type. Available: cublas, openblas, clblas.
 ## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
@@ -46,13 +46,13 @@
 # GO_TAGS=stablediffusion

 ## Path where to store generated images
-# IMAGE_PATH=/tmp
+# LOCALAI_IMAGE_PATH=/tmp/generated/images

 ## Specify a default upload limit in MB (whisper)
-# UPLOAD_LIMIT
+# LOCALAI_UPLOAD_LIMIT=15

 ## List of external GRPC backends (note on the container image this variable is already set to use extra backends available in extra/)
-# EXTERNAL_GRPC_BACKENDS=my-backend:127.0.0.1:9000,my-backend2:/usr/bin/backend.py
+# LOCALAI_EXTERNAL_GRPC_BACKENDS=my-backend:127.0.0.1:9000,my-backend2:/usr/bin/backend.py

 ### Advanced settings ###
 ### Those are not really used by LocalAI, but from components in the stack ###
@@ -72,18 +72,18 @@
 # LLAMACPP_PARALLEL=1

 ### Enable to run parallel requests
-# PARALLEL_REQUESTS=true
+# LOCALAI_PARALLEL_REQUESTS=true

 ### Watchdog settings
 ###
 # Enables watchdog to kill backends that are inactive for too much time
-# WATCHDOG_IDLE=true
-#
-# Enables watchdog to kill backends that are busy for too much time
-# WATCHDOG_BUSY=true
+# LOCALAI_WATCHDOG_IDLE=true
 #
 # Time in duration format (e.g. 1h30m) after which a backend is considered idle
-# WATCHDOG_IDLE_TIMEOUT=5m
+# LOCALAI_WATCHDOG_IDLE_TIMEOUT=5m
+#
+# Enables watchdog to kill backends that are busy for too much time
+# LOCALAI_WATCHDOG_BUSY=true
 #
 # Time in duration format (e.g. 1h30m) after which a backend is considered busy
-# WATCHDOG_BUSY_TIMEOUT=5m
+# LOCALAI_WATCHDOG_BUSY_TIMEOUT=5m
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -0,0 +1,25 @@
+# https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
+version: 2
+updates:
+  - package-ecosystem: "gomod"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "github-actions"
+    # Workflow files stored in the default location of `.github/workflows`. (You don't need to specify `/.github/workflows` for `directory`. You can use `directory: "/"`.)
+    directory: "/"
+    schedule:
+      # Check for updates to GitHub Actions once a week
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    # Location of the pip manifests (e.g. requirements.txt), relative to the repository root.
+    directory: "/"
+    schedule:
+      # Check for updates to pip packages once a week
+      interval: "weekly"
+  - package-ecosystem: "docker"
+    # Location of the Dockerfile(s), relative to the repository root.
+    directory: "/"
+    schedule:
+      # Check for updates to Docker base images once a week
+      interval: "weekly"
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -49,7 +49,7 @@ jobs:
        run: |
          bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }}
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v5
+        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
--- a/.github/workflows/bump_docs.yaml
+++ b/.github/workflows/bump_docs.yaml
@@ -17,7 +17,7 @@ jobs:
        run: |
          bash .github/bump_docs.sh ${{ matrix.repository }}
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v5
+        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
--- a/.github/workflows/dependabot_auto.yml
+++ b/.github/workflows/dependabot_auto.yml
@@ -0,0 +1,43 @@
+name: Dependabot auto-merge
+on:
+- pull_request_target
+
+permissions:
+  contents: write
+  pull-requests: write
+  packages: read
+
+jobs:
+  dependabot:
+    runs-on: ubuntu-latest
+    if: ${{ github.actor == 'dependabot[bot]' }}
+    steps:
+      - name: Dependabot metadata
+        id: metadata
+        uses: dependabot/fetch-metadata@v2.0.0
+        with:
+          github-token: "${{ secrets.GITHUB_TOKEN }}"
+          skip-commit-verification: true
+
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Approve a PR if not already approved
+        run: |
+          gh pr checkout "$PR_URL"
+            if [ "$(gh pr status --json reviewDecision -q .currentBranch.reviewDecision)" != "APPROVED" ];
+          then
+            gh pr review --approve "$PR_URL"
+          else
+            echo "PR already approved.";
+          fi
+        env:
+          PR_URL: ${{github.event.pull_request.html_url}}
+          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
+
+      - name: Enable auto-merge for Dependabot PRs
+        if: ${{ contains(github.event.pull_request.title, 'bump')}}
+        run: gh pr merge --auto --squash "$PR_URL"
+        env:
+          PR_URL: ${{github.event.pull_request.html_url}}
+          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
--- a/.github/workflows/generate_grpc_cache.yaml
+++ b/.github/workflows/generate_grpc_cache.yaml
@@ -0,0 +1,90 @@
+name: 'generate and publish GRPC docker caches'
+
+on:
+- workflow_dispatch
+
+concurrency:
+  group: grpc-cache-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
+jobs:
+  generate_caches:
+    strategy:
+      matrix:
+        include:
+          - grpc-base-image: ubuntu:22.04
+            runs-on: 'ubuntu-latest'
+            platforms: 'linux/amd64'
+    runs-on: ${{matrix.runs-on}}
+    steps:
+      - name: Release space from worker
+        if: matrix.runs-on == 'ubuntu-latest'
+        run: |
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          df -h
+          echo
+          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+          sudo rm -rf /usr/local/lib/android
+          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+          sudo rm -rf /usr/share/dotnet
+          sudo apt-get remove -y '^mono-.*' || true
+          sudo apt-get remove -y '^ghc-.*' || true
+          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+          sudo apt-get remove -y 'php.*' || true
+          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+          sudo apt-get remove -y '^google-.*' || true
+          sudo apt-get remove -y azure-cli || true
+          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+          sudo apt-get remove -y '^gfortran-.*' || true
+          sudo apt-get remove -y microsoft-edge-stable || true
+          sudo apt-get remove -y firefox || true
+          sudo apt-get remove -y powershell || true
+          sudo apt-get remove -y r-base-core || true
+          sudo apt-get autoremove -y
+          sudo apt-get clean
+          echo
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          sudo rm -rfv build || true
+          sudo rm -rf /usr/share/dotnet || true
+          sudo rm -rf /opt/ghc || true
+          sudo rm -rf "/usr/local/share/boost" || true
+          sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
+          df -h
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@master
+        with:
+          platforms: all
+
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@master
+
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Cache GRPC
+        uses: docker/build-push-action@v5
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          # The build-args MUST be an EXACT match between the image cache and other workflow steps that want to use that cache.
+          # This means that even the MAKEFLAGS have to be an EXACT match.
+          # If the build-args are not an EXACT match, it will result in a cache miss, which will require GRPC to be built from scratch.
+          build-args: |
+            GRPC_BASE_IMAGE=${{ matrix.grpc-base-image }}
+            MAKEFLAGS=--jobs=4 --output-sync=target
+            GRPC_VERSION=v1.58.0
+          context: .
+          file: ./Dockerfile
+          cache-to: type=gha,ignore-error=true
+          target: grpc
+          platforms: ${{ matrix.platforms }}
+          push: false
--- a/.github/workflows/image-pr.yml
+++ b/.github/workflows/image-pr.yml
@@ -22,6 +22,7 @@ jobs:
      platforms: ${{ matrix.platforms }}
      runs-on: ${{ matrix.runs-on }}
      base-image: ${{ matrix.base-image }}
+      grpc-base-image: ${{ matrix.grpc-base-image }}
      makeflags: ${{ matrix.makeflags }}
    secrets:
      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
@@ -61,12 +62,14 @@ jobs:
            ffmpeg: 'false'
            image-type: 'extras'
            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
+            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            grpc-base-image: "ubuntu:22.04"
            tag-suffix: 'sycl-f16-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
@@ -85,6 +88,7 @@ jobs:
      platforms: ${{ matrix.platforms }}
      runs-on: ${{ matrix.runs-on }}
      base-image: ${{ matrix.base-image }}
+      grpc-base-image: ${{ matrix.grpc-base-image }}
      makeflags: ${{ matrix.makeflags }}
    secrets:
      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
@@ -102,11 +106,12 @@ jobs:
            image-type: 'core'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
-            makeflags: "--jobs=5 --output-sync=target"
+            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            grpc-base-image: "ubuntu:22.04"
            tag-suffix: 'sycl-f16-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
@@ -122,4 +127,4 @@ jobs:
            image-type: 'core'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
-            makeflags: "--jobs=5 --output-sync=target"
+            makeflags: "--jobs=4 --output-sync=target"
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -26,6 +26,7 @@ jobs:
      platforms: ${{ matrix.platforms }}
      runs-on: ${{ matrix.runs-on }}
      base-image: ${{ matrix.base-image }}
+      grpc-base-image: ${{ matrix.grpc-base-image }}
      aio: ${{ matrix.aio }}
      makeflags: ${{ matrix.makeflags }}
      latest-image: ${{ matrix.latest-image }}
@@ -129,6 +130,7 @@ jobs:
            image-type: 'extras'
            aio: "-aio-gpu-hipblas"
            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
+            grpc-base-image: "ubuntu:22.04"
            latest-image: 'latest-gpu-hipblas'
            latest-image-aio: 'latest-aio-gpu-hipblas'
            runs-on: 'arc-runner-set'
@@ -140,12 +142,14 @@ jobs:
            ffmpeg: 'false'
            image-type: 'extras'
            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
+            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f16-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
@@ -158,6 +162,7 @@ jobs:
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f32-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
@@ -171,6 +176,7 @@ jobs:
            platforms: 'linux/amd64'
            tag-latest: 'false'
            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f16-core'
            ffmpeg: 'false'
            image-type: 'core'
@@ -180,6 +186,7 @@ jobs:
            platforms: 'linux/amd64'
            tag-latest: 'false'
            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f32-core'
            ffmpeg: 'false'
            image-type: 'core'
@@ -189,6 +196,7 @@ jobs:
            platforms: 'linux/amd64'
            tag-latest: 'false'
            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f16-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
@@ -198,6 +206,7 @@ jobs:
            platforms: 'linux/amd64'
            tag-latest: 'false'
            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f32-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
@@ -210,6 +219,7 @@ jobs:
            ffmpeg: 'true'
            image-type: 'core'
            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
+            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'hipblas'
@@ -219,6 +229,7 @@ jobs:
            ffmpeg: 'false'
            image-type: 'core'
            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
+            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
  
@@ -236,6 +247,7 @@ jobs:
      runs-on: ${{ matrix.runs-on }}
      aio: ${{ matrix.aio }}
      base-image: ${{ matrix.base-image }}
+      grpc-base-image: ${{ matrix.grpc-base-image }}
      makeflags: ${{ matrix.makeflags }}
      latest-image: ${{ matrix.latest-image }}
      latest-image-aio: ${{ matrix.latest-image-aio }}
@@ -258,7 +270,7 @@ jobs:
            aio: "-aio-cpu"
            latest-image: 'latest-cpu'
            latest-image-aio: 'latest-aio-cpu'
-            makeflags: "--jobs=5 --output-sync=target"
+            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
@@ -269,7 +281,7 @@ jobs:
            image-type: 'core'
            base-image: "ubuntu:22.04"
            runs-on: 'ubuntu-latest'
-            makeflags: "--jobs=5 --output-sync=target"
+            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "1"
@@ -280,7 +292,7 @@ jobs:
            image-type: 'core'
            base-image: "ubuntu:22.04"
            runs-on: 'ubuntu-latest'
-            makeflags: "--jobs=5 --output-sync=target"
+            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
@@ -291,7 +303,7 @@ jobs:
            image-type: 'core'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
-            makeflags: "--jobs=5 --output-sync=target"
+            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "1"
@@ -302,4 +314,4 @@ jobs:
            image-type: 'core'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
-            makeflags: "--jobs=5 --output-sync=target"
+            makeflags: "--jobs=4 --output-sync=target"
--- a/.github/workflows/image_build.yml
+++ b/.github/workflows/image_build.yml
@@ -6,6 +6,10 @@ on:
    inputs:
      base-image:
        description: 'Base image'
+        required: true
+        type: string
+      grpc-base-image:
+        description: 'GRPC Base image, must be a compatible image with base-image'
        required: false
        default: ''
        type: string
@@ -57,7 +61,7 @@ on:
      makeflags:
        description: 'Make Flags'
        required: false
-        default: '--jobs=3 --output-sync=target'
+        default: '--jobs=4 --output-sync=target'
        type: string
      aio:
        description: 'AIO Image Name'
@@ -201,15 +205,16 @@ jobs:
        uses: docker/build-push-action@v5
        with:
          builder: ${{ steps.buildx.outputs.name }}
+          # The build-args MUST be an EXACT match between the image cache and other workflow steps that want to use that cache.
+          # This means that even the MAKEFLAGS have to be an EXACT match.
+          # If the build-args are not an EXACT match, it will result in a cache miss, which will require GRPC to be built from scratch.
          build-args: |
-            IMAGE_TYPE=${{ inputs.image-type }}
-            BASE_IMAGE=${{ inputs.base-image }}
-            MAKEFLAGS=${{ inputs.makeflags }}
+            GRPC_BASE_IMAGE=${{ inputs.grpc-base-image || inputs.base-image }}
+            MAKEFLAGS=--jobs=4 --output-sync=target
            GRPC_VERSION=v1.58.0
          context: .
          file: ./Dockerfile
          cache-from: type=gha
-          cache-to: type=gha,ignore-error=true
          target: grpc
          platforms: ${{ inputs.platforms }}
          push: false
--- a/.github/workflows/localaibot_automerge.yml
+++ b/.github/workflows/localaibot_automerge.yml
@@ -0,0 +1,35 @@
+name: LocalAI-bot auto-merge
+on:
+- pull_request_target
+
+permissions:
+  contents: write
+  pull-requests: write
+  packages: read
+
+jobs:
+  dependabot:
+    runs-on: ubuntu-latest
+    if: ${{ github.actor == 'localai-bot' }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Approve a PR if not already approved
+        run: |
+          gh pr checkout "$PR_URL"
+            if [ "$(gh pr status --json reviewDecision -q .currentBranch.reviewDecision)" != "APPROVED" ];
+          then
+            gh pr review --approve "$PR_URL"
+          else
+            echo "PR already approved.";
+          fi
+        env:
+          PR_URL: ${{github.event.pull_request.html_url}}
+          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
+
+      - name: Enable auto-merge for LocalAIBot PRs
+        run: gh pr merge --auto --squash "$PR_URL"
+        env:
+          PR_URL: ${{github.event.pull_request.html_url}}
+          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -1,6 +1,8 @@
 name: Build and Release

-on: push
+on: 
+- push
+- pull_request

 env:
  GRPC_VERSION: v1.58.0
@@ -33,14 +35,14 @@ jobs:
        uses: actions/checkout@v4
        with:
          submodules: true
-      - uses: actions/setup-go@v4
+      - uses: actions/setup-go@v5
        with:
          go-version: '1.21.x'
          cache: false
      - name: Dependencies
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
+          sudo apt-get install build-essential ffmpeg protobuf-compiler
      - name: Install CUDA Dependencies
        if: ${{ matrix.build == 'cuda12' || matrix.build == 'cuda11' }}
        run: |
@@ -55,7 +57,7 @@ jobs:
          sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
      - name: Cache grpc
        id: cache-grpc
-        uses: actions/cache@v3
+        uses: actions/cache@v4
        with:
          path: grpc
          key: ${{ runner.os }}-grpc-${{ env.GRPC_VERSION }}
@@ -75,6 +77,9 @@ jobs:
          CMAKE_ARGS: "${{ matrix.defines }}"
          BUILD_ID: "${{ matrix.build }}"
        run: |
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
+          export PATH=$PATH:$GOPATH/bin
          if [ "${{ matrix.build }}" == "cuda12" ] || [ "${{ matrix.build }}" == "cuda11" ]; then
            export BUILD_TYPE=cublas
            export PATH=/usr/local/cuda/bin:$PATH
@@ -82,12 +87,12 @@ jobs:
          else
            STATIC=true make dist
          fi
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
        with:
-          name: ${{ matrix.build }}
+          name: LocalAI-linux-${{ matrix.build }}
          path: release/
      - name: Release
-        uses: softprops/action-gh-release@v1
+        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        with:
          files: |
@@ -100,27 +105,24 @@ jobs:
        uses: actions/checkout@v4
        with:
          submodules: true
-      - uses: actions/setup-go@v4
+      - uses: actions/setup-go@v5
        with:
          go-version: '1.21.x'
          cache: false
      - name: Dependencies
        run: |
-          sudo apt-get install -y --no-install-recommends libopencv-dev
+          sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
      - name: Build stablediffusion
        run: |
+          export PATH=$PATH:$GOPATH/bin
          make backend-assets/grpc/stablediffusion
          mkdir -p release && cp backend-assets/grpc/stablediffusion release
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
        with:
          name: stablediffusion
          path: release/
-      - name: Release
-        uses: softprops/action-gh-release@v1
-        if: startsWith(github.ref, 'refs/tags/')
-        with:
-          files: |
-            release/*

  build-macOS:
    strategy:
@@ -138,13 +140,15 @@ jobs:
        uses: actions/checkout@v4
        with:
          submodules: true
-      - uses: actions/setup-go@v4
+      - uses: actions/setup-go@v5
        with:
          go-version: '1.21.x'
          cache: false
      - name: Dependencies
        run: |
          brew install protobuf grpc
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
      - name: Build
        id: build
        env:
@@ -153,13 +157,61 @@ jobs:
        run: |
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
+          export PATH=$PATH:$GOPATH/bin
          make dist
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
        with:
-          name: ${{ matrix.build }}
+          name: LocalAI-MacOS-${{ matrix.build }}
          path: release/
      - name: Release
-        uses: softprops/action-gh-release@v1
+        uses: softprops/action-gh-release@v2
+        if: startsWith(github.ref, 'refs/tags/')
+        with:
+          files: |
+            release/*
+
+
+  build-macOS-arm64:
+    strategy:
+      matrix:
+        include:
+          - build: 'avx2'
+            defines: ''
+          - build: 'avx'
+            defines: '-DLLAMA_AVX2=OFF'
+          - build: 'avx512'
+            defines: '-DLLAMA_AVX512=ON'
+    runs-on: macos-14
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.21.x'
+          cache: false
+      - name: Dependencies
+        run: |
+          brew install protobuf grpc
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
+      - name: Build
+        id: build
+        env:
+          CMAKE_ARGS: "${{ matrix.defines }}"
+          BUILD_ID: "${{ matrix.build }}"
+        run: |
+          export C_INCLUDE_PATH=/usr/local/include
+          export CPLUS_INCLUDE_PATH=/usr/local/include
+          export PATH=$PATH:$GOPATH/bin
+          make dist
+      - uses: actions/upload-artifact@v4
+        with:
+          name: LocalAI-MacOS-arm64-${{ matrix.build }}
+          path: release/
+      - name: Release
+        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        with:
          files: |
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -14,14 +14,17 @@ jobs:
      GO111MODULE: on
    steps:
      - name: Checkout Source
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
+        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
+        if: ${{ github.actor != 'dependabot[bot]' }}
        uses: securego/gosec@master
        with:
          # we let the report trigger content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
      - name: Upload SARIF file
-        uses: github/codeql-action/upload-sarif@v2
+        if: ${{ github.actor != 'dependabot[bot]' }}
+        uses: github/codeql-action/upload-sarif@v3
        with:
          # Path to SARIF file relative to the root of the repository
-          sarif_file: results.sarif
+          sarif_file: results.sarif
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -32,8 +32,9 @@ jobs:
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools
          
          sudo rm -rfv /usr/bin/conda || true

@@ -61,8 +62,9 @@ jobs:
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools
          
          sudo rm -rfv /usr/bin/conda || true

@@ -72,6 +74,37 @@ jobs:
           make --jobs=5 --output-sync=target -C backend/python/sentencetransformers
           make --jobs=5 --output-sync=target -C backend/python/sentencetransformers test

+
+  tests-rerankers:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
+             sudo apt-get update && \
+             sudo apt-get install -y conda
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools
+          
+          sudo rm -rfv /usr/bin/conda || true
+
+      - name: Test rerankers
+        run: |
+           export PATH=$PATH:/opt/conda/bin
+           make --jobs=5 --output-sync=target -C backend/python/rerankers
+           make --jobs=5 --output-sync=target -C backend/python/rerankers test
+
  tests-diffusers:
    runs-on: ubuntu-latest
    steps:
@@ -90,8 +123,9 @@ jobs:
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools
          
          sudo rm -rfv /usr/bin/conda || true

@@ -101,6 +135,35 @@ jobs:
           make --jobs=5 --output-sync=target -C backend/python/diffusers
           make --jobs=5 --output-sync=target -C backend/python/diffusers test

+  tests-parler-tts:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
+             sudo apt-get update && \
+             sudo apt-get install -y conda
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools
+          
+          sudo rm -rfv /usr/bin/conda || true
+
+      - name: Test parler-tts
+        run: |
+           export PATH=$PATH:/opt/conda/bin
+           make --jobs=5 --output-sync=target -C backend/python/parler-tts
+           make --jobs=5 --output-sync=target -C backend/python/parler-tts test

  tests-transformers-musicgen:
    runs-on: ubuntu-latest
@@ -120,8 +183,9 @@ jobs:
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools
          
          sudo rm -rfv /usr/bin/conda || true

@@ -151,8 +215,9 @@ jobs:
  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
  #            sudo apt-get update && \
  #            sudo apt-get install -y conda
-  #         sudo apt-get install -y ca-certificates cmake curl patch
+  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip
  #         sudo apt-get install -y libopencv-dev
+  #         pip install --user grpcio-tools
          
  #         sudo rm -rfv /usr/bin/conda || true

@@ -222,8 +287,9 @@ jobs:
  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
  #            sudo apt-get update && \
  #            sudo apt-get install -y conda
-  #         sudo apt-get install -y ca-certificates cmake curl patch
+  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip
  #         sudo apt-get install -y libopencv-dev
+  #         pip install --user grpcio-tools
          
  #         sudo rm -rfv /usr/bin/conda || true

@@ -254,8 +320,9 @@ jobs:
  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
  #            sudo apt-get update && \
  #            sudo apt-get install -y conda
-  #         sudo apt-get install -y ca-certificates cmake curl patch
+  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip
  #         sudo apt-get install -y libopencv-dev
+  #         pip install --user grpcio-tools
  #         sudo rm -rfv /usr/bin/conda || true
  #     - name: Test vllm
  #       run: |
@@ -280,8 +347,9 @@ jobs:
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
-          sudo apt-get install -y libopencv-dev    
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools
          sudo rm -rfv /usr/bin/conda || true
      - name: Test vall-e-x
        run: |
@@ -307,7 +375,8 @@ jobs:
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch espeak espeak-ng          
+          sudo apt-get install -y ca-certificates cmake curl patch espeak espeak-ng python3-pip
+          pip install --user grpcio-tools
          sudo rm -rfv /usr/bin/conda || true

      - name: Test coqui
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -60,7 +60,7 @@ jobs:
        with: 
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
-        uses: actions/setup-go@v4
+        uses: actions/setup-go@v5
        with:
          go-version: ${{ matrix.go-version }}
          cache: false
@@ -70,17 +70,27 @@ jobs:
      - name: Dependencies
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
+          sudo apt-get install build-essential curl ffmpeg
          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+             gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y ca-certificates cmake patch python3-pip unzip
          sudo apt-get install -y libopencv-dev
-          
+
+          curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
+          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+          rm protoc.zip
+
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
+
+          # The python3-grpc-tools package in 22.04 is too old
+          pip install --user grpcio-tools
+
          sudo rm -rfv /usr/bin/conda || true
          PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers

@@ -89,10 +99,10 @@ jobs:
          GO_TAGS="tts" make -C sources/go-piper piper.o && \
          sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
          # Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
-          GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
+          PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
      - name: Cache grpc
        id: cache-grpc
-        uses: actions/cache@v3
+        uses: actions/cache@v4
        with:
          path: grpc
          key: ${{ runner.os }}-grpc-${{ env.GRPC_VERSION }}
@@ -108,11 +118,14 @@ jobs:
          cd grpc && cd cmake/build && sudo make --jobs 5 install
      - name: Test
        run: |
-          GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
+          PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3
-        timeout-minutes: 5
+        uses: mxschmitt/action-tmate@v3.18
+        with:
+          detached: true
+          connect-timeout-seconds: 180
+          limit-access-to-actor: true

  tests-aio-container:
    runs-on: ubuntu-latest
@@ -163,8 +176,11 @@ jobs:
            make run-e2e-aio
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3
-        timeout-minutes: 5
+        uses: mxschmitt/action-tmate@v3.18
+        with:
+          detached: true
+          connect-timeout-seconds: 180
+          limit-access-to-actor: true

  tests-apple:
    runs-on: macOS-14
@@ -177,7 +193,7 @@ jobs:
        with: 
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
-        uses: actions/setup-go@v4
+        uses: actions/setup-go@v5
        with:
          go-version: ${{ matrix.go-version }}
          cache: false
@@ -186,7 +202,8 @@ jobs:
        run: go version
      - name: Dependencies
        run: |
-          brew install protobuf grpc make
+          brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc
+          pip install --user grpcio-tools
      - name: Test
        run: |
          export C_INCLUDE_PATH=/usr/local/include
@@ -196,5 +213,8 @@ jobs:
          BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make --jobs 4 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3
-        timeout-minutes: 5
+        uses: mxschmitt/action-tmate@v3.18
+        with:
+          detached: true
+          connect-timeout-seconds: 180
+          limit-access-to-actor: true
--- a/.github/workflows/update_swagger.yaml
+++ b/.github/workflows/update_swagger.yaml
@@ -0,0 +1,31 @@
+name: Update swagger
+on:
+  schedule:
+    - cron: 0 20 * * *
+  workflow_dispatch:
+jobs:
+  swagger:
+    strategy:
+      fail-fast: false
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-go@v5
+        with:
+          go-version: 'stable'
+      - run: |
+          go install github.com/swaggo/swag/cmd/swag@latest
+      - name: Bump swagger 🔧
+        run: |
+          make swagger
+      - name: Create Pull Request
+        uses: peter-evans/create-pull-request@v6
+        with:
+          token: ${{ secrets.UPDATE_BOT_TOKEN }}
+          push-to-fork: ci-forks/LocalAI
+          commit-message: 'feat(swagger): update swagger'
+          title: 'feat(swagger): update swagger'
+          branch: "update/swagger"
+          body:  Update swagger
+          signoff: true
+
--- a/.github/workflows/yaml-check.yml
+++ b/.github/workflows/yaml-check.yml
@@ -0,0 +1,18 @@
+name: 'Yamllint GitHub Actions'
+on:
+  - pull_request
+jobs:
+  yamllint:
+    name: 'Yamllint'
+    runs-on: ubuntu-latest
+    steps:
+      - name: 'Checkout'
+        uses: actions/checkout@master
+      - name: 'Yamllint'
+        uses: karancode/yamllint-github-action@master
+        with:
+          yamllint_file_or_dir: 'gallery'
+          yamllint_strict: false
+          yamllint_comment: true
+        env:
+          GITHUB_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.gitignore
+++ b/.gitignore
@@ -39,3 +39,11 @@ backend-assets/*
 !backend-assets/.keep
 prepare
 /ggml-metal.metal
+
+# Protobuf generated files
+*.pb.go
+*pb2.py
+*pb2_grpc.py
+
+# SonarQube
+.scannerwork
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,4 +1,4 @@
-# Contributing to localAI
+# Contributing to LocalAI

 Thank you for your interest in contributing to LocalAI! We appreciate your time and effort in helping to improve our project. Before you get started, please take a moment to review these guidelines.

@@ -29,8 +29,9 @@ Thank you for your interest in contributing to LocalAI! We appreciate your time

 1. Clone the repository: `git clone https://github.com/go-skynet/LocalAI.git`
 2. Navigate to the project directory: `cd LocalAI`
-3. Install the required dependencies: `make prepare`
-4. Run LocalAI: `make run`
+3. Install the required dependencies (see https://localai.io/basics/build/#build-localai-locally)
+4. Build LocalAI: `make build`
+5. Run LocalAI: `./local-ai`

 ## Contributing

@@ -59,14 +60,29 @@ If you find a bug, have a feature request, or encounter any issues, please check

 `make test` cannot handle all the model now. Please be sure to add a test case for the new features or the part was changed.

+### Running AIO tests
+
+All-In-One images have a set of tests that automatically verify that most of the endpoints work correctly. A typical flow is:
+
+```bash
+# Build the LocalAI docker image
+make DOCKER_IMAGE=local-ai docker
+
+# Build the corresponding AIO image
+BASE_IMAGE=local-ai DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
+
+# Run the AIO e2e tests
+LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio make run-e2e-aio
+```
+
 ## Documentation

- We are welcome the contribution of the documents, please open new PR in the official document repo [localai-website](https://github.com/go-skynet/localai-website)
-
+We welcome contributions to the documentation; please open a new PR or create a new issue. The documentation is available under `docs/`: https://github.com/mudler/LocalAI/tree/master/docs
+ 
 ## Community and Communication

 - You can reach out via the Github issue tracker.
 - Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
 - Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)

---
+---
--- a/65
+++ b/65
@@ -1,8 +1,9 @@
 ARG IMAGE_TYPE=extras
 ARG BASE_IMAGE=ubuntu:22.04
+ARG GRPC_BASE_IMAGE=${BASE_IMAGE}

 # extras or core
-FROM ${BASE_IMAGE} as requirements-core
+FROM ${BASE_IMAGE} AS requirements-core

 USER root

@@ -15,17 +16,30 @@ ARG TARGETVARIANT

 ENV BUILD_TYPE=${BUILD_TYPE}
 ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh"
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"

 ARG GO_TAGS="stablediffusion tinydream tts"

 RUN apt-get update && \
-    apt-get install -y ca-certificates curl patch pip cmake git && apt-get clean
+    apt-get install -y ca-certificates curl python3-pip unzip && apt-get clean

 # Install Go
-RUN curl -L -s https://go.dev/dl/go$GO_VERSION.linux-$TARGETARCH.tar.gz | tar -C /usr/local -xz
+RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
 ENV PATH $PATH:/usr/local/go/bin

+# Install grpc compilers
+ENV PATH $PATH:/root/go/bin
+RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@latest && \
+    go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
+
+# Install protobuf (the version in 22.04 is too old)
+RUN curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
+    unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+    rm protoc.zip
+
+# Install grpcio-tools (the version in 22.04 is too old)
+RUN pip install --user grpcio-tools
+
 COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
 RUN update-ca-certificates

@@ -66,9 +80,10 @@ RUN test -n "$TARGETARCH" \
 ###################################
 ###################################

-FROM requirements-core as requirements-extras
+FROM requirements-core AS requirements-extras

-RUN curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+RUN apt install -y gpg && \
+    curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
    install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
    gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list && \
@@ -90,7 +105,7 @@ RUN if [ ! -e /usr/bin/python ]; then \
 ###################################
 ###################################

-FROM ${BASE_IMAGE} as grpc
+FROM ${GRPC_BASE_IMAGE} AS grpc

 ARG MAKEFLAGS
 ARG GRPC_VERSION=v1.58.0
@@ -100,22 +115,21 @@ ENV MAKEFLAGS=${MAKEFLAGS}
 WORKDIR /build

 RUN apt-get update && \
-    apt-get install -y g++ cmake git && \
+    apt-get install -y build-essential cmake git  && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

 RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc

-RUN cd grpc && \
-    mkdir -p cmake/build && \
-    cd cmake/build && \
-    cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF ../.. && \
+WORKDIR /build/grpc/cmake/build
+
+RUN cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF ../.. && \
    make

 ###################################
 ###################################

-FROM requirements-${IMAGE_TYPE} as builder
+FROM requirements-${IMAGE_TYPE} AS builder

 ARG GO_TAGS="stablediffusion tts"
 ARG GRPC_BACKENDS
@@ -133,6 +147,12 @@ WORKDIR /build
 COPY . .
 COPY .git .
 RUN echo "GO_TAGS: $GO_TAGS"
+
+RUN apt-get update && \
+    apt-get install -y build-essential cmake git  && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
 RUN make prepare

 # If we are building with clblas support, we need the libraries for the builds
@@ -147,9 +167,11 @@ RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build

 COPY --from=grpc /build/grpc ./grpc/

-RUN cd /build/grpc/cmake/build && make install
+WORKDIR /build/grpc/cmake/build
+RUN make install

 # Rebuild with defaults backends
+WORKDIR /build
 RUN make build

 RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
@@ -191,6 +213,11 @@ RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
    apt-get clean \
    ; fi

+RUN apt-get update && \
+    apt-get install -y cmake git  && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
 WORKDIR /build

 # we start fresh & re-copy all assets because `make build` does not clean up nicely after itself
@@ -202,7 +229,7 @@ COPY . .
 COPY --from=builder /build/sources ./sources/
 COPY --from=grpc /build/grpc ./grpc/

-RUN make prepare-sources && cd /build/grpc/cmake/build && make install && rm -rf grpc
+RUN make prepare-sources && cd /build/grpc/cmake/build && make install && rm -rf /build/grpc

 # Copy the binary
 COPY --from=builder /build/local-ai ./
@@ -232,6 +259,9 @@ RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
    make -C backend/python/sentencetransformers \
    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+    make -C backend/python/rerankers \
+    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
    make -C backend/python/transformers \
    ; fi
@@ -250,6 +280,9 @@ RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
    make -C backend/python/transformers-musicgen \
    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+    make -C backend/python/parler-tts \
+    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
    make -C backend/python/coqui \
    ; fi
@@ -259,7 +292,7 @@ RUN mkdir -p /build/models

 # Define the health check command
 HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
-  CMD curl -f $HEALTHCHECK_ENDPOINT || exit 1
+  CMD curl -f ${HEALTHCHECK_ENDPOINT} || exit 1
  
 VOLUME /build/models
 EXPOSE 8080
--- a/228
+++ b/228
@@ -5,7 +5,7 @@ BINARY_NAME=local-ai

 # llama.cpp versions
 GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=1b67731e184e27a465b8c5476061294a4af668ea
+CPPLLAMA_VERSION?=784e11dea1f5ce9638851b2b0dddb107e2a609c8

 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -16,7 +16,7 @@ RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
 RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6

 # whisper.cpp version
-WHISPER_CPP_VERSION?=8f253ef3af1c62c04316ba4afa7145fc4d701a8c
+WHISPER_CPP_VERSION?=858452d58dba3acdc3431c9bced2bb8cfd9bf418

 # bert.cpp version
 BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
@@ -179,20 +179,20 @@ endif
 all: help

 ## BERT embeddings
-sources/go-bert:
-	git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp sources/go-bert
-	cd sources/go-bert && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1
+sources/go-bert.cpp:
+	git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp sources/go-bert.cpp
+	cd sources/go-bert.cpp && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1

-sources/go-bert/libgobert.a: sources/go-bert
-	$(MAKE) -C sources/go-bert libgobert.a
+sources/go-bert.cpp/libgobert.a: sources/go-bert.cpp
+	$(MAKE) -C sources/go-bert.cpp libgobert.a

-## go-llama-ggml
-sources/go-llama-ggml:
-	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama-ggml
-	cd sources/go-llama-ggml && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1
+## go-llama.cpp
+sources/go-llama.cpp:
+	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama.cpp
+	cd sources/go-llama.cpp && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1

-sources/go-llama-ggml/libbinding.a: sources/go-llama-ggml
-	$(MAKE) -C sources/go-llama-ggml BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
+sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
+	$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a

 ## go-piper
 sources/go-piper:
@@ -211,12 +211,12 @@ sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a: sources/gpt4all
 	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ libgpt4all.a

 ## RWKV
-sources/go-rwkv:
-	git clone --recurse-submodules $(RWKV_REPO) sources/go-rwkv
-	cd sources/go-rwkv && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1
+sources/go-rwkv.cpp:
+	git clone --recurse-submodules $(RWKV_REPO) sources/go-rwkv.cpp
+	cd sources/go-rwkv.cpp && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1

-sources/go-rwkv/librwkv.a: sources/go-rwkv
-	cd sources/go-rwkv && cd rwkv.cpp &&	cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF &&	cmake --build . && 	cp librwkv.a ..
+sources/go-rwkv.cpp/librwkv.a: sources/go-rwkv.cpp
+	cd sources/go-rwkv.cpp && cd rwkv.cpp &&	cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF &&	cmake --build . && 	cp librwkv.a ..

 ## stable diffusion
 sources/go-stable-diffusion:
@@ -236,23 +236,24 @@ sources/go-tiny-dream/libtinydream.a: sources/go-tiny-dream

 ## whisper
 sources/whisper.cpp:
-	git clone https://github.com/ggerganov/whisper.cpp.git sources/whisper.cpp
+	git clone https://github.com/ggerganov/whisper.cpp sources/whisper.cpp
 	cd sources/whisper.cpp && git checkout -b build $(WHISPER_CPP_VERSION) && git submodule update --init --recursive --depth 1

 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
 	cd sources/whisper.cpp && make libwhisper.a

-get-sources: sources/go-llama-ggml sources/gpt4all sources/go-piper sources/go-rwkv sources/whisper.cpp sources/go-bert sources/go-stable-diffusion sources/go-tiny-dream
+get-sources: sources/go-llama.cpp sources/gpt4all sources/go-piper sources/go-rwkv.cpp sources/whisper.cpp sources/go-bert.cpp sources/go-stable-diffusion sources/go-tiny-dream

 replace:
-	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv
+	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert.cpp
 	$(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream
 	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
 	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
 	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp

 dropreplace:
 	$(GOCMD) mod edit -dropreplace github.com/donomii/go-rwkv.cpp
@@ -271,12 +272,12 @@ prepare-sources: get-sources replace
 ## GENERIC
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
-	$(MAKE) -C sources/go-llama-ggml clean
+	$(MAKE) -C sources/go-llama.cpp clean
 	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ clean
-	$(MAKE) -C sources/go-rwkv clean
+	$(MAKE) -C sources/go-rwkv.cpp clean
 	$(MAKE) -C sources/whisper.cpp clean
 	$(MAKE) -C sources/go-stable-diffusion clean
-	$(MAKE) -C sources/go-bert clean
+	$(MAKE) -C sources/go-bert.cpp clean
 	$(MAKE) -C sources/go-piper clean
 	$(MAKE) -C sources/go-tiny-dream clean
 	$(MAKE) build
@@ -289,10 +290,12 @@ clean: ## Remove build related file
 	rm -rf ./sources
 	rm -rf $(BINARY_NAME)
 	rm -rf release/
-	rm -rf backend-assets
+	rm -rf backend-assets/*
 	$(MAKE) -C backend/cpp/grpc clean
 	$(MAKE) -C backend/cpp/llama clean
 	$(MAKE) dropreplace
+	$(MAKE) protogen-clean
+	rmdir pkg/grpc/proto || true

 clean-tests:
 	rm -rf test-models
@@ -416,30 +419,152 @@ help: ## Show this help.
 		else if (/^## .*$$/) {printf "  ${CYAN}%s${RESET}\n", substr($$1,4)} \
 		}' $(MAKEFILE_LIST)

+.PHONY: protogen
 protogen: protogen-go protogen-python

+.PHONY: protogen-clean
+protogen-clean: protogen-go-clean protogen-python-clean
+
+.PHONY: protogen-go
 protogen-go:
+	mkdir -p pkg/grpc/proto
 	protoc -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
    backend/backend.proto

-protogen-python:
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/sentencetransformers/ --grpc_python_out=backend/python/sentencetransformers/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/transformers/ --grpc_python_out=backend/python/transformers/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/transformers-musicgen/ --grpc_python_out=backend/python/transformers-musicgen/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/autogptq/ --grpc_python_out=backend/python/autogptq/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/exllama/ --grpc_python_out=backend/python/exllama/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/bark/ --grpc_python_out=backend/python/bark/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/diffusers/ --grpc_python_out=backend/python/diffusers/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/coqui/ --grpc_python_out=backend/python/coqui/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vall-e-x/ --grpc_python_out=backend/python/vall-e-x/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vllm/ --grpc_python_out=backend/python/vllm/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/petals/ --grpc_python_out=backend/python/petals/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/mamba/ --grpc_python_out=backend/python/mamba/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/exllama2/ --grpc_python_out=backend/python/exllama2/ backend/backend.proto
+.PHONY: protogen-go-clean
+protogen-go-clean:
+	$(RM) pkg/grpc/proto/backend.pb.go pkg/grpc/proto/backend_grpc.pb.go
+	$(RM) bin/*
+
+.PHONY: protogen-python
+protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen petals-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen
+
+.PHONY: protogen-python-clean
+protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean petals-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean
+
+.PHONY: autogptq-protogen
+autogptq-protogen:
+	$(MAKE) -C backend/python/autogptq protogen
+
+.PHONY: autogptq-protogen-clean
+autogptq-protogen-clean:
+	$(MAKE) -C backend/python/autogptq protogen-clean
+
+.PHONY: bark-protogen
+bark-protogen:
+	$(MAKE) -C backend/python/bark protogen
+
+.PHONY: bark-protogen-clean
+bark-protogen-clean:
+	$(MAKE) -C backend/python/bark protogen-clean
+
+.PHONY: coqui-protogen
+coqui-protogen:
+	$(MAKE) -C backend/python/coqui protogen
+
+.PHONY: coqui-protogen-clean
+coqui-protogen-clean:
+	$(MAKE) -C backend/python/coqui protogen-clean
+
+.PHONY: diffusers-protogen
+diffusers-protogen:
+	$(MAKE) -C backend/python/diffusers protogen
+
+.PHONY: diffusers-protogen-clean
+diffusers-protogen-clean:
+	$(MAKE) -C backend/python/diffusers protogen-clean
+
+.PHONY: exllama-protogen
+exllama-protogen:
+	$(MAKE) -C backend/python/exllama protogen
+
+.PHONY: exllama-protogen-clean
+exllama-protogen-clean:
+	$(MAKE) -C backend/python/exllama protogen-clean
+
+.PHONY: exllama2-protogen
+exllama2-protogen:
+	$(MAKE) -C backend/python/exllama2 protogen
+
+.PHONY: exllama2-protogen-clean
+exllama2-protogen-clean:
+	$(MAKE) -C backend/python/exllama2 protogen-clean
+
+.PHONY: mamba-protogen
+mamba-protogen:
+	$(MAKE) -C backend/python/mamba protogen
+
+.PHONY: mamba-protogen-clean
+mamba-protogen-clean:
+	$(MAKE) -C backend/python/mamba protogen-clean
+
+.PHONY: petals-protogen
+petals-protogen:
+	$(MAKE) -C backend/python/petals protogen
+
+.PHONY: petals-protogen-clean
+petals-protogen-clean:
+	$(MAKE) -C backend/python/petals protogen-clean
+
+.PHONY: rerankers-protogen
+rerankers-protogen:
+	$(MAKE) -C backend/python/rerankers protogen
+
+.PHONY: rerankers-protogen-clean
+rerankers-protogen-clean:
+	$(MAKE) -C backend/python/rerankers protogen-clean
+
+.PHONY: sentencetransformers-protogen
+sentencetransformers-protogen:
+	$(MAKE) -C backend/python/sentencetransformers protogen
+
+.PHONY: sentencetransformers-protogen-clean
+sentencetransformers-protogen-clean:
+	$(MAKE) -C backend/python/sentencetransformers protogen-clean
+
+.PHONY: transformers-protogen
+transformers-protogen:
+	$(MAKE) -C backend/python/transformers protogen
+
+.PHONY: transformers-protogen-clean
+transformers-protogen-clean:
+	$(MAKE) -C backend/python/transformers protogen-clean
+
+.PHONY: parler-tts-protogen
+parler-tts-protogen:
+	$(MAKE) -C backend/python/parler-tts protogen
+
+.PHONY: parler-tts-protogen-clean
+parler-tts-protogen-clean:
+	$(MAKE) -C backend/python/parler-tts protogen-clean
+
+.PHONY: transformers-musicgen-protogen
+transformers-musicgen-protogen:
+	$(MAKE) -C backend/python/transformers-musicgen protogen
+
+.PHONY: transformers-musicgen-protogen-clean
+transformers-musicgen-protogen-clean:
+	$(MAKE) -C backend/python/transformers-musicgen protogen-clean
+
+.PHONY: vall-e-x-protogen
+vall-e-x-protogen:
+	$(MAKE) -C backend/python/vall-e-x protogen
+
+.PHONY: vall-e-x-protogen-clean
+vall-e-x-protogen-clean:
+	$(MAKE) -C backend/python/vall-e-x protogen-clean
+
+.PHONY: vllm-protogen
+vllm-protogen:
+	$(MAKE) -C backend/python/vllm protogen
+
+.PHONY: vllm-protogen-clean
+vllm-protogen-clean:
+	$(MAKE) -C backend/python/vllm protogen-clean

 ## GRPC
 # Note: it is duplicated in the Dockerfile
-prepare-extra-conda-environments:
+prepare-extra-conda-environments: protogen-python
 	$(MAKE) -C backend/python/autogptq
 	$(MAKE) -C backend/python/bark
 	$(MAKE) -C backend/python/coqui
@@ -447,14 +572,16 @@ prepare-extra-conda-environments:
 	$(MAKE) -C backend/python/vllm
 	$(MAKE) -C backend/python/mamba
 	$(MAKE) -C backend/python/sentencetransformers
+	$(MAKE) -C backend/python/rerankers
 	$(MAKE) -C backend/python/transformers
 	$(MAKE) -C backend/python/transformers-musicgen
+	$(MAKE) -C backend/python/parler-tts
 	$(MAKE) -C backend/python/vall-e-x
 	$(MAKE) -C backend/python/exllama
 	$(MAKE) -C backend/python/petals
 	$(MAKE) -C backend/python/exllama2

-prepare-test-extra:
+prepare-test-extra: protogen-python
 	$(MAKE) -C backend/python/transformers
 	$(MAKE) -C backend/python/diffusers

@@ -478,11 +605,11 @@ backend-assets/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/
 	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
 	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true

-backend-assets/grpc: replace
+backend-assets/grpc: protogen-go replace
 	mkdir -p backend-assets/grpc

-backend-assets/grpc/bert-embeddings: sources/go-bert sources/go-bert/libgobert.a backend-assets/grpc
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert LIBRARY_PATH=$(CURDIR)/sources/go-bert \
+backend-assets/grpc/bert-embeddings: sources/go-bert.cpp sources/go-bert.cpp/libgobert.a backend-assets/grpc
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert.cpp LIBRARY_PATH=$(CURDIR)/sources/go-bert.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/

 backend-assets/grpc/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a backend-assets/gpt4all backend-assets/grpc
@@ -524,17 +651,16 @@ ifeq ($(BUILD_TYPE),metal)
 	cp backend/cpp/llama/llama.cpp/build/bin/default.metallib backend-assets/grpc/
 endif

-backend-assets/grpc/llama-ggml: sources/go-llama-ggml sources/go-llama-ggml/libbinding.a backend-assets/grpc
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama-ggml
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama-ggml LIBRARY_PATH=$(CURDIR)/sources/go-llama-ggml \
+backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/

 backend-assets/grpc/piper: sources/go-piper sources/go-piper/libpiper_binding.a backend-assets/grpc backend-assets/espeak-ng-data
 	CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-piper \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/

-backend-assets/grpc/rwkv: sources/go-rwkv sources/go-rwkv/librwkv.a backend-assets/grpc
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv LIBRARY_PATH=$(CURDIR)/sources/go-rwkv \
+backend-assets/grpc/rwkv: sources/go-rwkv.cpp sources/go-rwkv.cpp/librwkv.a backend-assets/grpc
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv.cpp LIBRARY_PATH=$(CURDIR)/sources/go-rwkv.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv

 backend-assets/grpc/stablediffusion: sources/go-stable-diffusion sources/go-stable-diffusion/libstablediffusion.a backend-assets/grpc
@@ -597,4 +723,4 @@ docker-image-intel-xpu:

 .PHONY: swagger
 swagger:
-	swag init -g core/http/api.go --output swagger
+	swag init -g core/http/app.go --output swagger
--- a/README.md
+++ b/README.md
@@ -44,23 +44,19 @@

 [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)

-**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that’s compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU.
+**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that’s compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs and generate images, audio, and more, locally or on-prem with consumer-grade hardware, supporting multiple model families. It does not require a GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).

 ## 🔥🔥 Hot topics / Roadmap

 [Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

- Landing page: https://github.com/mudler/LocalAI/pull/1922
+- Reranker API: https://github.com/mudler/LocalAI/pull/2121
+- Gallery WebUI: https://github.com/mudler/LocalAI/pull/2104
+- llama3: https://github.com/mudler/LocalAI/discussions/2076
+- Parler-TTS: https://github.com/mudler/LocalAI/pull/2027
 - Openvino support: https://github.com/mudler/LocalAI/pull/1892
 - Vector store: https://github.com/mudler/LocalAI/pull/1795
 - All-in-one container image: https://github.com/mudler/LocalAI/issues/1855
- Parallel function calling: https://github.com/mudler/LocalAI/pull/1726 / Tools API support: https://github.com/mudler/LocalAI/pull/1715
- Upload file API: https://github.com/mudler/LocalAI/pull/1703
- ROCm container images: https://github.com/mudler/LocalAI/pull/1595 / Intel GPU support (sycl, transformers, diffusers): https://github.com/mudler/LocalAI/issues/1653
- Mamba support: https://github.com/mudler/LocalAI/pull/1589
- Start and share models with config file: https://github.com/mudler/LocalAI/pull/1522
- 🐸 Coqui: https://github.com/mudler/LocalAI/pull/1489
- Img2vid https://github.com/mudler/LocalAI/pull/1442

 Hot topics (looking for contributors):
 - Backends v2: https://github.com/mudler/LocalAI/issues/1126
--- a/aio/cpu/rerank.yaml
+++ b/aio/cpu/rerank.yaml
@@ -0,0 +1,27 @@
+name: jina-reranker-v1-base-en
+backend: rerankers
+parameters:
+  model: cross-encoder
+
+usage: |
+    You can test this model with curl like this:
+
+    curl http://localhost:8080/v1/rerank \
+      -H "Content-Type: application/json" \
+      -d '{
+      "model": "jina-reranker-v1-base-en",
+      "query": "Organic skincare products for sensitive skin",
+      "documents": [
+        "Eco-friendly kitchenware for modern homes",
+        "Biodegradable cleaning supplies for eco-conscious consumers",
+        "Organic cotton baby clothes for sensitive skin",
+        "Natural organic skincare range for sensitive skin",
+        "Tech gadgets for smart homes: 2024 edition",
+        "Sustainable gardening tools and compost solutions",
+        "Sensitive skin-friendly facial cleansers and toners",
+        "Organic food wraps and storage solutions",
+        "All-natural pet food for dogs with allergies",
+        "Yoga mats made from recycled materials"
+      ],
+      "top_n": 3
+    }'
--- a/aio/cpu/text-to-text.yaml
+++ b/aio/cpu/text-to-text.yaml
@@ -6,15 +6,22 @@ parameters:
 template:
  chat_message: |
    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
-    {{- if .FunctionCall }}<tool_call>{{end}}
-    {{- if eq .RoleName "tool" }}<tool_result>{{end }}
-    {{- if .Content}}
-    {{.Content}}
+    {{- if .FunctionCall }}
+    <tool_call>
+    {{- else if eq .RoleName "tool" }}
+    <tool_response>
    {{- end }}
-    {{- if .FunctionCall}}{{toJson .FunctionCall}}{{end }}
-    {{- if .FunctionCall }}</tool_call>{{end }}
-    {{- if eq .RoleName "tool" }}</tool_result>{{end }}
-    <|im_end|>
+    {{- if .Content}}
+    {{.Content }}
+    {{- end }}
+    {{- if .FunctionCall}}
+    {{toJson .FunctionCall}}
+    {{- end }}
+    {{- if .FunctionCall }}
+    </tool_call>
+    {{- else if eq .RoleName "tool" }}
+    </tool_response>
+    {{- end }}<|im_end|>
  # https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
  function: |
    <|im_start|>system
@@ -29,8 +36,7 @@ template:
    For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
    <tool_call>
    {'arguments': <args-dict>, 'name': <function-name>}
-    </tool_call>
-    <|im_end|>
+    </tool_call><|im_end|>
    {{.Input -}}
    <|im_start|>assistant
    <tool_call>
--- a/aio/entrypoint.sh
+++ b/aio/entrypoint.sh
@@ -129,7 +129,7 @@ detect_gpu
 detect_gpu_size

 PROFILE="${PROFILE:-$GPU_SIZE}" # default to cpu
-export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"
+export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"

 check_vars

--- a/aio/gpu-8g/rerank.yaml
+++ b/aio/gpu-8g/rerank.yaml
@@ -0,0 +1,27 @@
+name: jina-reranker-v1-base-en
+backend: rerankers
+parameters:
+  model: cross-encoder
+
+usage: |
+    You can test this model with curl like this:
+
+    curl http://localhost:8080/v1/rerank \
+      -H "Content-Type: application/json" \
+      -d '{
+      "model": "jina-reranker-v1-base-en",
+      "query": "Organic skincare products for sensitive skin",
+      "documents": [
+        "Eco-friendly kitchenware for modern homes",
+        "Biodegradable cleaning supplies for eco-conscious consumers",
+        "Organic cotton baby clothes for sensitive skin",
+        "Natural organic skincare range for sensitive skin",
+        "Tech gadgets for smart homes: 2024 edition",
+        "Sustainable gardening tools and compost solutions",
+        "Sensitive skin-friendly facial cleansers and toners",
+        "Organic food wraps and storage solutions",
+        "All-natural pet food for dogs with allergies",
+        "Yoga mats made from recycled materials"
+      ],
+      "top_n": 3
+    }'
--- a/aio/gpu-8g/text-to-text.yaml
+++ b/aio/gpu-8g/text-to-text.yaml
@@ -6,15 +6,22 @@ parameters:
 template:
  chat_message: |
    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
-    {{- if .FunctionCall }}<tool_call>{{end}}
-    {{- if eq .RoleName "tool" }}<tool_result>{{end }}
-    {{- if .Content}}
-    {{.Content}}
+    {{- if .FunctionCall }}
+    <tool_call>
+    {{- else if eq .RoleName "tool" }}
+    <tool_response>
    {{- end }}
-    {{- if .FunctionCall}}{{toJson .FunctionCall}}{{end }}
-    {{- if .FunctionCall }}</tool_call>{{end }}
-    {{- if eq .RoleName "tool" }}</tool_result>{{end }}
-    <|im_end|>
+    {{- if .Content}}
+    {{.Content }}
+    {{- end }}
+    {{- if .FunctionCall}}
+    {{toJson .FunctionCall}}
+    {{- end }}
+    {{- if .FunctionCall }}
+    </tool_call>
+    {{- else if eq .RoleName "tool" }}
+    </tool_response>
+    {{- end }}<|im_end|>
  # https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
  function: |
    <|im_start|>system
@@ -29,8 +36,7 @@ template:
    For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
    <tool_call>
    {'arguments': <args-dict>, 'name': <function-name>}
-    </tool_call>
-    <|im_end|>
+    </tool_call><|im_end|>
    {{.Input -}}
    <|im_start|>assistant
    <tool_call>
--- a/aio/intel/rerank.yaml
+++ b/aio/intel/rerank.yaml
@@ -0,0 +1,27 @@
+name: jina-reranker-v1-base-en
+backend: rerankers
+parameters:
+  model: cross-encoder
+
+usage: |
+    You can test this model with curl like this:
+
+    curl http://localhost:8080/v1/rerank \
+      -H "Content-Type: application/json" \
+      -d '{
+      "model": "jina-reranker-v1-base-en",
+      "query": "Organic skincare products for sensitive skin",
+      "documents": [
+        "Eco-friendly kitchenware for modern homes",
+        "Biodegradable cleaning supplies for eco-conscious consumers",
+        "Organic cotton baby clothes for sensitive skin",
+        "Natural organic skincare range for sensitive skin",
+        "Tech gadgets for smart homes: 2024 edition",
+        "Sustainable gardening tools and compost solutions",
+        "Sensitive skin-friendly facial cleansers and toners",
+        "Organic food wraps and storage solutions",
+        "All-natural pet food for dogs with allergies",
+        "Yoga mats made from recycled materials"
+      ],
+      "top_n": 3
+    }'
--- a/aio/intel/text-to-text.yaml
+++ b/aio/intel/text-to-text.yaml
@@ -7,15 +7,22 @@ parameters:
 template:
  chat_message: |
    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
-    {{- if .FunctionCall }}<tool_call>{{end}}
-    {{- if eq .RoleName "tool" }}<tool_result>{{end }}
-    {{- if .Content}}
-    {{.Content}}
+    {{- if .FunctionCall }}
+    <tool_call>
+    {{- else if eq .RoleName "tool" }}
+    <tool_response>
    {{- end }}
-    {{- if .FunctionCall}}{{toJson .FunctionCall}}{{end }}
-    {{- if .FunctionCall }}</tool_call>{{end }}
-    {{- if eq .RoleName "tool" }}</tool_result>{{end }}
-    <|im_end|>
+    {{- if .Content}}
+    {{.Content }}
+    {{- end }}
+    {{- if .FunctionCall}}
+    {{toJson .FunctionCall}}
+    {{- end }}
+    {{- if .FunctionCall }}
+    </tool_call>
+    {{- else if eq .RoleName "tool" }}
+    </tool_response>
+    {{- end }}<|im_end|>
  # https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
  function: |
    <|im_start|>system
@@ -30,8 +37,7 @@ template:
    For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
    <tool_call>
    {'arguments': <args-dict>, 'name': <function-name>}
-    </tool_call>
-    <|im_end|>
+    </tool_call><|im_end|>
    {{.Input -}}
    <|im_start|>assistant
    <tool_call>
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -23,6 +23,30 @@ service Backend {
  rpc StoresDelete(StoresDeleteOptions) returns (Result) {}
  rpc StoresGet(StoresGetOptions) returns (StoresGetResult) {}
  rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}
+
+  rpc Rerank(RerankRequest) returns (RerankResult) {}
+}
+
+message RerankRequest {
+  string query = 1;
+  repeated string documents = 2;
+  int32 top_n = 3;
+}
+
+message RerankResult {
+  Usage usage = 1;
+  repeated DocumentResult results = 2;
+}
+
+message Usage {
+  int32 total_tokens = 1;
+  int32 prompt_tokens = 2;
+}
+
+message DocumentResult {
+  int32 index = 1;
+  string text = 2;
+  float relevance_score = 3;
 }

 message StoresKey {
@@ -107,11 +131,15 @@ message PredictOptions {
  string NegativePrompt = 40;
  int32 NDraft = 41;
  repeated string Images = 42;
+  bool UseTokenizerTemplate = 43;
+  repeated Message Messages = 44;
 }

 // The response message containing the result
 message Reply {
  bytes message = 1;
+  int32 tokens = 2;
+  int32 prompt_tokens = 3;
 }

 message ModelOptions {
@@ -173,6 +201,7 @@ message ModelOptions {
  bool   EnforceEager = 52;
  int32  SwapSpace = 53;
  int32  MaxModelLen = 54;
+  int32  TensorParallelSize = 55;

  string MMProj = 41;

@@ -256,3 +285,8 @@ message StatusResponse {
  State state = 1;
  MemoryUsageData memory = 2;
 }
+
+message Message {
+  string role = 1;
+  string content = 2;
+}
--- a/backend/backend_grpc.pb.go
+++ b/backend/backend_grpc.pb.go
@@ -1,457 +0,0 @@
-// Code generated by protoc-gen-go-grpc. DO NOT EDIT.
-// versions:
-// - protoc-gen-go-grpc v1.2.0
-// - protoc             v4.23.4
-// source: backend/backend.proto
-
-package proto
-
-import (
-	context "context"
-	grpc "google.golang.org/grpc"
-	codes "google.golang.org/grpc/codes"
-	status "google.golang.org/grpc/status"
-)
-
-// This is a compile-time assertion to ensure that this generated file
-// is compatible with the grpc package it is being compiled against.
-// Requires gRPC-Go v1.32.0 or later.
-const _ = grpc.SupportPackageIsVersion7
-
-// BackendClient is the client API for Backend service.
-//
-// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream.
-type BackendClient interface {
-	Health(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*Reply, error)
-	Predict(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*Reply, error)
-	LoadModel(ctx context.Context, in *ModelOptions, opts ...grpc.CallOption) (*Result, error)
-	PredictStream(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (Backend_PredictStreamClient, error)
-	Embedding(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*EmbeddingResult, error)
-	GenerateImage(ctx context.Context, in *GenerateImageRequest, opts ...grpc.CallOption) (*Result, error)
-	AudioTranscription(ctx context.Context, in *TranscriptRequest, opts ...grpc.CallOption) (*TranscriptResult, error)
-	TTS(ctx context.Context, in *TTSRequest, opts ...grpc.CallOption) (*Result, error)
-	TokenizeString(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*TokenizationResponse, error)
-	Status(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*StatusResponse, error)
-}
-
-type backendClient struct {
-	cc grpc.ClientConnInterface
-}
-
-func NewBackendClient(cc grpc.ClientConnInterface) BackendClient {
-	return &backendClient{cc}
-}
-
-func (c *backendClient) Health(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*Reply, error) {
-	out := new(Reply)
-	err := c.cc.Invoke(ctx, "/backend.Backend/Health", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) Predict(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*Reply, error) {
-	out := new(Reply)
-	err := c.cc.Invoke(ctx, "/backend.Backend/Predict", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) LoadModel(ctx context.Context, in *ModelOptions, opts ...grpc.CallOption) (*Result, error) {
-	out := new(Result)
-	err := c.cc.Invoke(ctx, "/backend.Backend/LoadModel", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) PredictStream(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (Backend_PredictStreamClient, error) {
-	stream, err := c.cc.NewStream(ctx, &Backend_ServiceDesc.Streams[0], "/backend.Backend/PredictStream", opts...)
-	if err != nil {
-		return nil, err
-	}
-	x := &backendPredictStreamClient{stream}
-	if err := x.ClientStream.SendMsg(in); err != nil {
-		return nil, err
-	}
-	if err := x.ClientStream.CloseSend(); err != nil {
-		return nil, err
-	}
-	return x, nil
-}
-
-type Backend_PredictStreamClient interface {
-	Recv() (*Reply, error)
-	grpc.ClientStream
-}
-
-type backendPredictStreamClient struct {
-	grpc.ClientStream
-}
-
-func (x *backendPredictStreamClient) Recv() (*Reply, error) {
-	m := new(Reply)
-	if err := x.ClientStream.RecvMsg(m); err != nil {
-		return nil, err
-	}
-	return m, nil
-}
-
-func (c *backendClient) Embedding(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*EmbeddingResult, error) {
-	out := new(EmbeddingResult)
-	err := c.cc.Invoke(ctx, "/backend.Backend/Embedding", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) GenerateImage(ctx context.Context, in *GenerateImageRequest, opts ...grpc.CallOption) (*Result, error) {
-	out := new(Result)
-	err := c.cc.Invoke(ctx, "/backend.Backend/GenerateImage", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) AudioTranscription(ctx context.Context, in *TranscriptRequest, opts ...grpc.CallOption) (*TranscriptResult, error) {
-	out := new(TranscriptResult)
-	err := c.cc.Invoke(ctx, "/backend.Backend/AudioTranscription", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) TTS(ctx context.Context, in *TTSRequest, opts ...grpc.CallOption) (*Result, error) {
-	out := new(Result)
-	err := c.cc.Invoke(ctx, "/backend.Backend/TTS", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) TokenizeString(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*TokenizationResponse, error) {
-	out := new(TokenizationResponse)
-	err := c.cc.Invoke(ctx, "/backend.Backend/TokenizeString", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) Status(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*StatusResponse, error) {
-	out := new(StatusResponse)
-	err := c.cc.Invoke(ctx, "/backend.Backend/Status", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-// BackendServer is the server API for Backend service.
-// All implementations must embed UnimplementedBackendServer
-// for forward compatibility
-type BackendServer interface {
-	Health(context.Context, *HealthMessage) (*Reply, error)
-	Predict(context.Context, *PredictOptions) (*Reply, error)
-	LoadModel(context.Context, *ModelOptions) (*Result, error)
-	PredictStream(*PredictOptions, Backend_PredictStreamServer) error
-	Embedding(context.Context, *PredictOptions) (*EmbeddingResult, error)
-	GenerateImage(context.Context, *GenerateImageRequest) (*Result, error)
-	AudioTranscription(context.Context, *TranscriptRequest) (*TranscriptResult, error)
-	TTS(context.Context, *TTSRequest) (*Result, error)
-	TokenizeString(context.Context, *PredictOptions) (*TokenizationResponse, error)
-	Status(context.Context, *HealthMessage) (*StatusResponse, error)
-	mustEmbedUnimplementedBackendServer()
-}
-
-// UnimplementedBackendServer must be embedded to have forward compatible implementations.
-type UnimplementedBackendServer struct {
-}
-
-func (UnimplementedBackendServer) Health(context.Context, *HealthMessage) (*Reply, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method Health not implemented")
-}
-func (UnimplementedBackendServer) Predict(context.Context, *PredictOptions) (*Reply, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method Predict not implemented")
-}
-func (UnimplementedBackendServer) LoadModel(context.Context, *ModelOptions) (*Result, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method LoadModel not implemented")
-}
-func (UnimplementedBackendServer) PredictStream(*PredictOptions, Backend_PredictStreamServer) error {
-	return status.Errorf(codes.Unimplemented, "method PredictStream not implemented")
-}
-func (UnimplementedBackendServer) Embedding(context.Context, *PredictOptions) (*EmbeddingResult, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method Embedding not implemented")
-}
-func (UnimplementedBackendServer) GenerateImage(context.Context, *GenerateImageRequest) (*Result, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method GenerateImage not implemented")
-}
-func (UnimplementedBackendServer) AudioTranscription(context.Context, *TranscriptRequest) (*TranscriptResult, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method AudioTranscription not implemented")
-}
-func (UnimplementedBackendServer) TTS(context.Context, *TTSRequest) (*Result, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method TTS not implemented")
-}
-func (UnimplementedBackendServer) TokenizeString(context.Context, *PredictOptions) (*TokenizationResponse, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method TokenizeString not implemented")
-}
-func (UnimplementedBackendServer) Status(context.Context, *HealthMessage) (*StatusResponse, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method Status not implemented")
-}
-func (UnimplementedBackendServer) mustEmbedUnimplementedBackendServer() {}
-
-// UnsafeBackendServer may be embedded to opt out of forward compatibility for this service.
-// Use of this interface is not recommended, as added methods to BackendServer will
-// result in compilation errors.
-type UnsafeBackendServer interface {
-	mustEmbedUnimplementedBackendServer()
-}
-
-func RegisterBackendServer(s grpc.ServiceRegistrar, srv BackendServer) {
-	s.RegisterService(&Backend_ServiceDesc, srv)
-}
-
-func _Backend_Health_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(HealthMessage)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).Health(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/Health",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).Health(ctx, req.(*HealthMessage))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_Predict_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(PredictOptions)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).Predict(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/Predict",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).Predict(ctx, req.(*PredictOptions))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_LoadModel_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(ModelOptions)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).LoadModel(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/LoadModel",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).LoadModel(ctx, req.(*ModelOptions))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_PredictStream_Handler(srv interface{}, stream grpc.ServerStream) error {
-	m := new(PredictOptions)
-	if err := stream.RecvMsg(m); err != nil {
-		return err
-	}
-	return srv.(BackendServer).PredictStream(m, &backendPredictStreamServer{stream})
-}
-
-type Backend_PredictStreamServer interface {
-	Send(*Reply) error
-	grpc.ServerStream
-}
-
-type backendPredictStreamServer struct {
-	grpc.ServerStream
-}
-
-func (x *backendPredictStreamServer) Send(m *Reply) error {
-	return x.ServerStream.SendMsg(m)
-}
-
-func _Backend_Embedding_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(PredictOptions)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).Embedding(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/Embedding",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).Embedding(ctx, req.(*PredictOptions))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_GenerateImage_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(GenerateImageRequest)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).GenerateImage(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/GenerateImage",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).GenerateImage(ctx, req.(*GenerateImageRequest))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_AudioTranscription_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(TranscriptRequest)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).AudioTranscription(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/AudioTranscription",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).AudioTranscription(ctx, req.(*TranscriptRequest))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_TTS_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(TTSRequest)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).TTS(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/TTS",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).TTS(ctx, req.(*TTSRequest))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_TokenizeString_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(PredictOptions)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).TokenizeString(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/TokenizeString",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).TokenizeString(ctx, req.(*PredictOptions))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_Status_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(HealthMessage)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).Status(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/Status",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).Status(ctx, req.(*HealthMessage))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-// Backend_ServiceDesc is the grpc.ServiceDesc for Backend service.
-// It's only intended for direct use with grpc.RegisterService,
-// and not to be introspected or modified (even as a copy)
-var Backend_ServiceDesc = grpc.ServiceDesc{
-	ServiceName: "backend.Backend",
-	HandlerType: (*BackendServer)(nil),
-	Methods: []grpc.MethodDesc{
-		{
-			MethodName: "Health",
-			Handler:    _Backend_Health_Handler,
-		},
-		{
-			MethodName: "Predict",
-			Handler:    _Backend_Predict_Handler,
-		},
-		{
-			MethodName: "LoadModel",
-			Handler:    _Backend_LoadModel_Handler,
-		},
-		{
-			MethodName: "Embedding",
-			Handler:    _Backend_Embedding_Handler,
-		},
-		{
-			MethodName: "GenerateImage",
-			Handler:    _Backend_GenerateImage_Handler,
-		},
-		{
-			MethodName: "AudioTranscription",
-			Handler:    _Backend_AudioTranscription_Handler,
-		},
-		{
-			MethodName: "TTS",
-			Handler:    _Backend_TTS_Handler,
-		},
-		{
-			MethodName: "TokenizeString",
-			Handler:    _Backend_TokenizeString_Handler,
-		},
-		{
-			MethodName: "Status",
-			Handler:    _Backend_Status_Handler,
-		},
-	},
-	Streams: []grpc.StreamDesc{
-		{
-			StreamName:    "PredictStream",
-			Handler:       _Backend_PredictStream_Handler,
-			ServerStreams: true,
-		},
-	},
-	Metadata: "backend/backend.proto",
-}
--- a/backend/cpp/grpc/Makefile
+++ b/backend/cpp/grpc/Makefile
@@ -5,7 +5,6 @@ SYSTEM ?= $(HOST_SYSTEM)
 TAG_LIB_GRPC?=v1.59.0
 GIT_REPO_LIB_GRPC?=https://github.com/grpc/grpc.git
 GIT_CLONE_DEPTH?=1
-NUM_BUILD_THREADS?=$(shell nproc --ignore=1)

 INSTALLED_PACKAGES=installed_packages
 GRPC_REPO=grpc_repo
@@ -52,7 +51,7 @@ $(GRPC_REPO):

 $(GRPC_BUILD): $(GRPC_REPO)
 	mkdir -p $(GRPC_BUILD)
-	cd $(GRPC_BUILD) && cmake $(CMAKE_ARGS) ../$(GRPC_REPO)/grpc && cmake --build . -- -j ${NUM_BUILD_THREADS} && cmake --build . --target install -- -j ${NUM_BUILD_THREADS}
+	cd $(GRPC_BUILD) && cmake $(CMAKE_ARGS) ../$(GRPC_REPO)/grpc && cmake --build . && cmake --build . --target install

 build: $(INSTALLED_PACKAGES)

--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -2332,6 +2332,10 @@ public:
                std::string completion_text = result.result_json.value("content", "");

                reply.set_message(completion_text);
+                int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0);
+                reply.set_tokens(tokens_predicted);
+                int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
+                reply.set_prompt_tokens(tokens_evaluated);

                // Send the reply
                writer->Write(reply);
@@ -2357,6 +2361,10 @@ public:
        task_result result = llama.queue_results.recv(task_id);
        if (!result.error && result.stop) {
            completion_text = result.result_json.value("content", "");
+            int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0);
+            int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
+            reply->set_prompt_tokens(tokens_evaluated);
+            reply->set_tokens(tokens_predicted);
            reply->set_message(completion_text);
        }
        else
--- a/backend/python/autogptq/Makefile
+++ b/backend/python/autogptq/Makefile
@@ -1,4 +1,13 @@
 .PHONY: autogptq
-autogptq:
+autogptq: protogen
 	$(MAKE) -C ../common-env/transformers

+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/autogptq/backend_pb2.py
+++ b/backend/python/autogptq/backend_pb2.py
--- a/backend/python/autogptq/backend_pb2_grpc.py
+++ b/backend/python/autogptq/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/bark/Makefile
+++ b/backend/python/bark/Makefile
@@ -1,15 +1,25 @@
 .PHONY: ttsbark
-ttsbark:
+ttsbark: protogen
 	$(MAKE) -C ../common-env/transformers

 .PHONY: run
-run:
+run: protogen
 	@echo "Running bark..."
 	bash run.sh
 	@echo "bark run."

 .PHONY: test
-test:
+test: protogen
 	@echo "Testing bark..."
 	bash test.sh
 	@echo "bark tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/bark/backend_pb2.py
+++ b/backend/python/bark/backend_pb2.py
--- a/backend/python/bark/backend_pb2_grpc.py
+++ b/backend/python/bark/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/common-env/transformers/install.sh
+++ b/backend/python/common-env/transformers/install.sh
@@ -2,6 +2,7 @@
 set -ex

 SKIP_CONDA=${SKIP_CONDA:-0}
+REQUIREMENTS_FILE=$1

 # Check if environment exist
 conda_env_exists(){
@@ -14,7 +15,7 @@ else
    export PATH=$PATH:/opt/conda/bin
    if conda_env_exists "transformers" ; then
        echo "Creating virtual environment..."
-        conda env create --name transformers --file $1
+        conda env create --name transformers --file "$REQUIREMENTS_FILE"
        echo "Virtual environment created."
    else 
        echo "Virtual environment already exists."
@@ -28,11 +29,16 @@ if [ -d "/opt/intel" ]; then
    pip install intel-extension-for-transformers datasets sentencepiece tiktoken neural_speed optimum[openvino]
 fi

-if [ "$PIP_CACHE_PURGE" = true ] ; then
-    if [ $SKIP_CONDA -eq 0 ]; then
-        # Activate conda environment
-        source activate transformers
-    fi
+# If we didn't skip conda, activate the environment
+# to install FlashAttention
+if [ "$SKIP_CONDA" -eq 0 ]; then
+    source activate transformers
+fi
+if [[ $REQUIREMENTS_FILE =~ -nvidia\.yml$ ]]; then
+    # TODO: FlashAttention is supported on nvidia and ROCm, but ROCm install can't be done this easily
+    pip install flash-attn --no-build-isolation
+fi

+if [ "$PIP_CACHE_PURGE" = true ] ; then
    pip cache purge
 fi
--- a/backend/python/common-env/transformers/transformers-nvidia.yml
+++ b/backend/python/common-env/transformers/transformers-nvidia.yml
@@ -116,8 +116,10 @@ dependencies:
      - sudachipy
      - sudachidict_core
      - vocos
-      - vllm==0.3.2
+      - vllm>=0.4.0
      - transformers>=4.38.2  # Updated Version
      - transformers_stream_generator==0.0.5
      - xformers==0.0.23.post1  
+      - rerankers[transformers]
+      - pydantic
 prefix: /opt/conda/envs/transformers
--- a/backend/python/common-env/transformers/transformers-rocm.yml
+++ b/backend/python/common-env/transformers/transformers-rocm.yml
@@ -104,8 +104,10 @@ dependencies:
      - sudachipy
      - sudachidict_core
      - vocos
-      - vllm==0.3.2
+      - vllm>=0.4.0
      - transformers>=4.38.2  # Updated Version
      - transformers_stream_generator==0.0.5
      - xformers==0.0.23.post1
+      - rerankers[transformers]
+      - pydantic
 prefix: /opt/conda/envs/transformers
--- a/backend/python/common-env/transformers/transformers.yml
+++ b/backend/python/common-env/transformers/transformers.yml
@@ -108,8 +108,10 @@ dependencies:
      - sudachipy
      - sudachidict_core
      - vocos
-      - vllm==0.3.2
+      - vllm>=0.4.0
      - transformers>=4.38.2  # Updated Version
      - transformers_stream_generator==0.0.5
-      - xformers==0.0.23.post1  
+      - xformers==0.0.23.post1
+      - rerankers[transformers]
+      - pydantic
 prefix: /opt/conda/envs/transformers
--- a/backend/python/coqui/Makefile
+++ b/backend/python/coqui/Makefile
@@ -1,15 +1,25 @@
 .PHONY: coqui
-coqui:
+coqui: protogen
 	$(MAKE) -C ../common-env/transformers

 .PHONY: run
-run:
+run: protogen
 	@echo "Running coqui..."
 	bash run.sh
 	@echo "coqui run."

 .PHONY: test
-test:
+test: protogen
 	@echo "Testing coqui..."
 	bash test.sh
 	@echo "coqui tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/coqui/backend_pb2.py
+++ b/backend/python/coqui/backend_pb2.py
--- a/backend/python/coqui/backend_pb2_grpc.py
+++ b/backend/python/coqui/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/diffusers/Makefile
+++ b/backend/python/diffusers/Makefile
@@ -12,15 +12,28 @@ export SKIP_CONDA=1
 endif

 .PHONY: diffusers
-diffusers:
+diffusers: protogen
 	@echo "Installing $(CONDA_ENV_PATH)..."
 	bash install.sh $(CONDA_ENV_PATH)

 .PHONY: run
-run:
+run: protogen
 	@echo "Running diffusers..."
 	bash run.sh
 	@echo "Diffusers run."

-test:
+.PHONY: test
+test: protogen
 	bash test.sh
+
+# Generate the Python gRPC stubs before installing, running or testing.
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+# Depend on the proto file so the stubs are regenerated whenever it changes.
+backend_pb2_grpc.py backend_pb2.py: ../../backend.proto
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/diffusers/backend_pb2.py
+++ b/backend/python/diffusers/backend_pb2.py
--- a/backend/python/diffusers/backend_pb2_grpc.py
+++ b/backend/python/diffusers/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/diffusers/diffusers-rocm.yml
+++ b/backend/python/diffusers/diffusers-rocm.yml
@@ -61,4 +61,5 @@ dependencies:
      - urllib3==2.0.6
      - zipp==3.17.0
      - torch
+      - opencv-python
 prefix: /opt/conda/envs/diffusers
--- a/backend/python/diffusers/diffusers.yml
+++ b/backend/python/diffusers/diffusers.yml
@@ -71,4 +71,5 @@ dependencies:
      - typing-extensions==4.8.0
      - urllib3==2.0.6
      - zipp==3.17.0
+      - opencv-python
 prefix: /opt/conda/envs/diffusers
--- a/backend/python/exllama/Makefile
+++ b/backend/python/exllama/Makefile
@@ -1,11 +1,23 @@
 export CONDA_ENV_PATH = "exllama.yml"

 .PHONY: exllama
-exllama:
+exllama: protogen
 	bash install.sh ${CONDA_ENV_PATH}

 .PHONY: run
-run:
+run: protogen
 	@echo "Running exllama..."
 	bash run.sh
 	@echo "exllama run."
+
+# Generate the Python gRPC stubs before installing or running.
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+# Depend on the proto file so the stubs are regenerated whenever it changes.
+backend_pb2_grpc.py backend_pb2.py: ../../backend.proto
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/exllama/backend_pb2.py
+++ b/backend/python/exllama/backend_pb2.py
--- a/backend/python/exllama/backend_pb2_grpc.py
+++ b/backend/python/exllama/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/exllama2/Makefile
+++ b/backend/python/exllama2/Makefile
@@ -1,10 +1,20 @@
 .PHONY: exllama2
-exllama2:
+exllama2: protogen
 	$(MAKE) -C ../common-env/transformers
 	bash install.sh

 .PHONY: run
-run:
+run: protogen
 	@echo "Running exllama2..."
 	bash run.sh
 	@echo "exllama2 run."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py: ../../backend.proto  # re-run protoc whenever the proto definition changes
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/exllama2/backend_pb2.py
+++ b/backend/python/exllama2/backend_pb2.py
--- a/backend/python/exllama2/backend_pb2_grpc.py
+++ b/backend/python/exllama2/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/mamba/Makefile
+++ b/backend/python/mamba/Makefile
@@ -1,16 +1,26 @@
 .PHONY: mamba
-mamba:
+mamba: protogen
 	$(MAKE) -C ../common-env/transformers
 	bash install.sh

 .PHONY: run
-run:
+run: protogen
 	@echo "Running mamba..."
 	bash run.sh
 	@echo "mamba run."

 .PHONY: test
-test:
+test: protogen
 	@echo "Testing mamba..."
 	bash test.sh
-	@echo "mamba tested."
+	@echo "mamba tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py: ../../backend.proto  # re-run protoc whenever the proto definition changes
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/mamba/backend_pb2.py
+++ b/backend/python/mamba/backend_pb2.py
--- a/backend/python/mamba/backend_pb2_grpc.py
+++ b/backend/python/mamba/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/parler-tts/Makefile
+++ b/backend/python/parler-tts/Makefile
@@ -0,0 +1,39 @@
+export CONDA_ENV_PATH = "parler.yml"
+SKIP_CONDA?=0
+ifeq ($(BUILD_TYPE), cublas)
+export CONDA_ENV_PATH = "parler-nvidia.yml"
+endif
+
+# Intel GPU are supposed to have dependencies installed in the main python
+# environment, so we skip conda installation for SYCL builds.
+# https://github.com/intel/intel-extension-for-pytorch/issues/538
+ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+export SKIP_CONDA=1
+endif
+
+.PHONY: parler-tts
+parler-tts: protogen
+	@echo "Installing $(CONDA_ENV_PATH)..."
+	bash install.sh $(CONDA_ENV_PATH)
+
+.PHONY: run
+run: protogen
+	@echo "Running parler-tts..."
+	bash run.sh
+	@echo "parler-tts run."
+
+.PHONY: test
+test: protogen
+	@echo "Testing parler-tts..."
+	bash test.sh
+	@echo "parler-tts tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/parler-tts/install.sh
+++ b/backend/python/parler-tts/install.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+set -ex
+
+SKIP_CONDA=${SKIP_CONDA:-0}
+
+# Succeeds when the named conda environment already exists.
+conda_env_exists(){
+    conda list --name "${@}" >/dev/null 2>/dev/null
+}
+
+if [ "$SKIP_CONDA" -eq 1 ]; then
+    echo "Skipping conda environment installation"
+else
+    export PATH=$PATH:/opt/conda/bin
+    if ! conda_env_exists "parler" ; then
+        echo "Creating virtual environment..."
+        conda env create --name parler --file "$1"
+        echo "Virtual environment created."
+    else
+        echo "Virtual environment already exists."
+    fi
+fi
+
+if [ "$SKIP_CONDA" -ne 1 ]; then
+    # Activate conda environment
+    source activate parler
+    # https://github.com/descriptinc/audiotools/issues/101
+    # incompatible protobuf versions. -f aborts on an HTTP error instead of saving the error page.
+    curl -fL https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o "$CONDA_PREFIX/lib/python3.11/site-packages/google/protobuf/internal/builder.py"
+fi
+
+if [ "$PIP_CACHE_PURGE" = true ] ; then
+    if [ "$SKIP_CONDA" -ne 1 ]; then
+        # Activate conda environment
+        source activate parler
+    fi
+
+    pip cache purge
+fi
--- a/backend/python/parler-tts/parler-nvidia.yml
+++ b/backend/python/parler-tts/parler-nvidia.yml
@@ -0,0 +1,48 @@
+name: parler
+channels:
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - bzip2=1.0.8=h7b6447c_0
+  - ca-certificates=2023.08.22=h06a4308_0
+  - ld_impl_linux-64=2.38=h1181459_1
+  - libffi=3.4.4=h6a678d5_0
+  - libgcc-ng=11.2.0=h1234567_1
+  - libgomp=11.2.0=h1234567_1
+  - libstdcxx-ng=11.2.0=h1234567_1
+  - libuuid=1.41.5=h5eee18b_0
+  - ncurses=6.4=h6a678d5_0
+  - openssl=3.0.11=h7f8727e_2
+  - pip=23.2.1=py311h06a4308_0
+  - python=3.11.5=h955ad1f_0
+  - readline=8.2=h5eee18b_0
+  - setuptools=68.0.0=py311h06a4308_0
+  - sqlite=3.41.2=h5eee18b_0
+  - tk=8.6.12=h1ccaba5_0
+  - tzdata=2023c=h04d1e81_0
+  - wheel=0.41.2=py311h06a4308_0
+  - xz=5.4.2=h5eee18b_0
+  - zlib=1.2.13=h5eee18b_0
+  - pip:
+      - accelerate>=0.11.0
+      - grpcio==1.59.0
+      - numpy==1.26.0
+      - nvidia-cublas-cu12==12.1.3.1
+      - nvidia-cuda-cupti-cu12==12.1.105
+      - nvidia-cuda-nvrtc-cu12==12.1.105
+      - nvidia-cuda-runtime-cu12==12.1.105
+      - nvidia-cudnn-cu12==8.9.2.26
+      - nvidia-cufft-cu12==11.0.2.54
+      - nvidia-curand-cu12==10.3.2.106
+      - nvidia-cusolver-cu12==11.4.5.107
+      - nvidia-cusparse-cu12==12.1.0.106
+      - nvidia-nccl-cu12==2.18.1
+      - nvidia-nvjitlink-cu12==12.2.140
+      - nvidia-nvtx-cu12==12.1.105
+      - torch==2.1.0
+      - transformers>=4.34.0
+      - descript-audio-codec
+      - sentencepiece
+      - git+https://github.com/huggingface/parler-tts.git@10016fb0300c0dc31a0fb70e26f3affee7b62f16
+prefix: /opt/conda/envs/parler
--- a/backend/python/parler-tts/parler.yml
+++ b/backend/python/parler-tts/parler.yml
@@ -0,0 +1,36 @@
+name: parler
+channels:
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - bzip2=1.0.8=h7b6447c_0
+  - ca-certificates=2023.08.22=h06a4308_0
+  - ld_impl_linux-64=2.38=h1181459_1
+  - libffi=3.4.4=h6a678d5_0
+  - libgcc-ng=11.2.0=h1234567_1
+  - libgomp=11.2.0=h1234567_1
+  - libstdcxx-ng=11.2.0=h1234567_1
+  - libuuid=1.41.5=h5eee18b_0
+  - ncurses=6.4=h6a678d5_0
+  - openssl=3.0.11=h7f8727e_2
+  - pip=23.2.1=py311h06a4308_0
+  - python=3.11.5=h955ad1f_0
+  - readline=8.2=h5eee18b_0
+  - setuptools=68.0.0=py311h06a4308_0
+  - sqlite=3.41.2=h5eee18b_0
+  - tk=8.6.12=h1ccaba5_0
+  - tzdata=2023c=h04d1e81_0
+  - wheel=0.41.2=py311h06a4308_0
+  - xz=5.4.2=h5eee18b_0
+  - zlib=1.2.13=h5eee18b_0
+  - pip:
+      - accelerate>=0.11.0
+      - numpy==1.26.0
+      - grpcio==1.59.0
+      - torch==2.1.0
+      - transformers>=4.34.0
+      - descript-audio-codec
+      - sentencepiece
+      - git+https://github.com/huggingface/parler-tts.git@10016fb0300c0dc31a0fb70e26f3affee7b62f16
+prefix: /opt/conda/envs/parler
--- a/backend/python/parler-tts/parler_tts_server.py
+++ b/backend/python/parler-tts/parler_tts_server.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+"""
+Extra gRPC server for Parler-TTS (ParlerTTSForConditionalGeneration) models.
+"""
+from concurrent import futures
+
+import argparse
+import signal
+import sys
+import os
+
+import time
+import backend_pb2
+import backend_pb2_grpc
+
+import grpc
+
+from scipy.io.wavfile import write as write_wav
+
+from parler_tts import ParlerTTSForConditionalGeneration
+from transformers import AutoTokenizer
+import soundfile as sf  
+import torch
+
+_ONE_DAY_IN_SECONDS = 60 * 60 * 24
+
+# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
+MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
+
+# Implement the BackendServicer class with the service methods
+class BackendServicer(backend_pb2_grpc.BackendServicer):
+    """
+    A gRPC servicer for the backend service.
+
+    This class implements the gRPC methods for the backend service, including Health, LoadModel, and Embedding.
+    """
+    def Health(self, request, context):
+        """
+        A gRPC method that returns the health status of the backend service.
+
+        Args:
+            request: A HealthRequest object that contains the request parameters.
+            context: A grpc.ServicerContext object that provides information about the RPC.
+
+        Returns:
+            A Reply object that contains the health status of the backend service.
+        """
+        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
+
+    def LoadModel(self, request, context):
+        """
+        A gRPC method that loads a model into memory.
+
+        Args:
+            request: A LoadModelRequest object that contains the request parameters.
+            context: A grpc.ServicerContext object that provides information about the RPC.
+
+        Returns:
+            A Result object that contains the result of the LoadModel operation.
+        """
+        model_name = request.Model
+        device = "cuda:0" if torch.cuda.is_available() else "cpu"
+        try:
+            self.model = ParlerTTSForConditionalGeneration.from_pretrained(model_name).to(device)
+            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+
+        return backend_pb2.Result(message="Model loaded successfully", success=True)
+
+    def TTS(self, request, context):
+        model_name = request.model
+        voice = request.voice
+        if voice == "":
+            voice = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
+        if model_name == "":
+            return backend_pb2.Result(success=False, message="request.model is required")
+        try:
+            device = "cuda:0" if torch.cuda.is_available() else "cpu"
+            input_ids = self.tokenizer(voice, return_tensors="pt").input_ids.to(device)
+            prompt_input_ids = self.tokenizer(request.text, return_tensors="pt").input_ids.to(device)
+           
+            generation = self.model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+            audio_arr = generation.cpu().numpy().squeeze()
+            print("[parler-tts] TTS generated!", file=sys.stderr)
+            sf.write(request.dst, audio_arr, self.model.config.sampling_rate)
+            print("[parler-tts] TTS saved to", request.dst, file=sys.stderr)
+            print("[parler-tts] TTS for", file=sys.stderr)
+            print(request, file=sys.stderr)
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+        return backend_pb2.Result(success=True)
+
+
+def serve(address):
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
+    server.add_insecure_port(address)
+    server.start()
+    print("[parler-tts] Server started. Listening on: " + address, file=sys.stderr)
+
+    # Define the signal handler function
+    def signal_handler(sig, frame):
+        print("[parler-tts] Received termination signal. Shutting down...")
+        server.stop(0)
+        sys.exit(0)
+
+    # Set the signal handlers for SIGINT and SIGTERM
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    try:
+        while True:
+            time.sleep(_ONE_DAY_IN_SECONDS)
+    except KeyboardInterrupt:
+        server.stop(0)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run the gRPC server.")
+    parser.add_argument(
+        "--addr", default="localhost:50051", help="The address to bind the server to."
+    )
+    args = parser.parse_args()
+    print(f"[parler-tts] startup: {args}", file=sys.stderr)
+    serve(args.addr)
--- a/backend/python/parler-tts/run.sh
+++ b/backend/python/parler-tts/run.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+##
+## A bash script wrapper that runs the parler-tts server with conda
+
+echo "Launching gRPC server for parler-tts"
+
+export PATH=$PATH:/opt/conda/bin
+
+# Activate conda environment
+source activate parler
+
+# get the directory where the bash script is located
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+# Quote the path and the arguments so values containing spaces survive intact.
+python "$DIR/parler_tts_server.py" "$@"
--- a/backend/python/parler-tts/test.sh
+++ b/backend/python/parler-tts/test.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+##
+## A bash script wrapper that runs the parler-tts tests with conda
+
+# Activate conda environment
+source activate parler
+
+# get the directory where the bash script is located
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+python -m unittest "$DIR/test_parler.py"
--- a/backend/python/parler-tts/test_parler.py
+++ b/backend/python/parler-tts/test_parler.py
@@ -0,0 +1,81 @@
+"""
+A test script to test the gRPC service
+"""
+import unittest
+import subprocess
+import time
+import backend_pb2
+import backend_pb2_grpc
+
+import grpc
+
+
+class TestBackendServicer(unittest.TestCase):
+    """
+    TestBackendServicer is the class that tests the gRPC service
+    """
+    def setUp(self):
+        """
+        This method sets up the gRPC service by starting the server
+        """
+        self.service = subprocess.Popen(["python3", "parler_tts_server.py", "--addr", "localhost:50051"])
+        time.sleep(10)
+
+    def tearDown(self) -> None:
+        """
+        This method tears down the gRPC service by terminating the server
+        """
+        self.service.terminate()
+        self.service.wait()
+
+    def test_server_startup(self):
+        """
+        This method tests if the server starts up successfully
+        """
+        try:
+            self.setUp()
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.Health(backend_pb2.HealthMessage())
+                self.assertEqual(response.message, b'OK')
+        except Exception as err:
+            print(err)
+            self.fail("Server failed to start")
+        finally:
+            self.tearDown()
+
+    def test_load_model(self):
+        """
+        This method tests if the model is loaded successfully
+        """
+        try:
+            self.setUp()
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="parler-tts/parler_tts_mini_v0.1"))
+                self.assertTrue(response.success)
+                self.assertEqual(response.message, "Model loaded successfully")
+        except Exception as err:
+            print(err)
+            self.fail("LoadModel service failed")
+        finally:
+            self.tearDown()
+
+    def test_tts(self):
+        """
+        This method tests if the embeddings are generated successfully
+        """
+        try:
+            self.setUp()
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="parler-tts/parler_tts_mini_v0.1"))
+                self.assertTrue(response.success)
+                tts_request = backend_pb2.TTSRequest(text="Hey, how are you doing today?")
+                tts_response = stub.TTS(tts_request)
+                self.assertIsNotNone(tts_response)
+        except Exception as err:
+            print(err)
+            self.fail("TTS service failed")
+        finally:
+            self.tearDown()
--- a/backend/python/petals/Makefile
+++ b/backend/python/petals/Makefile
@@ -1,17 +1,27 @@
 .PHONY: petals
-petals:
+petals: protogen
 	@echo "Creating virtual environment..."
 	bash install.sh "petals.yml"
 	@echo "Virtual environment created."

 .PHONY: run
-run:
+run: protogen
 	@echo "Running petals..."
 	bash run.sh
 	@echo "petals run."

 .PHONY: test
-test:
+test: protogen
 	@echo "Testing petals..."
 	bash test.sh
 	@echo "petals tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/petals/backend_pb2.py
+++ b/backend/python/petals/backend_pb2.py
--- a/backend/python/petals/backend_pb2_grpc.py
+++ b/backend/python/petals/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/rerankers/Makefile
+++ b/backend/python/rerankers/Makefile
@@ -0,0 +1,27 @@
+.PHONY: rerankers
+rerankers: protogen
+	$(MAKE) -C ../common-env/transformers
+
+
+.PHONY: run
+run: protogen
+	@echo "Running rerankers..."
+	bash run.sh
+	@echo "rerankers run."
+
+# Running the tests from the command line does not work well; they only work from an IDE such as VSCode.
+.PHONY: test
+test: protogen
+	@echo "Testing rerankers..."
+	bash test.sh
+	@echo "rerankers tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/rerankers/README.md
+++ b/backend/python/rerankers/README.md
@@ -0,0 +1,5 @@
+# Creating a separate environment for the reranker project
+
+```
+make rerankers
+```
--- a/backend/python/rerankers/reranker.py
+++ b/backend/python/rerankers/reranker.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+"""
+Extra gRPC server for Rerankers models.
+"""
+from concurrent import futures
+
+import argparse
+import signal
+import sys
+import os
+
+import time
+import backend_pb2
+import backend_pb2_grpc
+
+import grpc
+
+from rerankers import Reranker
+
+_ONE_DAY_IN_SECONDS = 60 * 60 * 24
+
+# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
+MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
+
+# Implement the BackendServicer class with the service methods
+class BackendServicer(backend_pb2_grpc.BackendServicer):
+    """
+    A gRPC servicer for the backend service.
+
+    This class implements the gRPC methods for the backend service, including Health, LoadModel, and Rerank.
+    """
+    def Health(self, request, context):
+        """
+        A gRPC method that returns the health status of the backend service.
+
+        Args:
+            request: A HealthRequest object that contains the request parameters.
+            context: A grpc.ServicerContext object that provides information about the RPC.
+
+        Returns:
+            A Reply object that contains the health status of the backend service.
+        """
+        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
+
+    def LoadModel(self, request, context):
+        """
+        A gRPC method that loads a model into memory.
+
+        Args:
+            request: A LoadModelRequest object that contains the request parameters.
+            context: A grpc.ServicerContext object that provides information about the RPC.
+
+        Returns:
+            A Result object that contains the result of the LoadModel operation.
+        """
+        model_name = request.Model
+        try:
+            kwargs = {}
+            if request.Type != "":
+                kwargs['model_type'] = request.Type
+            if request.PipelineType != "": # Reuse the PipelineType field for language
+                kwargs['lang'] = request.PipelineType
+            self.model_name = model_name
+            self.model = Reranker(model_name, **kwargs)  
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+
+        # Implement your logic here for the LoadModel service
+        # Replace this with your desired response
+        return backend_pb2.Result(message="Model loaded successfully", success=True)
+
+    def Rerank(self, request, context):
+        documents = []
+        for idx, doc in enumerate(request.documents):
+            documents.append(doc)
+        ranked_results=self.model.rank(query=request.query, docs=documents, doc_ids=list(range(len(request.documents))))
+        # Prepare results to return
+        results = [
+            backend_pb2.DocumentResult(
+                index=res.doc_id,
+                text=res.text,
+                relevance_score=res.score
+            ) for res in ranked_results.results
+        ]
+
+        # Calculate the usage and total tokens
+        # TODO: Implement the usage calculation with reranker
+        total_tokens = sum(len(doc.split()) for doc in request.documents) + len(request.query.split())
+        prompt_tokens = len(request.query.split())
+        usage = backend_pb2.Usage(total_tokens=total_tokens, prompt_tokens=prompt_tokens)
+        return backend_pb2.RerankResult(usage=usage, results=results)
+
+def serve(address):
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
+    server.add_insecure_port(address)
+    server.start()
+    print("Server started. Listening on: " + address, file=sys.stderr)
+
+    # Define the signal handler function
+    def signal_handler(sig, frame):
+        print("Received termination signal. Shutting down...")
+        server.stop(0)
+        sys.exit(0)
+
+    # Set the signal handlers for SIGINT and SIGTERM
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    try:
+        while True:
+            time.sleep(_ONE_DAY_IN_SECONDS)
+    except KeyboardInterrupt:
+        server.stop(0)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run the gRPC server.")
+    parser.add_argument(
+        "--addr", default="localhost:50051", help="The address to bind the server to."
+    )
+    args = parser.parse_args()
+
+    serve(args.addr)
--- a/backend/python/rerankers/run.sh
+++ b/backend/python/rerankers/run.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+##
+## A bash script wrapper that runs the reranker server with conda
+
+export PATH=$PATH:/opt/conda/bin
+
+# Activate conda environment
+source activate transformers
+
+# get the directory where the bash script is located
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+python $DIR/reranker.py $@
--- a/backend/python/rerankers/test.sh
+++ b/backend/python/rerankers/test.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+##
+## A bash script wrapper that runs the reranker tests with conda
+
+# Activate conda environment
+source activate transformers
+
+# get the directory where the bash script is located
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+python -m unittest $DIR/test_reranker.py
--- a/backend/python/rerankers/test_reranker.py
+++ b/backend/python/rerankers/test_reranker.py
@@ -0,0 +1,90 @@
+"""
+A test script to test the gRPC service
+"""
+import unittest
+import subprocess
+import time
+import backend_pb2
+import backend_pb2_grpc
+
+import grpc
+
+
+class TestBackendServicer(unittest.TestCase):
+    """
+    TestBackendServicer is the class that tests the gRPC service
+    """
+    def setUp(self):
+        """
+        This method sets up the gRPC service by starting the server
+        """
+        self.service = subprocess.Popen(["python3", "reranker.py", "--addr", "localhost:50051"])
+        time.sleep(10)
+
+    def tearDown(self) -> None:
+        """
+        This method tears down the gRPC service by terminating the server
+        """
+        self.service.kill()
+        self.service.wait()
+
+    def test_server_startup(self):
+        """
+        This method tests if the server starts up successfully
+        """
+        try:
+            self.setUp()
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.Health(backend_pb2.HealthMessage())
+                self.assertEqual(response.message, b'OK')
+        except Exception as err:
+            print(err)
+            self.fail("Server failed to start")
+        finally:
+            self.tearDown()
+
+    def test_load_model(self):
+        """
+        This method tests if the model is loaded successfully
+        """
+        try:
+            self.setUp()
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="cross-encoder"))
+                self.assertTrue(response.success)
+                self.assertEqual(response.message, "Model loaded successfully")
+        except Exception as err:
+            print(err)
+            self.fail("LoadModel service failed")
+        finally:
+            self.tearDown()
+
+    def test_rerank(self):
+        """
+        This method tests if the documents are reranked successfully
+        """
+        try:
+            self.setUp()
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                request = backend_pb2.RerankRequest(
+                    query="I love you",
+                    documents=["I hate you", "I really like you"],
+                    top_n=2
+                )
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="cross-encoder"))
+                self.assertTrue(response.success)
+               
+                rerank_response = stub.Rerank(request)
+                print(rerank_response.results[0])
+                self.assertIsNotNone(rerank_response.results)
+                self.assertEqual(len(rerank_response.results), 2)
+                self.assertEqual(rerank_response.results[0].text, "I really like you")
+                self.assertEqual(rerank_response.results[1].text, "I hate you")
+        except Exception as err:
+            print(err)
+            self.fail("Reranker service failed")
+        finally:
+            self.tearDown()
--- a/backend/python/sentencetransformers/Makefile
+++ b/backend/python/sentencetransformers/Makefile
@@ -1,17 +1,27 @@
 .PHONY: sentencetransformers
-sentencetransformers:
+sentencetransformers: protogen
 	$(MAKE) -C ../common-env/transformers


 .PHONY: run
-run:
+run: protogen
 	@echo "Running sentencetransformers..."
 	bash run.sh
 	@echo "sentencetransformers run."

 # It is not working well by using command line. It only6 works with IDE like VSCode.
 .PHONY: test
-test:
+test: protogen
 	@echo "Testing sentencetransformers..."
 	bash test.sh
 	@echo "sentencetransformers tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/sentencetransformers/backend_pb2.py
+++ b/backend/python/sentencetransformers/backend_pb2.py
--- a/backend/python/sentencetransformers/backend_pb2_grpc.py
+++ b/backend/python/sentencetransformers/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/transformers-musicgen/Makefile
+++ b/backend/python/transformers-musicgen/Makefile
@@ -1,16 +1,25 @@
-
 .PHONY: transformers-musicgen
-transformers-musicgen:
+transformers-musicgen: protogen
 	$(MAKE) -C ../common-env/transformers

 .PHONY: run
-run:
+run: protogen
 	@echo "Running transformers..."
 	bash run.sh
 	@echo "transformers run."

 .PHONY: test
-test:
+test: protogen
 	@echo "Testing transformers..."
 	bash test.sh
 	@echo "transformers tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/transformers-musicgen/backend_pb2.py
+++ b/backend/python/transformers-musicgen/backend_pb2.py
--- a/backend/python/transformers-musicgen/backend_pb2_grpc.py
+++ b/backend/python/transformers-musicgen/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/transformers-musicgen/run.sh
+++ b/backend/python/transformers-musicgen/run.sh
@@ -8,7 +8,7 @@ echo "Launching gRPC server for transformers-musicgen"
 export PATH=$PATH:/opt/conda/bin

 # Activate conda environment
-source activate transformers-musicgen
+source activate transformers

 # get the directory where the bash script is located
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
--- a/backend/python/transformers/Makefile
+++ b/backend/python/transformers/Makefile
@@ -1,16 +1,26 @@
 .PHONY: transformers
-transformers:
+transformers: protogen
 	$(MAKE) -C ../common-env/transformers

 .PHONY: run
-run:
+run: protogen
 	@echo "Running transformers..."
 	bash run.sh
 	@echo "transformers run."

 # It is not working well by using command line. It only6 works with IDE like VSCode.
 .PHONY: test
-test:
+test: protogen
 	@echo "Testing transformers..."
 	bash test.sh
 	@echo "transformers tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/transformers/backend_pb2.py
+++ b/backend/python/transformers/backend_pb2.py
--- a/backend/python/transformers/backend_pb2_grpc.py
+++ b/backend/python/transformers/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/transformers/transformers_server.py
+++ b/backend/python/transformers/transformers_server.py
@@ -148,7 +148,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                else:
                    device_map="CPU"
                self.model = OVModelForCausalLM.from_pretrained(model_name, 
-                                                                compile=True, 
+                                                                compile=True,
+                                                                trust_remote_code=request.TrustRemoteCode,
+                                                                ov_config={"PERFORMANCE_HINT": "LATENCY"}, 
                                                                device=device_map)
                self.OV = True
            else:
@@ -158,6 +160,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                                                       quantization_config=quantization, 
                                                       device_map=device_map, 
                                                       torch_dtype=compute)
+            if request.ContextSize > 0:
+                self.max_tokens = request.ContextSize
+            else:
+                self.max_tokens = self.model.config.max_position_embeddings
+ 
            self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_safetensors=True)
            self.XPU = False

@@ -212,12 +219,27 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        set_seed(request.Seed)
        if request.TopP == 0:
            request.TopP = 0.9
+        
+        if request.TopK == 0:
+            request.TopK = 40
+
+        prompt = request.Prompt
+        if not request.Prompt and request.UseTokenizerTemplate and request.Messages:    
+            prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
+
+        eos_token_id = self.tokenizer.eos_token_id
+        if request.StopPrompts:
+            eos_token_id = []
+            for word in request.StopPrompts:
+                eos_token_id.append(self.tokenizer.convert_tokens_to_ids(word))
+
+        inputs = self.tokenizer(prompt, return_tensors="pt")

-        max_tokens = 200
        if request.Tokens > 0:
            max_tokens = request.Tokens
+        else:
+            max_tokens = self.max_tokens - inputs["input_ids"].size()[inputs["input_ids"].dim()-1]

-        inputs = self.tokenizer(request.Prompt, return_tensors="pt")
        if self.CUDA:
            inputs = inputs.to("cuda")
        if XPU and self.OV == False:
@@ -235,7 +257,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                        top_k=request.TopK, 
                        do_sample=True,
                        attention_mask=inputs["attention_mask"],
-                        eos_token_id=self.tokenizer.eos_token_id,
+                        eos_token_id=eos_token_id,
                        pad_token_id=self.tokenizer.eos_token_id,
                        streamer=streamer)
            thread=Thread(target=self.model.generate, kwargs=config)
@@ -264,7 +286,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                        top_k=request.TopK, 
                        do_sample=True,
                        attention_mask=inputs["attention_mask"],
-                        eos_token_id=self.tokenizer.eos_token_id,
+                        eos_token_id=eos_token_id,
                        pad_token_id=self.tokenizer.eos_token_id)
            generated_text = self.tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]

--- a/backend/python/vall-e-x/Makefile
+++ b/backend/python/vall-e-x/Makefile
@@ -3,18 +3,28 @@ export SKIP_CONDA=1
 endif

 .PHONY: ttsvalle
-ttsvalle:
+ttsvalle: protogen
 	$(MAKE) -C ../common-env/transformers
 	bash install.sh

 .PHONY: run
-run:
+run: protogen
 	@echo "Running ttsvalle..."
 	bash run.sh
 	@echo "ttsvalle run."

 .PHONY: test
-test:
+test: protogen
 	@echo "Testing valle..."
 	bash test.sh
 	@echo "valle tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py: ../../backend.proto  # regenerate the stubs whenever the proto definition changes
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/vall-e-x/backend_pb2.py
+++ b/backend/python/vall-e-x/backend_pb2.py
--- a/backend/python/vall-e-x/backend_pb2_grpc.py
+++ b/backend/python/vall-e-x/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/vllm/Makefile
+++ b/backend/python/vllm/Makefile
@@ -1,15 +1,25 @@
 .PHONY: vllm
-vllm:
+vllm: protogen
 	$(MAKE) -C ../common-env/transformers

 .PHONY: run
-run:
+run: protogen
 	@echo "Running vllm..."
 	bash run.sh
 	@echo "vllm run."

 .PHONY: test
-test:
+test: protogen
 	@echo "Testing vllm..."
 	bash test.sh
-	@echo "vllm tested."
+	@echo "vllm tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py: ../../backend.proto  # regenerate the stubs whenever the proto definition changes
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/vllm/backend_pb2.py
+++ b/backend/python/vllm/backend_pb2.py
--- a/backend/python/vllm/backend_pb2_grpc.py
+++ b/backend/python/vllm/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-"""Client and server classes corresponding to protobuf-defined services."""
-import grpc
-
-import backend_pb2 as backend__pb2
-
-
-class BackendStub(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def __init__(self, channel):
-        """Constructor.
-
-        Args:
-            channel: A grpc.Channel.
-        """
-        self.Health = channel.unary_unary(
-                '/backend.Backend/Health',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Predict = channel.unary_unary(
-                '/backend.Backend/Predict',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.LoadModel = channel.unary_unary(
-                '/backend.Backend/LoadModel',
-                request_serializer=backend__pb2.ModelOptions.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.PredictStream = channel.unary_stream(
-                '/backend.Backend/PredictStream',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.Reply.FromString,
-                )
-        self.Embedding = channel.unary_unary(
-                '/backend.Backend/Embedding',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.EmbeddingResult.FromString,
-                )
-        self.GenerateImage = channel.unary_unary(
-                '/backend.Backend/GenerateImage',
-                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.AudioTranscription = channel.unary_unary(
-                '/backend.Backend/AudioTranscription',
-                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
-                response_deserializer=backend__pb2.TranscriptResult.FromString,
-                )
-        self.TTS = channel.unary_unary(
-                '/backend.Backend/TTS',
-                request_serializer=backend__pb2.TTSRequest.SerializeToString,
-                response_deserializer=backend__pb2.Result.FromString,
-                )
-        self.TokenizeString = channel.unary_unary(
-                '/backend.Backend/TokenizeString',
-                request_serializer=backend__pb2.PredictOptions.SerializeToString,
-                response_deserializer=backend__pb2.TokenizationResponse.FromString,
-                )
-        self.Status = channel.unary_unary(
-                '/backend.Backend/Status',
-                request_serializer=backend__pb2.HealthMessage.SerializeToString,
-                response_deserializer=backend__pb2.StatusResponse.FromString,
-                )
-
-
-class BackendServicer(object):
-    """Missing associated documentation comment in .proto file."""
-
-    def Health(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Predict(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def LoadModel(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def PredictStream(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Embedding(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def GenerateImage(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def AudioTranscription(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TTS(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def TokenizeString(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-    def Status(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
-
-def add_BackendServicer_to_server(servicer, server):
-    rpc_method_handlers = {
-            'Health': grpc.unary_unary_rpc_method_handler(
-                    servicer.Health,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Predict': grpc.unary_unary_rpc_method_handler(
-                    servicer.Predict,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'LoadModel': grpc.unary_unary_rpc_method_handler(
-                    servicer.LoadModel,
-                    request_deserializer=backend__pb2.ModelOptions.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'PredictStream': grpc.unary_stream_rpc_method_handler(
-                    servicer.PredictStream,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.Reply.SerializeToString,
-            ),
-            'Embedding': grpc.unary_unary_rpc_method_handler(
-                    servicer.Embedding,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
-            ),
-            'GenerateImage': grpc.unary_unary_rpc_method_handler(
-                    servicer.GenerateImage,
-                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
-                    servicer.AudioTranscription,
-                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
-                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
-            ),
-            'TTS': grpc.unary_unary_rpc_method_handler(
-                    servicer.TTS,
-                    request_deserializer=backend__pb2.TTSRequest.FromString,
-                    response_serializer=backend__pb2.Result.SerializeToString,
-            ),
-            'TokenizeString': grpc.unary_unary_rpc_method_handler(
-                    servicer.TokenizeString,
-                    request_deserializer=backend__pb2.PredictOptions.FromString,
-                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
-            ),
-            'Status': grpc.unary_unary_rpc_method_handler(
-                    servicer.Status,
-                    request_deserializer=backend__pb2.HealthMessage.FromString,
-                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
-            ),
-    }
-    generic_handler = grpc.method_handlers_generic_handler(
-            'backend.Backend', rpc_method_handlers)
-    server.add_generic_rpc_handlers((generic_handler,))
-
-
- # This class is part of an EXPERIMENTAL API.
-class Backend(object):
-    """Missing associated documentation comment in .proto file."""
-
-    @staticmethod
-    def Health(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Predict(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def LoadModel(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
-            backend__pb2.ModelOptions.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def PredictStream(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.Reply.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Embedding(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.EmbeddingResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def GenerateImage(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
-            backend__pb2.GenerateImageRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def AudioTranscription(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
-            backend__pb2.TranscriptRequest.SerializeToString,
-            backend__pb2.TranscriptResult.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TTS(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
-            backend__pb2.TTSRequest.SerializeToString,
-            backend__pb2.Result.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def TokenizeString(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
-            backend__pb2.PredictOptions.SerializeToString,
-            backend__pb2.TokenizationResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
-
-    @staticmethod
-    def Status(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
-            backend__pb2.HealthMessage.SerializeToString,
-            backend__pb2.StatusResponse.FromString,
-            options, channel_credentials,
-            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/vllm/backend_vllm.py
+++ b/backend/python/vllm/backend_vllm.py
@@ -14,6 +14,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid
+from vllm.transformers_utils.tokenizer import get_tokenizer

 _ONE_DAY_IN_SECONDS = 60 * 60 * 24

@@ -71,7 +72,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        """
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))

-    def LoadModel(self, request, context):
+    async def LoadModel(self, request, context):
        """
        Loads a language model.

@@ -94,6 +95,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            engine_args.trust_remote_code = request.TrustRemoteCode
        if request.EnforceEager:
            engine_args.enforce_eager = request.EnforceEager
+        if request.TensorParallelSize:
+            engine_args.tensor_parallel_size = request.TensorParallelSize
        if request.SwapSpace != 0:
            engine_args.swap_space = request.SwapSpace
        if request.MaxModelLen != 0:
@@ -103,6 +106,18 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            self.llm = AsyncLLMEngine.from_engine_args(engine_args)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+
+        try:
+           engine_model_config = await self.llm.get_model_config()
+           self.tokenizer = get_tokenizer(
+               engine_model_config.tokenizer,
+               tokenizer_mode=engine_model_config.tokenizer_mode,
+               trust_remote_code=engine_model_config.trust_remote_code,
+               truncation_side="left",
+           )
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+
        return backend_pb2.Result(message="Model loaded successfully", success=True)

    async def Predict(self, request, context):
@@ -161,9 +176,15 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        if request.Seed != 0:
            sampling_params.seed = request.Seed

+        prompt = request.Prompt
+        
+        # If the tokenizer template is enabled and messages are provided instead of a prompt, apply the tokenizer template
+        if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
+            prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
+
        # Generate text
        request_id = random_uuid()
-        outputs = self.llm.generate(request.Prompt, sampling_params, request_id)
+        outputs = self.llm.generate(prompt, sampling_params, request_id)

        # Stream the results
        generated_text = ""
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -2,6 +2,7 @@ package backend

 import (
 	"context"
+	"fmt"
 	"os"
 	"regexp"
 	"strings"
@@ -9,9 +10,11 @@ import (
 	"unicode/utf8"

 	"github.com/go-skynet/LocalAI/core/config"
+	"github.com/go-skynet/LocalAI/core/schema"

 	"github.com/go-skynet/LocalAI/pkg/gallery"
 	"github.com/go-skynet/LocalAI/pkg/grpc"
+	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/go-skynet/LocalAI/pkg/utils"
 )
@@ -26,7 +29,7 @@ type TokenUsage struct {
 	Completion int
 }

-func ModelInference(ctx context.Context, s string, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
+func ModelInference(ctx context.Context, s string, messages []schema.Message, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
 	modelFile := c.Model
 	threads := c.Threads
 	if *threads == 0 && o.Threads != 0 {
@@ -71,10 +74,30 @@ func ModelInference(ctx context.Context, s string, images []string, loader *mode
 		return nil, err
 	}

+	var protoMessages []*proto.Message
+	// if we are using the tokenizer template, we need to convert the messages to proto messages
+	// unless the prompt has already been tokenized (non-chat endpoints + functions)
+	if c.TemplateConfig.UseTokenizerTemplate && s == "" {
+		protoMessages = make([]*proto.Message, len(messages), len(messages))
+		for i, message := range messages {
+			protoMessages[i] = &proto.Message{
+				Role: message.Role,
+			}
+			switch ct := message.Content.(type) {
+			case string:
+				protoMessages[i].Content = ct
+			default:
+				return nil, fmt.Errorf("Unsupported type for schema.Message.Content for inference: %T", ct)
+			}
+		}
+	}
+
 	// in GRPC, the backend is supposed to answer to 1 single token if stream is not supported
 	fn := func() (LLMResponse, error) {
 		opts := gRPCPredictOpts(c, loader.ModelPath)
 		opts.Prompt = s
+		opts.Messages = protoMessages
+		opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
 		opts.Images = images

 		tokenUsage := TokenUsage{}
@@ -130,6 +153,12 @@ func ModelInference(ctx context.Context, s string, images []string, loader *mode
 			if err != nil {
 				return LLMResponse{}, err
 			}
+			if tokenUsage.Prompt == 0 {
+				tokenUsage.Prompt = int(reply.PromptTokens)
+			}
+			if tokenUsage.Completion == 0 {
+				tokenUsage.Completion = int(reply.Tokens)
+			}
 			return LLMResponse{
 				Response: string(reply.Message),
 				Usage:    tokenUsage,
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -74,6 +74,7 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		EnforceEager:         c.EnforceEager,
 		SwapSpace:            int32(c.SwapSpace),
 		MaxModelLen:          int32(c.MaxModelLen),
+		TensorParallelSize:   int32(c.TensorParallelSize),
 		MMProj:               c.MMProj,
 		YarnExtFactor:        c.YarnExtFactor,
 		YarnAttnFactor:       c.YarnAttnFactor,
--- a/core/backend/rerank.go
+++ b/core/backend/rerank.go
@@ -0,0 +1,39 @@
+package backend
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/go-skynet/LocalAI/core/config"
+	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	model "github.com/go-skynet/LocalAI/pkg/model"
+)
+
+// Rerank loads modelFile on the named backend and forwards request to the loaded model's Rerank call.
+// It fails when backend is empty, when the loader reports an error, or when the loader yields no model.
+func Rerank(backend, modelFile string, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
+	if backend == "" {
+		return nil, fmt.Errorf("backend is required")
+	}
+
+	grpcOpts := gRPCModelOpts(backendConfig)
+	loadOpts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
+		model.WithBackendString(backend),
+		model.WithModel(modelFile),
+		model.WithContext(appConfig.Context),
+		model.WithAssetDir(appConfig.AssetsDestination),
+		model.WithLoadGRPCLoadModelOpts(grpcOpts),
+	})
+
+	rerankModel, err := loader.BackendLoader(loadOpts...)
+	switch {
+	case err != nil:
+		return nil, err
+	case rerankModel == nil:
+		return nil, fmt.Errorf("could not load rerank model")
+	}
+
+	return rerankModel.Rerank(context.Background(), request)
+}
--- a/core/cli/cli.go
+++ b/core/cli/cli.go
@@ -0,0 +1,20 @@
+package cli
+
+import "embed"
+
+// Context carries the logging flags and the BackendAssets file system; it is
+// embedded in CLI and handed to the commands' Run methods.
+type Context struct {
+	Debug    bool    `env:"LOCALAI_DEBUG,DEBUG" default:"false" hidden:"" help:"DEPRECATED, use --log-level=debug instead. Enable debug logging"`
+	LogLevel *string `env:"LOCALAI_LOG_LEVEL" enum:"error,warn,info,debug,trace" help:"Set the level of logs to output [${enum}]"`
+
+	// This field is not a command line argument/flag, the struct tag excludes it from the parsed CLI
+	BackendAssets embed.FS `kong:"-"`
+}
+
+// CLI is the root of the command line: the embedded Context flags plus one field per subcommand.
+var CLI struct {
+	Context `embed:""`
+
+	Run        RunCMD        `cmd:"" help:"Run LocalAI, this is the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
+	Models     ModelsCMD     `cmd:"" help:"Manage LocalAI models and definitions"`
+	TTS        TTSCMD        `cmd:"" help:"Convert text to speech"`
+	Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"`
+}
--- a/core/cli/models.go
+++ b/core/cli/models.go
@@ -0,0 +1,74 @@
+package cli
+
+import (
+	"encoding/json"
+	"fmt"
+
+	"github.com/go-skynet/LocalAI/pkg/gallery"
+	"github.com/rs/zerolog/log"
+	"github.com/schollz/progressbar/v3"
+)
+
+// ModelsCMDFlags holds the gallery and storage flags embedded by both ModelsList and ModelsInstall.
+type ModelsCMDFlags struct {
+	Galleries  string `env:"LOCALAI_GALLERIES,GALLERIES" help:"JSON list of galleries" group:"models"`
+	ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
+}
+
+// ModelsList is the "list" subcommand of ModelsCMD; it takes only the shared ModelsCMDFlags.
+type ModelsList struct {
+	ModelsCMDFlags `embed:""`
+}
+
+// ModelsInstall is the "install" subcommand of ModelsCMD. ModelArgs is an
+// optional positional argument, so it may be empty when the command runs.
+type ModelsInstall struct {
+	ModelArgs []string `arg:"" optional:"" name:"models" help:"Model configuration URLs to load"`
+
+	ModelsCMDFlags `embed:""`
+}
+
+// ModelsCMD groups the model-management subcommands List and Install.
+type ModelsCMD struct {
+	List    ModelsList    `cmd:"" help:"List the models available in your galleries" default:"withargs"`
+	Install ModelsInstall `cmd:"" help:"Install a model from the gallery"`
+}
+
+// Run prints one line per model available in the configured galleries, marking
+// installed models with "*" and the others with "-". A gallery list that fails
+// to parse is logged and the listing continues with whatever was decoded.
+func (ml *ModelsList) Run(ctx *Context) error {
+	var galleries []gallery.Gallery
+	if err := json.Unmarshal([]byte(ml.Galleries), &galleries); err != nil {
+		log.Error().Err(err).Msg("unable to load galleries")
+	}
+
+	available, err := gallery.AvailableGalleryModels(galleries, ml.ModelsPath)
+	if err != nil {
+		return err
+	}
+
+	for _, m := range available {
+		if !m.Installed {
+			fmt.Printf(" - %s@%s\n", m.Gallery.Name, m.Name)
+			continue
+		}
+		fmt.Printf(" * %s@%s (installed)\n", m.Gallery.Name, m.Name)
+	}
+	return nil
+}
+
+// Run installs the first model named in ModelArgs from the configured galleries
+// into ModelsPath, reporting download progress on a progress bar. It returns an
+// error when no model was named or when the gallery installation fails. A gallery
+// list that fails to parse is logged and the install proceeds with whatever was decoded.
+func (mi *ModelsInstall) Run(ctx *Context) error {
+	// The "models" argument is declared optional, so it can be empty here;
+	// indexing it unguarded would panic instead of reporting a usage error.
+	if len(mi.ModelArgs) == 0 {
+		return fmt.Errorf("no model specified to install")
+	}
+	modelName := mi.ModelArgs[0]
+
+	var galleries []gallery.Gallery
+	if err := json.Unmarshal([]byte(mi.Galleries), &galleries); err != nil {
+		log.Error().Err(err).Msg("unable to load galleries")
+	}
+
+	progressBar := progressbar.NewOptions(
+		1000,
+		progressbar.OptionSetDescription(fmt.Sprintf("downloading model %s", modelName)),
+		progressbar.OptionShowBytes(false),
+		progressbar.OptionClearOnFinish(),
+	)
+	// The bar's maximum is 1000, so the reported percentage is scaled by 10.
+	progressCallback := func(fileName string, current string, total string, percentage float64) {
+		progressBar.Set(int(percentage * 10))
+	}
+	return gallery.InstallModelFromGallery(galleries, modelName, mi.ModelsPath, gallery.GalleryModel{}, progressCallback)
+}
--- a/Show More
+++ b/Show More